## Import libraries

In [None]:
import random
import torch

done


In [None]:
!git clone https://github.com/Nazar1997/Sparse_vector.git

In [None]:
!git clone https://github.com/vladislareon/z_dna data

In [None]:
!git clone https://github.com/AIRI-Institute/GENA_LM.git

In [2]:
from torch.utils import data
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from collections import Counter
import pandas as pd
import numpy as np
import scipy
from tqdm import trange
from tqdm import tqdm
from datetime import datetime
import sys
import os
import seaborn as sns
from matplotlib import pyplot as plt
from joblib import Parallel, delayed, dump, load
from matplotlib import pyplot as plt
from Sparse_vector.sparse_vector import SparseVector
from scipy.signal import convolve2d, convolve
import time
from torch import nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, f1_score
from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")

In [4]:
# One independent, initially empty registry per resource; entries are added
# later under a genome-assembly key such as 'hg38'.
(
    ASSEMBLY_d,
    chroms_d,
    all_features_d,
    groups_d,
    feature_names_d,
    ZDNA_d,
    black_list_d,
    DNA_d,
    DNA_features_d,
) = ({} for _ in range(9))

# HG38

In [None]:
# Label stored for this data set in the per-mode registry.
ASSEMBLY = "ZDNA_2016"
# chr1..chr22 plus the two sex chromosomes.
chroms = [f'chr{i}' for i in list(range(1, 23)) + ['X', 'Y']]
# Feature names are the stems of the .pkl files in the sparse feature directory.
all_features = sorted([i[:-4] for i in os.listdir('data/hg38_features/sparse/') if i.endswith('.pkl')])
groups = ['DNase-seq', 'Histone', 'RNA polymerase', 'TFs and others']
# Keep only the features whose name prefix (text before the first '_') is a known group.
feature_names = [i for i in all_features if (i.split('_')[0] in groups)]
# Per-chromosome Z-DNA annotation; indexed below as ZDNA[chrom][start:end].
ZDNA = load('data/hg38_zdna/sparse/ZDNA_cousine.pkl')
# NOTE(review): unlike the other inputs this path is relative to the working
# directory, not to data/ — confirm where the blacklist file is expected to live.
black_list = load('blacklist_hg38_v2.pkl')

In [6]:
def chrom_reader(chrom, dna_dir='data/hg38_dna/'):
    """Load and concatenate all stored sequence chunks of one chromosome.

    Parameters
    ----------
    chrom : str
        Chromosome name, e.g. ``'chr1'``. Every file in ``dna_dir`` whose name
        contains ``f"{chrom}_"`` is treated as a chunk of this chromosome.
    dna_dir : str, optional
        Directory holding the chunk files. Defaults to the hg38 location used
        throughout this notebook.

    Returns
    -------
    str
        The chunks joined in lexicographic file-name order.
    """
    # NOTE(review): lexicographic order equals numeric chunk order only if the
    # chunk indices in the file names are zero-padded — confirm against the data.
    files = sorted(name for name in os.listdir(dna_dir) if f"{chrom}_" in name)
    # joblib.load unpickles the files, so only point this at trusted data.
    return ''.join(load(os.path.join(dna_dir, name)) for name in files)


DNA = {chrom:chrom_reader(chrom) for chrom in tqdm(chroms)}

100%|██████████| 24/24 [00:11<00:00,  2.03it/s]


In [None]:
# File the freshly loaded resources in their registries under the current mode.
mode = 'hg38'
for registry, resource in (
    (ASSEMBLY_d, ASSEMBLY),
    (chroms_d, chroms),
    (all_features_d, all_features),
    (groups_d, groups),
    (feature_names_d, feature_names),
    (ZDNA_d, ZDNA),
    (black_list_d, black_list),
    (DNA_d, DNA),
):
    registry[mode] = resource

# Data part

In [None]:
# Pull the working variables for the selected mode back out of the registries.
mode = 'hg38'
(
    ASSEMBLY,
    chroms,
    all_features,
    groups,
    feature_names,
    ZDNA,
    black_list,
    DNA,
) = (
    registry[mode]
    for registry in (
        ASSEMBLY_d,
        chroms_d,
        all_features_d,
        groups_d,
        feature_names_d,
        ZDNA_d,
        black_list_d,
        DNA_d,
    )
)

In [None]:
# Tile every chromosome with non-overlapping windows of `width` bp and sort the
# usable windows into positives (contain Z-DNA) and negatives (contain none).
width = 70

np.random.seed(10)

ints_in, ints_out = [], []

for chrm in chroms:
    chrom_len = ZDNA[chrm].shape
    for st in trange(0, chrom_len - width, width):
        en = min(st + width, chrom_len)
        # Discard windows that are mostly unknown bases or touch the blacklist.
        mostly_unknown = DNA[chrm][st:en].count("N") > width / 2
        if mostly_unknown or black_list[chrm][st:en].sum() > 0:
            continue
        if ZDNA[chrm][st:en].any():
            ints_in.append([chrm, int(st), int(en), 1])
        else:
            ints_out.append([chrm, int(st), int(en), 0])


print(len(ints_in))
print(len(ints_out))

# Keep references to the complete lists so later subsampling can start over.
ints_in_full, ints_out_full = ints_in, ints_out


100%|██████████| 3556520/3556520 [01:41<00:00, 34958.58it/s]
100%|██████████| 3459907/3459907 [01:40<00:00, 34343.67it/s]
100%|██████████| 2832793/2832793 [01:22<00:00, 34296.16it/s]
100%|██████████| 2717350/2717350 [01:21<00:00, 33444.92it/s]
100%|██████████| 2593403/2593403 [01:14<00:00, 34936.69it/s]
100%|██████████| 2440085/2440085 [01:12<00:00, 33878.58it/s]
100%|██████████| 2276371/2276371 [01:07<00:00, 33632.66it/s]
100%|██████████| 2073409/2073409 [00:59<00:00, 34713.51it/s]
100%|██████████| 1977067/1977067 [00:52<00:00, 37688.06it/s]
100%|██████████| 1911391/1911391 [00:57<00:00, 33176.07it/s]
100%|██████████| 1929808/1929808 [00:55<00:00, 34901.11it/s]
100%|██████████| 1903932/1903932 [00:55<00:00, 34428.97it/s]
100%|██████████| 1633776/1633776 [00:47<00:00, 34337.04it/s]
100%|██████████| 1529195/1529195 [00:40<00:00, 37551.51it/s]
100%|██████████| 1457016/1457016 [00:37<00:00, 38601.52it/s]
100%|██████████| 1290547/1290547 [00:35<00:00, 36410.65it/s]
100%|██████████| 1189392

49624
40396746





In [10]:
# Keep every positive window and draw three times as many negatives, without
# replacement, from the full negative pool.
ints_in = ints_in_full
n_negatives = len(ints_in) * 3
sampled_idx = np.random.choice(len(ints_out_full), size=n_negatives, replace=False)
ints_out = [ints_out_full[i] for i in sampled_idx]
print(len(ints_in))
print(len(ints_out))

49624
148872


In [11]:
# Stack positives and negatives into a single array of [chrom, start, end, label]
# rows. Each row mixes a chromosome name with integers, so NumPy stores every
# field as a string; that is why start/end are wrapped in int(...) when read back.
equalized = np.array(ints_in + ints_out)

In [12]:
# 5-fold split stratified jointly on label and chromosome ("<label>_<chrom>").
strata = [f"{row[3]}_{row[0]}" for row in equalized]
splitter = StratifiedKFold(5, shuffle=True, random_state=42)
divisions = list(splitter.split(equalized, strata))

In [13]:
# Basic random 60/20/20 train/val/test split for now: hold out 20% for testing,
# then 25% of the remainder (20% of the total) for validation.
# A fixed seed makes the split independent of the global NumPy RNG state and of
# the order in which cells were executed; the intermediate name keeps the cell
# idempotent (train_intervals is no longer split from itself).
SPLIT_SEED = 42
train_val_intervals, test_intervals = train_test_split(
    equalized, test_size=0.2, shuffle=True, random_state=SPLIT_SEED
)
train_intervals, val_intervals = train_test_split(
    train_val_intervals, test_size=0.25, shuffle=True, random_state=SPLIT_SEED
)
len(train_intervals), len(val_intervals), len(test_intervals)

(119097, 39699, 39700)

In [14]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Loading the model

In [None]:
from GENA_LM.src.gena_lm.modeling_bert import BertForTokenClassification
from transformers import AutoTokenizer
import json
from torch.utils.data import Dataset

# tokenizer = AutoTokenizer.from_pretrained('AIRI-Institute/gena-lm-bert-base-t2t')
# Load the GENA-LM checkpoint with a two-class token-classification head.
# NOTE(review): `model` is assigned again further down from the local
# 'gena-lm-bert-base-t2t' path, which replaces this instance.
model = BertForTokenClassification.from_pretrained('AIRI-Institute/gena-lm-bert-base-t2t', num_labels=2)
# model = BertForTokenClassification.from_pretrained('gena-lm-bert-base-t2t', num_labels=2)
model.to(device)

In [None]:
from transformers import AutoTokenizer
# load tokenizer with a changed tokenizer.json file. All tokens are no longer than 5
tokenizer = AutoTokenizer.from_pretrained('tokenizer_folder')

In [18]:
vocab = tokenizer.get_vocab()

In [19]:
# Despite the names, these map token strings <-> token ids (not class labels):
# label2id is the tokenizer vocabulary itself, id2label is its inverse.
label2id = vocab
id2label = {id_ : label for label, id_ in label2id.items()}

In [None]:
class DNADataset(Dataset):
    """Token-classification dataset over fixed genomic intervals.

    Each item is the tokenizer output for the upper-cased DNA sequence of one
    interval, extended with a ``'labels'`` tensor holding one label per token.

    Parameters
    ----------
    chroms : sequence
        Chromosome names (stored, not otherwise used by this class).
    dna_source : mapping
        ``dna_source[chrom][begin:end]`` must yield the sequence as a ``str``.
    labels : mapping
        ``labels[chrom][begin:end]`` must yield per-nucleotide labels supporting
        ``len()`` and ``.sum()`` (assumed to be a NumPy array — confirm for the
        sparse-vector type used by the caller).
    intervals : sequence
        Rows of ``[chrom, begin, end, ...]``; ``begin``/``end`` are cast to ``int``.
    tokenizer : callable
        Tokenizer used for every item. It must return a mapping with
        ``'input_ids'`` and provide ``get_vocab()``.
    threshold : float, optional
        A token is labelled 1 when the fraction of its nucleotides labelled 1 is
        strictly greater than this value.
    """

    def __init__(self, chroms,
                 dna_source,
                 labels, intervals, tokenizer, threshold=0.5):
        self.chroms = chroms
        self.dna_source = dna_source
        self.labels = labels
        self.intervals = intervals
        # Keep the tokenizer that was passed in instead of silently relying on a
        # notebook-level global of the same name.
        self.tokenizer = tokenizer
        # Inverse vocabulary (token id -> token string), derived from the same
        # tokenizer so ids and strings can never get out of sync.
        self.id2token = {id_: token for token, id_ in tokenizer.get_vocab().items()}
        self.threshold = threshold  # for labeling based on percentage

    def __len__(self):
        """Return the number of intervals."""
        return len(self.intervals)

    def __getitem__(self, idx):
        """Tokenize interval ``idx`` and attach per-token labels.

        Special tokens (those whose text contains ``'['``) get label -100.
        """
        interval = self.intervals[idx]
        chrom = interval[0]
        begin = int(interval[1])
        end = int(interval[2])
        sequence = self.dna_source[chrom][begin:end].upper()
        y = self.labels[chrom][begin:end]

        X = self.tokenizer(sequence, truncation=True, padding=True)

        labels_by_tokens = []
        pos = 0  # offset of the current token within the interval
        for token_id in X['input_ids']:
            token = self.id2token[token_id]
            if '[' in token:
                labels_by_tokens.append(-100)
                continue
            token_labels = y[pos:pos + len(token)]
            pos += len(token)
            covered = len(token_labels)
            # An empty slice (token running past the labelled range) would be a
            # 0/0 division; treat it as "no positive nucleotides".
            positive_fraction = token_labels.sum() / covered if covered else 0.0
            labels_by_tokens.append(1 if positive_fraction > self.threshold else 0)

        X['labels'] = torch.tensor(labels_by_tokens, dtype=torch.int64)
        return X



In [None]:
# an example of sequence tokenization
res = tokenizer("AGTGAGCGCC", truncation=True, padding=True)
res

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': [1, 87, 35, 101, 2], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [None]:
# Build one dataset per split, all sharing the same token-labelling threshold.
thr = 0.7
val_dataset_dna, train_dataset_dna, test_dataset_dna = (
    DNADataset(chroms, DNA, ZDNA, split_intervals, tokenizer, threshold=thr)
    for split_intervals in (val_intervals, train_intervals, test_intervals)
)

In [26]:
module_name = model.__class__.__module__
print(module_name)

GENA_LM.src.gena_lm.modeling_bert


In [27]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

In [28]:
import importlib

In [29]:
cls = getattr(importlib.import_module(module_name), 'BertForTokenClassification')
cls

GENA_LM.src.gena_lm.modeling_bert.BertForTokenClassification

In [30]:
# Re-instantiate the model from the local checkpoint directory through the
# dynamically resolved class. num_labels is passed explicitly so this load
# agrees with the earlier one and does not depend on the checkpoint's config.
model = cls.from_pretrained('gena-lm-bert-base-t2t', num_labels=2)
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at gena-lm-bert-base-t2t and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=3)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (pre_attention_ln): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (post_attention_ln): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
              (so

In [31]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

In [None]:
import evaluate
# Metric objects consumed by compute_metrics during evaluation.
mcc, f1_metric, accuracy_metric, precision_metric = (
    evaluate.load(metric_id)
    for metric_id in ("matthews_correlation", "f1", "accuracy", "precision")
)

done


## LoRA fine-tuning (optional). Skip this section if you do not want to use LoRA

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Counts the elements of every parameter returned by
    ``model.named_parameters()`` and, separately, of those with
    ``requires_grad`` set, then prints both totals and the trainable
    percentage. A model without parameters reports 0.0 percent instead of
    raising ``ZeroDivisionError``.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(numel for numel, _ in sizes)
    trainable_params = sum(numel for numel, trainable in sizes if trainable)
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType

done


In [None]:
# LoRA adapter configuration for token classification, built in training mode
# (inference_mode=False) with rank 16, alpha 16 and adapter dropout 0.1.
# NOTE(review): bias="all" presumably makes every bias term trainable in
# addition to the adapter weights — confirm against the PEFT documentation.
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all"
)

In [None]:
# Wrap the base model with the LoRA adapters and report how many parameters train.
# NOTE(review): this rebinds `model` to its own wrapped version, so running the
# cell twice passes an already-wrapped model in again — reload the base model
# before re-running.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 693,506 || all params: 110,619,652 || trainable%: 0.6269


In [None]:
gena_module_name = model.__class__.__module__
print(gena_module_name)

## Training

In [None]:
batch_size = 32
def compute_metrics(eval_pred):
    """Compute token-level accuracy, MCC, F1 and precision for the Trainer.

    Parameters
    ----------
    eval_pred : tuple
        ``(logits, labels)``; the class axis of ``logits`` is axis 2 and
        ``labels`` has the shape of ``logits`` without that axis.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"MCC"``, ``"F1"`` and ``"Precision"``.
    """
    logits, labels = eval_pred

    predictions = np.argmax(logits, axis=2)
    labels = np.asarray(labels)

    # Positions labelled -100 (special tokens and padding) are excluded. A boolean
    # mask selects the remaining positions in the same row-major order as a
    # nested Python loop would, without iterating over every token in Python.
    keep = labels != -100
    kept_predictions = predictions[keep].tolist()
    kept_labels = labels[keep].tolist()

    mcc_score = mcc.compute(predictions=kept_predictions, references=kept_labels)["matthews_correlation"]
    # Named f1_value so the sklearn f1_score imported by this notebook is not shadowed.
    f1_value = f1_metric.compute(predictions=kept_predictions, references=kept_labels)["f1"]
    precision = precision_metric.compute(references=kept_labels, predictions=kept_predictions)['precision']
    accuracy = accuracy_metric.compute(references=kept_labels, predictions=kept_predictions)['accuracy']
    return {"Accuracy": accuracy, "MCC": mcc_score, "F1": f1_value, "Precision": precision}

# Training configuration: checkpoints and logs go to output_dir; evaluation and
# checkpointing both happen once per epoch, and the best checkpoint is restored
# when training finishes.
args = TrainingArguments(
    output_dir="run_110_6",
    learning_rate=2e-5,
    # Learning rate stays constant after a warm-up over the first 10% of steps.
    lr_scheduler_type="constant_with_warmup",
    warmup_ratio=0.1,
    optim='adamw_torch',
    # Tell the Trainer which batch key holds the targets.
    label_names=["labels"],
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    load_best_model_at_end=True,
    # report_to="wandb",
)

# Fine-tune on the training split, evaluating on the validation split with
# compute_metrics; batches are assembled by the token-classification collator.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset_dna,
    eval_dataset=val_dataset_dna,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# NOTE(review): presumably forced so the evaluation loss is reported for the
# wrapped model — confirm this override is still required.
trainer.can_return_loss = True
trainer.train()

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,Mcc,F1,Precision
1,0.0279,0.037984,0.986232,0.879155,0.885869,0.856095
2,0.027,0.036614,0.986755,0.881853,0.888665,0.870133
3,0.0258,0.036815,0.986935,0.883768,0.890433,0.869942
4,0.0238,0.037764,0.986751,0.88114,0.888049,0.873914
5,0.0231,0.041673,0.986131,0.878224,0.885001,0.855456


TrainOutput(global_step=4655, training_loss=0.025496649614082115, metrics={'train_runtime': 1127.3053, 'train_samples_per_second': 528.238, 'train_steps_per_second': 4.129, 'total_flos': 7254613553995080.0, 'train_loss': 0.025496649614082115, 'epoch': 5.0})

In [None]:
torch.save(model.state_dict(), 'model_zdna_70_110_after_510_thr_0_3_0_6_07_max_5-10ep.pt')

In [42]:
trainer.evaluate(test_dataset_dna)

{'eval_loss': 0.02524162270128727,
 'eval_Accuracy': 0.9904477788957797,
 'eval_MCC': 0.881465697103664,
 'eval_F1': 0.8864378063063575,
 'eval_Precision': 0.881755457527429,
 'eval_runtime': 89.737,
 'eval_samples_per_second': 389.583,
 'eval_steps_per_second': 6.096,
 'epoch': 10.0}

## Interpretation

In [None]:
from sklearn.metrics import accuracy_score, matthews_corrcoef, precision_score, f1_score

done


In [111]:
len(test_dataset_dna)

34960

In [36]:
from tqdm import tqdm, trange

In [106]:
# Expand token-level predictions on the test split back to one prediction per
# nucleotide and collect them next to the per-nucleotide ground truth.
y_true = []
y_pred = []
vals1 = set()  # distinct ground-truth values seen
vals2 = set()  # distinct predicted values seen
not_equal_count = 0  # samples whose expanded prediction length != interval length
t = 0  # samples kept

# Inference must not depend on whichever cell last touched the model: switch
# dropout off explicitly instead of relying on an earlier evaluate() call.
model.eval()
for idx in trange(0, len(test_dataset_dna)):
    element = test_dataset_dna[idx]
    element_torch = {k: torch.tensor(v).unsqueeze(0).to(device) for k, v in element.items()}
    token_ids = element['input_ids']
    tokens = [id2label[id_] for id_ in token_ids]
    interval = test_dataset_dna.intervals[idx]
    chrom = interval[0]
    begin = int(interval[1])
    end = int(interval[2])
    sequence = test_dataset_dna.dna_source[chrom][begin:end].upper()
    with torch.no_grad():
        logits = model(**element_torch).logits
    predictions = torch.argmax(logits.cpu(), dim=2)

    # Repeat each token's prediction once per nucleotide it covers; special
    # tokens (label -100) cover no nucleotides and are skipped.
    pred_nucleotide = []
    for i, pred in enumerate(predictions[0]):
        if -100 != int(element['labels'][i]):
            pred_nucleotide.extend([int(pred)] * len(tokens[i]))

    y = test_dataset_dna.labels[chrom][begin:end]
    if len(y) != len(pred_nucleotide):
        # Tokenization did not cover the interval exactly: report and drop the sample.
        print(len(y),len(pred_nucleotide))
        print("Y:", y)
        print("Pred:", pred_nucleotide)
        print("TOK:", ''.join(tokens))
        print("SEQ:", sequence)
        print("--------")
        not_equal_count += 1
    else:
        t += 1
        y_true.append(y)
        y_pred.append(pred_nucleotide)
    vals1.update(set(y))
    vals2.update(set(pred_nucleotide))
print("done")

100%|██████████| 39700/39700 [07:00<00:00, 94.36it/s] 

done





In [109]:
y_true_arr = np.array(y_true)
y_pred_arr = np.array(y_pred)
print(y_true_arr.shape, y_pred_arr.shape)

(39700, 70) (39700, 70)


Getting metrics for single-nucleotide prediction.

In [110]:
print("Accuracy:", accuracy_score(y_true_arr.flatten(), y_pred_arr.flatten()))
print("MCC:", matthews_corrcoef(y_true_arr.flatten(), y_pred_arr.flatten()))
print("Precision:", precision_score(y_true_arr.flatten(), y_pred_arr.flatten()))
print("F1:", f1_score(y_true_arr.flatten(), y_pred_arr.flatten()))

Accuracy: 0.984092479309104
MCC: 0.8551746175877273
Precision: 0.9024720006760139
F1: 0.862662130885254


Counting ratios.

In [103]:
# Confusion-matrix cells over all nucleotides, flattened once up front.
true_flat = y_true_arr.flatten()
pred_flat = y_pred_arr.flatten()
true_is_1, true_is_0 = true_flat == 1, true_flat == 0
pred_is_1, pred_is_0 = pred_flat == 1, pred_flat == 0

tp = (true_is_1 & pred_is_1).sum()
tn = (true_is_0 & pred_is_0).sum()
fp = (true_is_0 & pred_is_1).sum()
fn = (true_is_1 & pred_is_0).sum()

# Express each cell as a fraction of all nucleotides.
total = torch.tensor(len(true_flat), dtype=torch.float)
ratios = {
    cell: count / total
    for cell, count in (('TP', tp), ('TN', tn), ('FP', fp), ('FN', fn))
}


In [104]:
total

tensor(2778860.)

In [105]:
ratios

{'TP': tensor(0.0522),
 'TN': tensor(0.9311),
 'FP': tensor(0.0057),
 'FN': tensor(0.0110)}

## Integrated Gradients interpretation

In [37]:
from collections import defaultdict

In [52]:
# For every token string, count how often it occurs in the validation split and
# how often it is a true positive (predicted 1 and labelled 1).
TP_counts = defaultdict(int)
total_meetings_counts = defaultdict(int)

# Switch dropout off explicitly rather than relying on the state left behind by
# an earlier cell.
model.eval()
for idx in trange(0, len(val_dataset_dna)):
    element = val_dataset_dna[idx]
    element_torch = {k: torch.tensor(v).unsqueeze(0).to(device) for k, v in element.items()}
    tokens = [id2label[id_] for id_ in element['input_ids']]
    with torch.no_grad():
        logits = model(**element_torch).logits
    # Move the whole sequence to plain Python ints once instead of comparing
    # device tensors token by token.
    predictions = torch.argmax(logits, dim=2)[0].tolist()
    token_labels = [int(label) for label in element['labels']]

    # NOTE(review): special tokens (label -100) are counted in the totals too;
    # they can never be true positives, so they end up with a score of 0.
    for token, pred, label in zip(tokens, predictions, token_labels):
        total_meetings_counts[token] += 1
        if pred == label and pred == 1:
            TP_counts[token] += 1
print("done")

100%|██████████| 34959/34959 [06:59<00:00, 83.36it/s]

done





In [53]:
# (true-positive rate, token) for every token that occurred at least once.
percents = [
    (TP_counts[token] / n_seen, token)
    for token, n_seen in total_meetings_counts.items()
    if n_seen > 0
]

In [55]:
(sorted(percents)[::-1])[:10]

[(0.8230088495575221, 'GCGCG'),
 (0.6916588566073102, 'GCGC'),
 (0.6564885496183206, 'GCACG'),
 (0.6036866359447005, 'ACGCG'),
 (0.5462962962962963, 'GCGTG'),
 (0.5042321644498187, 'TGCGC'),
 (0.4842055634134842, 'ACGC'),
 (0.44750656167979, 'GTGC'),
 (0.4251497005988024, 'GTGTG'),
 (0.4222950819672131, 'GCG')]

In [57]:
df = pd.DataFrame(sorted(percents)[::-1])
df = df.rename(columns={0: 'Score', 1: 'Token'})
df.head()

Unnamed: 0,Score,Token
0,0.823009,GCGCG
1,0.691659,GCGC
2,0.656489,GCACG
3,0.603687,ACGCG
4,0.546296,GCGTG


In [58]:
df1 = pd.DataFrame(list(total_meetings_counts.items()))
df1 = df1.rename(columns={0: 'Token', 1: 'Counts'})
df1.head()

Unnamed: 0,Token,Counts
0,[CLS],34959
1,GGGGC,1190
2,GCCC,1953
3,ATG,14650
4,TGCGC,827


In [59]:
df_merged = df1.merge(df, on=['Token'], how='left')
df_merged.head()

Unnamed: 0,Token,Counts,Score
0,[CLS],34959,0.0
1,GGGGC,1190,0.171429
2,GCCC,1953,0.263697
3,ATG,14650,0.026212
4,TGCGC,827,0.504232


In [60]:
df_merged.sort_values('Score', ascending=False)

Unnamed: 0,Token,Counts,Score
287,GCGCG,339,0.823009
146,GCGC,3201,0.691659
197,GCACG,131,0.656489
369,ACGCG,217,0.603687
558,GCGTG,108,0.546296
...,...,...,...
429,AATGG,774,0.000000
433,GAACC,28,0.000000
436,GTACC,25,0.000000
438,CTTG,73,0.000000


In [61]:
df_merged.sort_values('Score', ascending=False).to_csv("gena-lm-token-importance-max-110-510-5-03-06.csv")

## Interpretation using integrated_gradients

In [41]:
from transformers_interpret import TokenClassificationExplainer

In [42]:
mce = TokenClassificationExplainer(model, tokenizer)

In [43]:
mce

<transformers_interpret.explainers.text.token_classification.TokenClassificationExplainer at 0x2b1eed5f73d0>

In [45]:
from collections import defaultdict

In [None]:
# Attribution is run on a random subset of the test split, drawn without replacement.
# NOTE(review): the draw uses the global NumPy RNG, so which samples are chosen
# depends on everything that consumed random numbers before this cell.
num_of_samples = 4000
test_indexes = np.random.choice(len(test_dataset_dna), num_of_samples, replace=False)

In [None]:
# For every true-positive token in the sampled test items, take its ten
# highest-scoring attributed tokens and accumulate their counts and scores.
total_meetings_counts = defaultdict(int)
TP_counts = defaultdict(int)
positive_scores_sum = defaultdict(int)

# Switch dropout off explicitly rather than relying on the state left behind by
# an earlier cell.
model.eval()
for sample_idx in trange(0, len(test_indexes)):
    idx = test_indexes[sample_idx]
    element = test_dataset_dna[idx]
    element_torch = {k: torch.tensor(v).unsqueeze(0).to(device) for k, v in element.items()}
    tokens = [id2label[id_] for id_ in element['input_ids']]
    interval = test_dataset_dna.intervals[idx]
    chrom = interval[0]
    begin = int(interval[1])
    end = int(interval[2])
    sequence = test_dataset_dna.dna_source[chrom][begin:end].upper()
    with torch.no_grad():
        logits = model(**element_torch).logits
    predictions = torch.argmax(logits, dim=2)

    # The explainer is only needed for samples with at least one true-positive
    # token, so it is run lazily on the first such token.
    word_attributions = None
    for tok_ind, token in enumerate(tokens):
        label = int(element['labels'][tok_ind])
        if label == -100:
            continue
        pred = int(predictions[0][tok_ind])
        if not (label == pred and label == 1):
            continue
        # true positive
        if word_attributions is None:
            word_attributions = mce(sequence)
        # NOTE(review): attributions are looked up by token string; if the same
        # token occurs twice in a sequence both occurrences read the same entry —
        # confirm how the explainer keys repeated tokens.
        tokens_watch = sorted(word_attributions[token]['attribution_scores'], key=lambda x: x[1])[::-1][:10]
        for helpful_token, helpful_token_score in tokens_watch:
            # Only true positives reach this point, so both counters advance together.
            total_meetings_counts[helpful_token] += 1
            TP_counts[helpful_token] += 1
            positive_scores_sum[helpful_token] += helpful_token_score


In [48]:
# Mean attribution score per helpful token, paired as (mean score, token).
integr_grad_result = [
    (positive_scores_sum[token] / count, token)
    for token, count in TP_counts.items()
]
    

In [49]:
df_intr_gr = pd.DataFrame(sorted(integr_grad_result)[::-1])
df_intr_gr = df_intr_gr.rename(columns={0: 'Score', 1: 'Token'})
df_counts = pd.DataFrame(list(TP_counts.items()))
df_counts = df_counts.rename(columns={0: 'Token', 1: 'Count'})
df_intgr_merged = df_counts.merge(df_intr_gr, how='left', on='Token')
df_intgr_merged = df_intgr_merged.sort_values('Score', ascending=False)
df_intgr_merged.head()

Unnamed: 0,Token,Count,Score
382,AAATC,1,0.957605
198,GCGTG,28,0.803239
194,ACGCG,31,0.658981
167,GCGCG,99,0.647458
33,GCGC,982,0.558593


In [50]:
df_intgr_merged.to_csv('integrated_gradinents_scores_true_positive-5-3-stages.csv')