In [None]:
# This notebook computes the TP, FP, TN and FN counts produced by the classifier on every fold.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from datetime import datetime
from tqdm.auto import tqdm
import math
import sys
import os
import json
import evaluate
import numpy as np
from simpletransformers.language_representation import RepresentationModel
from transformers import AutoTokenizer,AutoModelForQuestionAnswering,pipeline,AutoModelForSequenceClassification
from torch.utils.data import TensorDataset, RandomSampler,DataLoader
from sklearn.metrics import classification_report
import general_util as gu

Test available GPUs

In [None]:
# List the CUDA devices visible to this process and select the first one as
# the default device for the rest of the notebook.
print(torch.cuda.device_count())
gpu_ids = list(range(torch.cuda.device_count()))
print(gpu_ids)
if gpu_ids:
    device = torch.device(f'cuda:{gpu_ids[0]}')
    torch.cuda.set_device(device)
else:
    # No CUDA device is visible: fall back to the CPU instead of failing with
    # an IndexError on gpu_ids[0].
    device = torch.device('cpu')

In [None]:
# Uncomment to restrict this process to a single physical GPU:
#os.environ["CUDA_VISIBLE_DEVICES"]="1"

# Sentence encoder used by get_dataloader to turn raw contexts into embeddings.
# use_cuda follows the actual hardware so the cell also runs on CPU-only hosts
# (a hard-coded True makes RepresentationModel fail when CUDA is unavailable).
bert_mean_embedding = RepresentationModel(
            model_type="bert", model_name="bert-base-uncased",
            use_cuda=torch.cuda.is_available())

In [None]:
class MLP(nn.Module):
    """Two-layer perceptron classifier with a focal-style training loss.

    Args:
        hidden_size1: Size of the input feature vectors.
        hidden_size2: Width of the hidden layer.
        output_dim: Number of output classes (the loss only supports 2).
        dropout: Dropout probability applied after the hidden activation.
        gamma: Exponent of the ``(1 - pt)`` focusing term.
        alpha: Cross-entropy weight of class 0 (class 1 gets ``1 - alpha``),
            or ``None`` for an unweighted cross-entropy.
    """

    def __init__(self,hidden_size1, hidden_size2, output_dim, dropout,gamma,alpha):
        super().__init__()

        self.num_classes = output_dim
        self.gamma = gamma
        self.alpha = alpha
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size1, hidden_size2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size2, self.num_classes),
        )

    def forward(self, x,labels=None):
        """Score a batch and optionally compute the training loss.

        Args:
            x: Input features fed to the classifier.
            labels: Optional class indices; when given, the loss is computed.

        Returns:
            ``(loss, scores)``. ``loss`` is ``None`` without labels; otherwise
            it is the mean of the focal-modulated cross-entropy and the plain
            cross-entropy. ``scores`` is the element-wise sigmoid of the
            logits (not a softmax, so rows need not sum to 1).
        """
        logits = self.classifier(x)

        loss = None
        if labels is not None:
            # The message used to reference the non-existent `self.num_labels`,
            # which turned a failing assert into an AttributeError.
            assert self.num_classes == 2, f'Expected 2 labels but found {self.num_classes}'

            flat_logits = logits.view(-1, self.num_classes)
            class_weights = None
            if self.alpha is not None:
                # Build the weights on the logits' device/dtype so the loss
                # also works when the model lives on a GPU.
                class_weights = torch.tensor(
                    [self.alpha, 1 - self.alpha],
                    dtype=flat_logits.dtype,
                    device=flat_logits.device,
                )
            ce_loss = F.cross_entropy(flat_logits, labels.view(-1), weight=class_weights)

            # The focal term is derived from the batch-reduced cross-entropy.
            pt = torch.exp(-ce_loss)
            loss = (((1 - pt) ** self.gamma * ce_loss) + ce_loss) / 2

        return loss,torch.sigmoid(logits)

In [None]:
def get_val_data(fold):
    """Return the validation contexts, labels and ids of one fold.

    Args:
        fold: Fold identifier forwarded to ``gu.load_data``.

    Returns:
        A ``(contexts, labels, ids)`` tuple of three lists aligned by position.
    """
    validation_split = gu.load_data(fold)["validation"]

    # Walk the three columns in lockstep, then split the rows back into one
    # list per column.
    rows = list(zip(validation_split['context'],
                    validation_split['label'],
                    validation_split["id"]))
    contexts = [row[0] for row in rows]
    labels = [row[1] for row in rows]
    ids = [row[2] for row in rows]

    return contexts, labels,ids

In [None]:
def batch_iter(x,y,ids, batch_size, shuffle=False):
    """Yield aligned mini-batches of ``(contexts, labels, identifiers)``.

    Args:
        x: Sequence of contexts.
        y: Sequence of labels, same length as ``x``.
        ids: Sequence of identifiers, same length as ``x``.
        batch_size: Maximum number of items per batch.
        shuffle: When true, visit the items in a random order drawn from
            NumPy's global random state.

    Yields:
        Three lists holding the contexts, labels and identifiers of one batch.

    Raises:
        ValueError: If the three inputs differ in length. As this is a
            generator, the error surfaces when iteration starts.
    """
    # The batch count used to come from len(x) and the indices from len(y),
    # so mismatched inputs silently produced truncated or empty batches.
    if not (len(x) == len(y) == len(ids)):
        raise ValueError(
            f"x, y and ids must have the same length, got {len(x)}, {len(y)} and {len(ids)}"
        )

    batch_num = math.ceil(len(x) / batch_size)
    index_array = list(range(len(x)))

    if shuffle:
        np.random.shuffle(index_array)

    for i in range(batch_num):
        indices = index_array[i * batch_size: (i + 1) * batch_size]
        contexts = [x[idx] for idx in indices]
        labels = [y[idx] for idx in indices]
        identifiers = [ids[idx] for idx in indices]

        yield contexts, labels,identifiers

In [None]:
def get_dataloader(x,y,id,batch_size):
    """Embed the texts and wrap (embedding, label, id) triples in a DataLoader.

    Args:
        x: List of texts, encoded with ``bert_mean_embedding``.
        y: List of integer labels.
        id: List of identifiers; each one is cast with ``int``.
        batch_size: Number of triples per batch.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of the three tensors, drawn
        through a ``RandomSampler``.
    """
    # Sentence embeddings from the module-level encoder.
    embeddings = bert_mean_embedding.encode_sentences(x,combine_strategy="mean")

    # Tensor versions of the three parallel columns.
    features = torch.Tensor(embeddings)
    targets = torch.LongTensor(y)
    identifiers = torch.LongTensor([int(raw_id) for raw_id in id])

    dataset = TensorDataset(features, targets, identifiers)
    return DataLoader(dataset, batch_size=batch_size, sampler=RandomSampler(dataset))
    

In [None]:
def save_dict_to_txt_file(dic,file_name):
    """Serialize ``dic`` as JSON into ``<file_name>.txt``, overwriting the file.

    Args:
        dic: JSON-serializable object to store.
        file_name: Target path without the ``.txt`` extension.
    """
    target_path = f'{file_name}.txt'
    with open(target_path, 'w') as out_file:
        payload = json.dumps(dic)
        out_file.write(payload)



In [None]:
def read_dict_from_txt_file(file_name):
    """Load the JSON content stored in ``<file_name>.txt``.

    Args:
        file_name: Source path without the ``.txt`` extension.

    Returns:
        The decoded JSON value (a dict for files written by
        ``save_dict_to_txt_file``). JSON arrays come back as lists.
    """
    with open(f'{file_name}.txt') as f:
        # json already yields native Python objects. The former
        # eval(str(...)) round-trip added nothing, evaluated file-derived
        # text as code, and raised NameError on NaN/Infinity values.
        return json.load(f)

In [None]:
def _as_python_scalar(value):
    """Unwrap objects exposing ``.item()`` (e.g. 0-d tensors); return others unchanged."""
    item = getattr(value, "item", None)
    return item() if callable(item) else value


def compute_statistics_and_return_cls_dictionary(all_ids,all_preds,all_labels,all_preds_probabilites):
    """Print the confusion-matrix counts and build the per-context result map.

    Args:
        all_ids: Context identifiers.
        all_preds: Predicted labels (0 or 1).
        all_labels: Ground-truth labels (0 or 1).
        all_preds_probabilites: Score stored next to each prediction.

    Returns:
        Dict mapping ``str(id)`` to ``(predicted_label, score)``. When an id
        occurs more than once a warning is printed and the last entry wins.
    """
    fold_classification_results = {}
    tp,tn,fp,fn = 0,0,0,0

    for raw_id, raw_pred, raw_label, probs in zip(all_ids,all_preds,all_labels,all_preds_probabilites):
        # Tensor scalars would otherwise produce keys such as "tensor(5)".
        key = str(_as_python_scalar(raw_id))
        p = _as_python_scalar(raw_pred)
        l = _as_python_scalar(raw_label)

        if key in fold_classification_results:
            print(f"Context with id {key} is duplicated\n")
        fold_classification_results[key] =(p,probs)

        if l==1 and p==1:
            tp+=1
        elif l==1 and p==0:
            fn+=1
        elif l==0 and p==1:
            fp+=1
        elif l==0 and p==0:
            tn+=1

    print(f"True Positive:{tp}\nTrue Negative: {tn}\nFalse Positive: {fp}\nFalse Negative: {fn}\n")
    return fold_classification_results
            
        

In [None]:
# Keep a handle on the notebook's stdout so it can be restored later with
# `sys.stdout.close(); sys.stdout = old_stdout`.
old_stdout = sys.stdout
# From here on every print() goes to this report file. The file is opened
# line-buffered (buffering=1) so each report line reaches disk immediately,
# even though nothing in this notebook closes the file explicitly.
sys.stdout= open("cls_preds_as_probs_wiht_bert.txt","w", buffering=1)

In [None]:
#apply the mlp-2 filter on the contexts
lr = 5e-5
batch_size = 100
# NOTE: despite its name, this value is passed to MLP as the dropout probability.
dropout_keep_prob = 0.5
num_classes = 2

hidden_size1 = 768
hidden_size2 = 1024
num_epochs = 5

# Make sure the output directory exists before any fold is evaluated, so a
# missing folder cannot make the save step fail after inference has run.
mlp_results_dir = "fresh_filter/fold_cls_results_final"
os.makedirs(mlp_results_dir, exist_ok=True)

cls_model = MLP(hidden_size1, hidden_size2, num_classes, dropout_keep_prob,alpha=0.10,gamma=4)

for fold_id in [0,1,2,3,4]:

    print(f"\n********************fold{fold_id}*******************\n")

    x_val,y_val,ids = get_val_data(fold_id)
    val_dataloader = get_dataloader(x_val,y_val,ids,batch_size)
    cls_model.load_state_dict(torch.load(f'fresh_filter/saved_weights_{fold_id}.pt'))
    cls_model.eval()

    all_preds = []
    all_labels = []
    all_ids = []
    all_preds_probabilites = []
    with torch.no_grad():
        for features, batch_labels, batch_ids in tqdm(val_dataloader):
            # Only the scores are needed here, so the labels (and the loss
            # they would trigger) are not passed to the model.
            _, predictions = cls_model(features)
            cls_output = torch.argmax(predictions,dim=1)
            # Score of class 0; MLP returns element-wise sigmoids, so the two
            # class scores of a row do not necessarily sum to 1.
            out = predictions[:,0]

            # Convert to plain Python values: extending with the tensors
            # themselves stores 0-d tensors, which end up as dictionary keys
            # like "tensor(5)" in the saved results.
            all_labels.extend(batch_labels.tolist())
            all_preds.extend(cls_output.tolist())
            all_preds_probabilites.extend(out.tolist())
            all_ids.extend(batch_ids.tolist())

    print(classification_report(all_labels, all_preds))

    #compute fold statistics and get fold classification results
    cls_dict=compute_statistics_and_return_cls_dictionary(all_ids,all_preds,all_labels,all_preds_probabilites)

    #save fold classification results
    save_dict_to_txt_file(cls_dict,f"{mlp_results_dir}/fold{fold_id}")
    
    #read_dict_to_txt_file(f"fresh_filter/fold_cls_results/fold{fold_id}")
    
    
    
    

In [None]:
#apply bert filter on the contexts
batch_size = 128
num_classes = 2
# NOTE(review): max_len is never passed to the tokenizer below, so truncation
# presumably falls back to the tokenizer's own maximum length - confirm
# whether max_length=max_len was intended.
max_len = 500
model_checkpoint = "best_acc_filter/"


for fold_id in [0,1,2,3,4]:

    print(f"\n********************fold{fold_id}*******************\n")

    # Each fold has its own fine-tuned checkpoint (tokenizer + classifier).
    fold_checkpoint = model_checkpoint + "exp_" + str(fold_id)
    tokenizer = AutoTokenizer.from_pretrained(fold_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(fold_checkpoint,num_labels=num_classes).to(device)

    model.eval()

    x_val,y_val,ids = get_val_data(fold_id)

    all_preds = []
    all_labels = []
    all_ids = []
    all_preds_probabilites = []

    with torch.no_grad():
        # Distinct loop names: the previous `x, y, ids` rebinding shadowed the
        # very `ids` list that was handed to batch_iter.
        for batch_texts, batch_labels, batch_ids in batch_iter(x_val,y_val,ids, batch_size, shuffle=False):

            encoded = tokenizer(batch_texts,truncation=True,padding=True,return_tensors="pt").to(device)

            # Only the logits are used, so no labels are passed and no loss is computed.
            logits = model(**encoded).logits

            #transfer logits to probabilities
            probabilities = F.softmax(logits, dim=-1)

            #use probabilities to predict the label
            cls_output = torch.argmax(probabilities,dim=1)

            #consider the no-dataset (class 0) probability for each context
            out = probabilities[:,0]

            all_labels.extend(batch_labels)
            all_preds.extend(cls_output.tolist())
            all_preds_probabilites.extend(out.tolist())

            all_ids.extend(batch_ids)


    print(classification_report(all_labels, all_preds))

    #compute fold statistics and get fold classification results
    cls_dict=compute_statistics_and_return_cls_dictionary(all_ids,all_preds,all_labels,all_preds_probabilites)

    #save fold classification results
    save_dict_to_txt_file(cls_dict,f"{model_checkpoint}result_fold{fold_id}")
    
    #read_dict_to_txt_file(f"fresh_filter/fold_cls_results/fold{fold_id}")