In [None]:
# Mount Google Drive at /content/drive so the project files stored there are
# reachable from this runtime (google.colab is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%%capture
# Install the third-party packages imported below; %%capture (which must stay
# the first line of the cell) silences pip's output.
# NOTE(review): versions are not pinned, so results may drift between runs.
!pip install transformers
!pip install sentencepiece
!pip install datasets

In [None]:
import os

# Project folder on Google Drive. The path is absolute (rooted at the
# '/content/drive' mount point used by drive.mount above) so that re-running
# this cell is idempotent: a relative path would be resolved against the
# already-changed working directory on a second run and fail.
new_directory = "/content/drive/MyDrive/02_Project/UIT_Fact_Checking_Contest/FNU/"
os.chdir(new_directory)
print("Current Working Directory:", os.getcwd())

In [None]:
import torch
from torch import nn as nn
import pandas as pd

from transformers import AutoModel, AutoTokenizer
from transformers import T5EncoderModel
from transformers import RobertaModel
from datasets import Dataset, DatasetDict

from sklearn.metrics import classification_report
from sklearn.metrics._classification import _check_targets

import matplotlib.pyplot as plt

import zipfile
import json
import datasets
import re

In [None]:
# Device used for every model and tensor below: the GPU when CUDA is available, otherwise the CPU.
envir = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def data_cleaning(text):
    """Normalize a sentence before tokenization.

    Steps, in order: delete newline characters, trim surrounding whitespace,
    drop one leading '.' and one trailing '.', then collapse every run of
    whitespace into a single space.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Empty or whitespace-only input (and a lone '.')
        yields '' instead of raising IndexError.
    """
    # Newlines are deleted rather than replaced by a space, so words separated
    # only by a line break are joined together.
    # NOTE(review): confirm this matches the preprocessing used at training time.
    text = re.sub(r"\n", "", text)
    text = text.strip()
    # startswith/endswith are safe on the empty string, unlike text[0] / text[-1].
    if text.startswith('.'):
        text = text[1:].strip()
    if text.endswith('.'):
        text = text[:-1].strip()
    return " ".join(text.split())

def mDeBERTa_tokenize(data): # mDeBERTa model: Taking input_ids
    """Encode a batch of (premise, hypothesis) pairs into sentence embeddings.

    Relies on the module-level ``tokenizer``, ``model`` (the encoder),
    ``envir`` (target device) and ``CLEAN_DATA`` (whether to run
    ``data_cleaning`` on both texts first).

    Args:
        data: A batch mapping whose ``'sample'`` entry is a sequence of
            (premise, hypothesis) pairs.

    Returns:
        A dict with two tensors, one row per input pair:
        ``'mean'`` - average of the last hidden states of every real
        (non-padding) token except the first one;
        ``'cls'`` - last hidden state of the first token.
    """
    pairs = data['sample']
    if CLEAN_DATA:
        premises = [data_cleaning(premise) for premise, _ in pairs]
        hypothesis = [data_cleaning(hypothesis) for _, hypothesis in pairs]
    else:
        premises = [premise for premise, _ in pairs]
        hypothesis = [hypothesis for _, hypothesis in pairs]

    with torch.no_grad():
        encoded = tokenizer(premises, hypothesis, truncation=True, return_tensors="pt", padding = True)
        input_token = encoded['input_ids'].to(envir)
        # Without the attention mask, padding tokens would be attended to and
        # averaged in whenever a batch mixes sequences of different lengths.
        # With a batch size of 1 there is no padding and the result is unchanged.
        attention_mask = encoded['attention_mask'].to(envir)
        embedding = model(input_token, attention_mask=attention_mask).last_hidden_state

        # Masked mean over positions 1..end: padded positions contribute zero
        # and are excluded from the token count.
        token_mask = attention_mask[:, 1:].unsqueeze(-1).to(embedding.dtype)
        token_count = token_mask.sum(dim = 1).clamp(min = 1)
        mean_embedding = (embedding[:, 1:, :] * token_mask).sum(dim = 1) / token_count

    cls_embedding = embedding[:, 0, :]

    return {'mean':mean_embedding, 'cls':cls_embedding}

In [None]:
class NLI_model(nn.Module):
    """Linear classification head mapping an embedding vector to 3 class logits.

    The module also owns its loss function (weighted cross-entropy) and exposes
    Lightning-style ``*_step`` helpers, although it is a plain ``nn.Module``.
    """

    def __init__(self, input_dims, class_weights=torch.tensor([0., 0., 0.])):
        """Build the linear head and the weighted cross-entropy criterion.

        Args:
            input_dims: Size of the last dimension of the input embeddings.
            class_weights: Per-class weights handed to ``nn.CrossEntropyLoss``.
                The loss stores them as the buffer ``criterion.weight``, so they
                are part of ``state_dict()``. The all-zero default is a
                placeholder - presumably meant to be overwritten by
                ``load_state_dict``; pass real weights when training.
        """
        super().__init__()

        # Register a private copy: the loss keeps the tensor it is given as a
        # buffer, and in-place writes to that buffer (e.g. load_state_dict)
        # would otherwise modify the shared default-argument tensor and leak
        # into every later instance built with the default.
        if torch.is_tensor(class_weights):
            class_weights = class_weights.clone()

        self.classification = nn.Sequential(
            nn.Linear(input_dims, 3)
        )

        self.criterion = nn.CrossEntropyLoss(class_weights)

    def forward(self, input):
        """Return the raw (pre-softmax) logits for ``input``."""
        return self.classification(input)

    def training_step(self, train_batch, batch_idx=0):
        """Compute the cross-entropy loss for one ``(inputs, targets)`` batch."""
        input_data, targets = train_batch
        outputs = self.forward(input_data)
        return self.criterion(outputs, targets)

    def predict_step(self, batch, batch_idx=0):
        """Predict the most probable class for each row of a batch.

        Args:
            batch: ``(inputs, targets)``; the targets are ignored.

        Returns:
            ``(labels, negated_probs)``: the index of the highest-probability
            class per row, and that probability with its sign flipped (the
            probabilities are negated so that an ascending sort puts the
            largest first). Callers must negate the second value again.
        """
        input_data, _ = batch
        outputs = self.forward(input_data)
        prob = outputs.softmax(dim = -1)
        sort_prob, sort_indices = torch.sort(-prob, 1)
        return sort_indices[:,0], sort_prob[:,0]

    def validation_step(self, val_batch, batch_idx=0):
        """Return sklearn's classification report for the batch as a dict."""
        _, targets = val_batch
        sort_indices, _ = self.predict_step(val_batch, batch_idx)
        report = classification_report(list(targets.to('cpu').numpy()), list(sort_indices.to('cpu').numpy()), output_dict=True, zero_division = 1)
        return report

    def test_step(self, batch, dict_form, batch_idx=0):
        """Return sklearn's classification report for the batch.

        Args:
            batch: ``(inputs, targets)``.
            dict_form: Forwarded as ``output_dict``; True yields a dict,
                False a formatted string.
        """
        _, targets = batch
        sort_indices, _ = self.predict_step(batch, batch_idx)
        report = classification_report(targets.to('cpu').numpy(), sort_indices.to('cpu').numpy(), output_dict=dict_form, zero_division = 1)
        return report

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters (learning rate 1e-5)."""
        return torch.optim.Adam(self.parameters(), lr = 1e-5)

In [None]:
# Where the retrieval output is read from and where predictions are written.
input_folder = "Input"
file_name = "private_test_retrieval_v1_top5_top_5.json"
result_folder = "05_NLI"

# Create the output directory on first use.
if not os.path.isdir(result_folder):
    os.mkdir(result_folder)
    print("Created directory:", result_folder)

BATCH_SIZE = 1
# Class index -> verdict label written to the output file.
int2label = dict(enumerate(['SUPPORTED', 'NEI', 'REFUTED']))

# Folder holding the encoder and classifier checkpoints.
folder = "mDeBERTa (ft) V6"

# Text cleaning is only switched on for this particular checkpoint folder.
CLEAN_DATA = (folder == 'mDeBERTa (ft) V6')

In [None]:
# Load the retrieval results; iterating .items() below yields one column per
# claim key, with the claim's fields ('claim', 'evidence', ...) as the index.
dataframe = pd.read_json(f"{input_folder}/{file_name}")

# Flatten every (evidence, claim) pair into one row, remembering which claim
# (key) and which evidence position (subkey) it came from.
test_set = {'sample':[], 'key':[], 'subkey':[]}
for key, values in dataframe.items():
    evidences = values['evidence']
    if len(evidences) == 0:
        raise Exception("Do not have evidence !!!")
    claim = values['claim']
    for subkey, evidence in enumerate(evidences):
        test_set['sample'].append( (evidence, claim) )
        test_set['key'].append(key)
        test_set['subkey'].append(subkey)

dataset = DatasetDict({'test': Dataset.from_dict(test_set)})

print(dataset)

In [None]:
def predict_mapping(batch):
    """Run the classifier held in the global ``model`` on one batch.

    Reads the feature column named by the global ``input_type`` and moves it to
    ``envir``. ``predict_step`` returns the top probability with its sign
    flipped, so it is negated again here.

    Returns:
        A dict with the predicted class index (``'label'``) and its
        probability (``'prob'``) for every row of the batch.
    """
    features = batch[input_type].to(envir)
    with torch.no_grad():
        labels, negated_probs = model.predict_step((features, None))
    return {'label': labels, 'prob': -negated_probs}

In [None]:
def output_predictedDataset(predict_dataset, origin_dataframe):
    """Merge per-evidence predictions back into the original claim records.

    Args:
        predict_dataset: Iterable of records whose ``'key'``, ``'subkey'``,
            ``'label'`` and ``'prob'`` values expose ``.item()``.
        origin_dataframe: Mapping-like object whose ``.items()`` yields
            ``(key, fields)`` pairs, where ``fields.items()`` yields each
            field name and value; every entry must have an ``'evidence'`` field.

    Returns:
        A dict keyed like ``origin_dataframe``. Each entry copies the original
        fields and adds ``'verdict'`` and ``'prob'`` lists aligned with
        ``'evidence'`` (placeholders are ``''`` and ``1``).
    """
    merged = {}
    for key, value in origin_dataframe.items():
        entry = {field: contend for field, contend in value.items()}
        evidence_count = len(entry['evidence'])
        entry['verdict'] = [''] * evidence_count
        entry['prob'] = [1] * evidence_count
        merged[key] = entry

    # Fill the placeholders at the position each prediction belongs to.
    for record in predict_dataset:
        target = merged[record['key'].item()]
        position = record['subkey'].item()
        target['verdict'][position] = int2label[record['label'].item()]
        target['prob'][position] = record['prob'].item()
    return merged

In [None]:
# Inference: for each pooling variant ('cls', 'mean'), embed every
# (evidence, claim) pair with the matching encoder, classify the embeddings
# with the matching linear head, and write the predictions to disk.
for model_name in ["MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"]:
    merge_dataset = DatasetDict({"infer": datasets.concatenate_datasets([dataset['test']])})

    # The tokenizer depends only on the base model name, so load it once
    # instead of once per input type.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    for input_type in ['cls', 'mean']:
        # Encoder for this input type. NOTE: the global name `model` is read by
        # mDeBERTa_tokenize (encoder) and later by predict_mapping (classifier),
        # so the order of the two assignments below matters.
        model = AutoModel.from_pretrained(f"{folder}/{(model_name.split('/'))[-1]}-{input_type}").to(envir)
        merge_dataset = merge_dataset.map(mDeBERTa_tokenize, batched=True, batch_size=BATCH_SIZE)
        merge_dataset = merge_dataset.with_format("torch", [input_type, 'key', 'subkey'])

        # Load classifier model
        # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
        checkpoints = torch.load(f"{folder}/{input_type}.pt", map_location = envir)
        model = NLI_model(merge_dataset['infer'][input_type].shape[-1], torch.tensor([0., 0., 0.])).to(envir)
        model.load_state_dict(checkpoints['model_state_dict'])

        # Running the inference step (the whole dataset in a single batch)
        predicted_dataset = merge_dataset.map(predict_mapping, batched=True, batch_size=merge_dataset['infer'].num_rows)

        # Outputting the dataset
        new_dataframe = output_predictedDataset(predicted_dataset['infer'], dataframe)

        # Persist the result of THIS input type before the next iteration
        # overwrites `new_dataframe`; otherwise only the last input type's
        # predictions would ever reach disk.
        with open(f"{result_folder}/{file_name.replace('.json', '')}_{folder}({input_type}).json", 'w') as outfile:
            json.dump(new_dataframe, outfile)

In [None]:
# Write the most recent predictions as JSON; the file name records the
# checkpoint folder and the input type left over from the inference loop.
output_stem = file_name.replace('.json', '')
output_path = f"{result_folder}/{output_stem}_{folder}({input_type}).json"
with open(output_path, 'w') as outfile:
    json.dump(new_dataframe, outfile)