# Evidence Finding

# Install the required libraries

In [None]:
# Install the third-party packages this notebook needs at runtime.
!pip install transformers sentencepiece
!pip install sentence-transformers

# Mount GDrive

In [None]:
# Mount Google Drive at /content/drive so the dataset and the model
# checkpoints stored there can be read and written (Colab only).
from google.colab import drive
drive.mount('/content/drive')

# Data:

Read the data from the CSV file. The original file has 10.24M samples, but only a random subset of them is used here.

In [None]:
import torch
# from torchtext.data import Field, TabularDataset, BucketIterator, Dataset, Example
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
import pandas as pd
import numpy as np
from tqdm import tqdm

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time
import random
from torch.cuda.amp import autocast, GradScaler

from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

In [None]:
# CSV file on the mounted Drive that is loaded as the dataset.
PATH_DATA = "/content/drive/MyDrive/SemTabFact/data_task_b_undersampled_60.csv"

In [None]:
# Load the dataset, taking the first CSV column as the row index and
# labelling that index "id".
df = pd.read_csv(PATH_DATA, index_col=0).rename_axis("id")

In [None]:
# Discard rows that contain missing values.
df = df.dropna()

In [None]:
# Draw a random subset of 1,000,000 rows; the fixed seed keeps it reproducible.
df_sample = df.sample(n = 1000000, random_state = 42)

In [None]:
# Store the relevancy labels as floats.
df_sample["relevancy"] = df_sample["relevancy"].astype(float)

In [None]:
# Show the sampled rows for a quick visual check.
display(df_sample)

In [None]:
# Count how many sampled rows carry each relevancy label.
df_sample["relevancy"].value_counts()

In [None]:
# Split the sample into a training set (80% of the rows) and a validation set
# (the remainder); rows are shuffled with a fixed seed so the split is
# reproducible.
df_train, df_val = tts(df_sample, shuffle = True, train_size = 0.8, random_state = 42)

In [None]:
# Show the training split.
display(df_train)

In [None]:
# Pick the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

# Class Distribution

In [None]:
# Label counts in the training split.
df_train["relevancy"].value_counts()

In [None]:
# Label counts in the validation split.
df_val["relevancy"].value_counts()

# Sentence Transformers

In [None]:
from sentence_transformers.losses import CosineSimilarityLoss

In [None]:
def weightedMSELoss(input, target):
    """Return the mean squared error with label-1 samples weighted double.

    Each element's squared error is multiplied by 2 where ``target == 1`` and
    by 1 elsewhere; the weighted errors are then averaged.

    Args:
        input: Tensor of predicted scores.
        target: Tensor of gold labels, on the same device as ``input``.

    Returns:
        Scalar tensor holding the mean of the weighted squared errors.

    Raises:
        AssertionError: If the element-wise squared error does not have the
            same shape as ``target``.
    """
    mse = (input - target) ** 2

    # Allocate the weights on the target's own device so the boolean mask and
    # the indexed tensor always live together, and so the function does not
    # depend on a module-level `device` variable.
    weights = torch.ones(target.size(), device=target.device)
    weights[target == 1] = 2  # More weight to minority class

    assert mse.size() == weights.size(), "squared error and weights differ in shape"
    loss = weights * mse
    return loss.mean()

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, evaluation

In [None]:
# Load a previously saved SentenceTransformer model from Drive.
# NOTE(review): judging by the file names, this is the epoch-2 checkpoint and
# the training below produces epoch 3 — confirm.
model = SentenceTransformer("/content/drive/MyDrive/SemTabFact/stsb_weighted_2_csl_1M_epoch2.h5")

In [None]:
# Column values of the training split as plain Python lists.
list_sentence1 = df_train["cell_text"].values.tolist()
list_sentence2 = df_train["statement_text"].values.tolist()
list_labels = df_train["relevancy"].values.tolist()

# One InputExample per row: (cell text, statement text) with the relevancy
# label as target. Both texts are passed through str(), presumably because
# some cells are not parsed as strings by pandas.
train_examples = [
    InputExample(texts=[str(cell), str(statement)], label=label)
    for cell, statement, label in zip(list_sentence1, list_sentence2, list_labels)
]

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Mini-batch size used for the training DataLoader.
N_BATCH = 64

In [None]:
# Wrap the training examples in a DataLoader that yields batches of N_BATCH.
# NOTE(review): shuffle is off, so batches are served in list order on every
# pass — confirm this is intentional.
train_dataloader = DataLoader(
    train_examples, 
    shuffle = False, 
    batch_size = N_BATCH
)

In [None]:
# Cosine-similarity training objective for `model`, scored with the custom
# weightedMSELoss passed as `loss_fct`.
train_loss = CosineSimilarityLoss(model, loss_fct = weightedMSELoss)

In [None]:
# Similarity evaluator over the validation pairs: cell texts and statement
# texts (both cast to str) together with their relevancy labels.
evaluator = evaluation.EmbeddingSimilarityEvaluator(
    df_val["cell_text"].values.astype(str), 
    df_val["statement_text"].values.astype(str), 
    df_val["relevancy"].values
)

In [None]:
# Fine-tune the model for one epoch on the (dataloader, loss) objective with
# 100 warm-up steps. `output_path` names where fit is asked to store the model.
# In-training evaluation is switched off: the evaluator arguments below are
# commented out.
model.fit(
    train_objectives = [(train_dataloader, train_loss)],
    epochs = 1,
    warmup_steps = 100,
    # evaluator = evaluator,
    # evaluation_steps = 500,
    output_path = "/content/drive/MyDrive/SemTabFact/stsb_weighted_2_csl_1M_epoch3.h5"
)

In [None]:
# Save the fine-tuned model to Drive (same path as the `output_path` given to fit).
model.save("/content/drive/MyDrive/SemTabFact/stsb_weighted_2_csl_1M_epoch3.h5")

In [None]:
# Score the model with the similarity evaluator built from the validation split.
model.evaluate(evaluator)

In [None]:
# Validation columns as plain Python lists: texts as str, labels as int.
val_list_sentence1 = df_val["cell_text"].values.astype(str).tolist()
val_list_sentence2 = df_val["statement_text"].values.astype(str).tolist()
val_list_labels = df_val["relevancy"].values.astype(int).tolist()

# One InputExample per validation row. The texts are already Python str after
# astype(str).tolist(), so no further conversion is needed.
val_examples = [
    InputExample(texts=[cell, statement], label=label)
    for cell, statement, label in zip(
        val_list_sentence1, val_list_sentence2, val_list_labels
    )
]

# Define the validation dataloader
val_dataloader = DataLoader(val_examples, shuffle=True, batch_size=64)

In [None]:
# Binary-classification evaluation on the validation pairs, using the integer
# relevancy labels as the ground truth.
acc_evaluator = evaluation.BinaryClassificationEvaluator(val_list_sentence1, val_list_sentence2, val_list_labels)
model.evaluate(acc_evaluator)
# Gives Average Precision with Cosine-Similarity