<a href="https://colab.research.google.com/github/ygGao1120/Onclusive-data-challenge/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
# Switch the working directory to the project folder so the relative file
# names used below (train.tsv, dev.tsv, test.tsv, the .xlsx outputs) resolve.
# The path is relative, so it is looked up from whatever the current
# directory is when the cell runs; running the cell a second time will look
# for it relative to the already-changed directory.
# NOTE(review): presumably expects Google Drive to be mounted under ./drive
# (no mount call is visible in this notebook) — confirm.
os.chdir('drive/MyDrive/interview/Onclusive data assignment')
# Shell escape: list the files in the new working directory as a sanity check.
!ls

In [None]:
pip install transformers

In [None]:
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from operator import itemgetter

import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# Load the three tab-separated splits; the validation split lives in dev.tsv.
train, valid, test = (
    pd.read_csv(path, sep='\t')
    for path in ('train.tsv', 'dev.tsv', 'test.tsv')
)


In [None]:
# Drop every row that has a missing value in any of the four columns the
# pipeline relies on.
train, valid, test = (
    frame.dropna(axis=0, subset=['label', 'claim', 'explanation', 'main_text'])
    for frame in (train, valid, test)
)

In [None]:
# Convert the textual labels into integer codes:
#   false -> 0, mixture -> 1, true -> 2, unproven -> 3.
# train and valid each contain one stray non-label value; those get the
# out-of-range codes 4 and 5 so the integer cast succeeds, and the notebook
# later keeps only rows whose label is in 0-3.
#
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `frame['label']`: an in-place call on a
# column selection is chained assignment, which does not update the parent
# frame once pandas Copy-on-Write is active.
train['label'] = train['label'].replace(
    to_replace = ['false','mixture','true','unproven','snopes'],
    value = [0,1,2,3,4])
valid['label'] = valid['label'].replace(
    to_replace = ['false','mixture','true','unproven','National, Candidate Biography, Donald Trump, '],
    value = [0,1,2,3,5])
test['label'] = test['label'].replace(
    to_replace = ['false','mixture','true','unproven'],
    value = [0,1,2,3])

In [None]:
# Make the label column an integer column in every split.
train = train.astype({'label': int})
valid = valid.astype({'label': int})
test = test.astype({'label': int})

In [None]:
#function for selecting the sentences most related to claim in main_text
def select_evidence_sentences(corpus, k = 5, model = None):
    """Select the k sentences of each ``main_text`` most similar to its claim.

    Each row's ``main_text`` is split into sentences with ``sent_tokenize``.
    The claim and the sentences are embedded together, every sentence is
    scored by the cosine similarity between its embedding and the claim
    embedding, and the k highest-scoring sentences are joined with single
    spaces (most similar first).

    Parameters
    ----------
    corpus : pandas.DataFrame
        Must provide the columns 'claim', 'main_text', 'label' and
        'explanation'. As a side effect a 'top_k' column is added to (or
        overwritten in) this frame.
    k : int, default 5
        Maximum number of evidence sentences kept per row.
    model : optional
        Object exposing ``encode(list_of_str)`` that returns one embedding per
        input string. When omitted,
        ``SentenceTransformer('bert-base-nli-mean-tokens')`` is loaded; pass an
        already loaded model to avoid loading it again for every split.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'claim', 'top_k', 'label' and
        'explanation', indexed like ``corpus``.
    """
    if model is None:
        model = SentenceTransformer('bert-base-nli-mean-tokens')#sentence bert

    evidence = []
    for claim, main_text in zip(corpus['claim'], corpus['main_text']):
        # Repeated sentences are kept once (first occurrence) so that a
        # duplicate cannot occupy several of the k slots.
        sentences = list(dict.fromkeys(sent_tokenize(main_text)))
        if not sentences:
            # Nothing to rank: fall back to the raw text. Stripping keeps a
            # whitespace-only text as '' so blank evidence stays recognisable.
            evidence.append(main_text.strip())
            continue

        # Encode claim and sentences in one batch; row 0 is the claim and
        # rows 1.. line up with `sentences`.
        embeddings = np.asarray(model.encode([claim] + sentences))
        similarities = cosine_similarity(embeddings[:1], embeddings[1:])[0]

        # Highest similarity first; the stable sort keeps document order
        # between equally similar sentences.
        best = np.argsort(-similarities, kind='stable')[:k]
        evidence.append(' '.join(sentences[i] for i in best))

    corpus['top_k'] = evidence
    return corpus[['claim', 'top_k', 'label', 'explanation']].copy()

In [None]:
# Build the claim / evidence / label / explanation frame for every split,
# keeping the five selected sentences per row.
Train, Valid, Test = (
    select_evidence_sentences(split, k = 5)
    for split in (train, valid, test)
)

In [None]:
# Turn placeholder values in Train into NaN so the following dropna removes
# them: an empty string or 0 in top_k, and a single space in explanation.
# The results are assigned back to the columns; `Train.top_k.replace(...,
# inplace=True)` acts on a column selection and does not update the frame
# once pandas Copy-on-Write is active.
# NOTE(review): Valid and Test do not get the same blank-to-NaN treatment
# before their dropna — confirm whether that is intended.
Train['top_k'] = Train['top_k'].replace('', np.nan).replace(0, np.nan)
Train['explanation'] = Train['explanation'].replace(' ', np.nan)

In [None]:
# Remove rows without evidence text or explanation
# (Train contained a NaN in top_k).
Train, Valid, Test = (
    frame.dropna(axis=0, subset=['top_k', 'explanation'])
    for frame in (Train, Valid, Test)
)

In [None]:
# Resources for the sentence-cleaning step: spaCy's English stop words as a
# list, and the small English pipeline used to obtain lemmas.
stop_word = [*STOP_WORDS]
nlp = spacy.load('en_core_web_sm')

In [None]:
#function for data cleaning
# Lemmas already produced by `nlp`, keyed by the (lowercase) word. Running the
# pipeline once per distinct word instead of once per occurrence gives the
# same output. Re-run this cell if `nlp` is replaced, otherwise stale lemmas
# from the previous pipeline would be reused.
_LEMMA_CACHE = {}

def clean_str(s):
    """Normalise a text into a list of lowercase, stop-word-free lemmas.

    Steps: replace literal backslash-n sequences and every character that is
    not an ASCII letter with a space, lowercase, tokenize with
    ``word_tokenize``, lemmatize each word on its own with the module-level
    ``nlp`` pipeline, and drop lemmas contained in ``stop_word``.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    list of str
        The remaining lemmas in their original order.
    """
    #remove symbols and numbers
    # A literal "\n" becomes a space (not an empty string) so the words on
    # either side of it are not glued together.
    s = re.sub(r"\\n"," ",s)
    s = re.sub(r"[^A-Za-z]"," ",s)
    s = s.strip().lower()

    #tokenize, lemmatize word by word, remove stop words
    cleaned = []
    for word in word_tokenize(s):
        lemmas = _LEMMA_CACHE.get(word)
        if lemmas is None:
            lemmas = _LEMMA_CACHE[word] = [token.lemma_ for token in nlp(word)]
        cleaned.extend(lemma for lemma in lemmas if lemma not in stop_word)

    return cleaned

In [None]:
# Add a cleaned (lemma-list) version of each text column to every split:
# clean_claim, clean_top_k and clean_explanation, in that order.
for frame in (Train, Valid, Test):
    for column in ('claim', 'top_k', 'explanation'):
        frame['clean_' + column] = frame.apply(
            lambda row: clean_str(row[column]), axis = 1)

In [None]:
# Remove the leftover 'Unnamed: 0' index column if it is present.
# The frames built by select_evidence_sentences only carry claim, top_k, label
# and explanation (plus the clean_* columns added above), so the column is
# normally absent; errors='ignore' keeps the cell from raising KeyError then.
Train = Train.drop(columns = 'Unnamed: 0', errors = 'ignore')
Valid = Valid.drop(columns = 'Unnamed: 0', errors = 'ignore')
Test = Test.drop(columns = 'Unnamed: 0', errors = 'ignore')

In [None]:
# Keep only rows whose label is one of the four codes 0, 1, 2, 3.
Train, Valid, Test = (
    frame[frame['label'].isin([0, 1, 2, 3])]
    for frame in (Train, Valid, Test)
)

In [None]:
# Write each cleaned split to its own Excel workbook in the working directory.
for frame, workbook in (
    (Train, 'cleaned_train_list.xlsx'),
    (Valid, 'cleaned_valid_list.xlsx'),
    (Test, 'cleaned_test_list.xlsx'),
):
    frame.to_excel(workbook)