In [26]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('words')
# nltk.download('averaged_perceptron_tagger')

In [39]:
# pip install spacy

In [3]:
# pip install sentence-transformers

THIS NOTEBOOK CONTAINS THE DATASET EXPLORATION, PREPEOCESSING AND TYPES OF VECTORIZATION.


WORDS-EMBEDDINGS -> TF-IDF
SENTENCE-EMBEDDINGS -> DOC2VEC

MODELS USED -> SVM, COSINE SIMILARITY

In [1]:
import pandas as pd
import numpy as np
import regex as re
from tqdm import tqdm
import os 
import contractions

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report

import spacy
from spacy import displacy

from sentence_transformers import SentenceTransformer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import FastText

import warnings
warnings.filterwarnings('ignore')

In [2]:
current_dir = os.getcwd()
root = os.path.dirname(current_dir)
data_dir = os.path.join(root,'data')
data_path = os.path.join(data_dir,'msr_paraphrase_train.txt')

In [4]:
quality = []
ID_1 = []
ID_2 = []
STRING_1 = []
STRING_2 = []

text_lines = []

In [5]:
fe = open(data_path,'r',encoding='utf8')
columns = fe.readline().removeprefix('\ufeff').removesuffix('\n').split('\t')
# print(columns)
for line in fe:
    text_lines.append(line.strip().split('/n'))
    

In [6]:
for lines in text_lines:
    line = str(lines[0])
    split_lines = line.split('\t')
    quality.append(split_lines[0])
    STRING_1.append(split_lines[3])
    STRING_2.append(split_lines[4])

fe.close()

In [7]:
df = pd.DataFrame(np.column_stack([quality,STRING_1,STRING_2]),columns=['Quality','String-1','String-2'])

In [8]:
df

Unnamed: 0,Quality,String-1,String-2
0,1,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi..."
1,0,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...
2,1,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an..."
3,0,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ..."
4,1,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...
...,...,...,...
4071,1,"""At this point, Mr. Brando announced: 'Somebod...","Brando said that ""somebody ought to put a bull..."
4072,0,"Martin, 58, will be freed today after serving ...",Martin served two thirds of a five-year senten...
4073,1,"""We have concluded that the outlook for price ...","In a statement, the ECB said the outlook for p..."
4074,1,The notification was first reported Friday by ...,MSNBC.com first reported the CIA request on Fr...


In [9]:
df = pd.read_csv(os.path.join(data_dir,'train_data.csv'))

In [7]:
# df.to_csv('train_data.csv')

In [10]:
df['Quality'] = df['Quality'].astype('int')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4076 entries, 0 to 4075
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  4076 non-null   int64 
 1   Quality     4076 non-null   int32 
 2   String-1    4076 non-null   object
 3   String-2    4076 non-null   object
dtypes: int32(1), int64(1), object(2)
memory usage: 111.6+ KB


In [10]:
# df.to_csv('dataframe.csv')

In [12]:
stop_words = set(stopwords.words('english'))

In [16]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

#### ANALYSING FOR BOTH THE LABELS

In [80]:
df_true = df[df["Quality"]==1]

In [81]:
df_false = df[df['Quality']==0]

In [82]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [83]:
def preprocess(text):
    text = text.lower() # comvert to lower case
    text = re.sub(r'[\d]','',text)
    text = contractions.fix(text) #fixing contractions
    text = re.sub(r'[^\w\s]','',text) #removing punctuations -> removing any character that is not a word or whitespace
    text = word_tokenize(text) #tokenizing
    text = [i for i in text if i not in stop_words] #removing stop words
    text = [i for i in text if not i.isdigit()] # removing numbers
    #text = [stemmer.stem(i) for i in text] # stemming
    text = [lemmatizer.lemmatize(i) for i in text] # lemmatizing
    return ' '.join(text)

In [84]:
df_true['String_1_preprocessed'] = tqdm(df_true['String-1'].apply(preprocess))

100%|██████████| 2753/2753 [00:00<00:00, 2473633.01it/s]


In [85]:
df_true['String_2_preprocessed'] = tqdm(df_true['String-2'].apply(preprocess))

100%|██████████| 2753/2753 [00:00<00:00, 2953176.19it/s]


In [86]:
df_true.columns

Index(['Unnamed: 0', 'Quality', 'String-1', 'String-2',
       'String_1_preprocessed', 'String_2_preprocessed'],
      dtype='object')

In [87]:
df_true = df_true.loc[:,['Quality','String_1_preprocessed','String_2_preprocessed']]
df_true

Unnamed: 0,Quality,String_1_preprocessed,String_2_preprocessed
0,1,amrozi accused brother called witness delibera...,referring witness amrozi accused brother delib...
2,1,published advertisement internet june offering...,june ship owner published advertisement intern...
4,1,stock rose percent close friday new york stock...,pge corp share jumped percent new york stock e...
5,1,revenue first quarter year dropped percent per...,scandal hanging stewart company revenue first ...
7,1,dvdcca appealed state supreme court,dvd cca appealed decision yous supreme court
...,...,...,...
4069,1,knox county health department following nation...,health department spokesperson added departmen...
4070,1,new rule allow single company tv station reach...,changed national ownership limit allows compan...
4071,1,point mr brando announced somebody ought put b...,brando said somebody ought put bullet head acc...
4073,1,concluded outlook price stability medium term ...,statement ecb said outlook price stability med...


In [88]:
df_true = df_true.iloc[:1500,:]
df_true.to_csv(path_or_buf=os.path.join(data_dir,'df_true_preprocessed_1.csv'))

In [89]:
df_false['String_1_preprocessed'] = tqdm(df_false['String-1'].apply(preprocess))

100%|██████████| 1323/1323 [00:00<?, ?it/s]


In [90]:
df_false['String_2_preprocessed'] = tqdm(df_false['String-2'].apply(preprocess))

100%|██████████| 1323/1323 [00:00<?, ?it/s]


In [91]:
df_false = df_false.loc[:,['Quality','String_1_preprocessed','String_2_preprocessed']]
df_false

Unnamed: 0,Quality,String_1_preprocessed,String_2_preprocessed
1,0,yucaipa owned dominick selling chain safeway b...,yucaipa bought dominick million sold safeway b...
3,0,around gmt tab share cent earlier set record high,tab share jumped cent set record closing high
6,0,nasdaq weekly gain percent closing friday,techlaced nasdaq composite ixic rallied point ...
8,0,compared million cent per share yearago period,earnings affected nonrecurring million tax ben...
10,0,share genentech much larger company several pr...,share xoma fell percent early trade share gene...
...,...,...,...
4058,0,mr mcdevitt granted control three crucial aspe...,mr mcdevitt granted control three aspect polic...
4059,0,linksys overtook cisco system leading wireless...,rolfe said linksys overtook cisco system last ...
4060,0,rt jones analyst juli niemann said grant one p...,good reputation rt jones analyst juli niemann ...
4072,0,martin freed today serving two third fiveyear ...,martin served two third fiveyear sentence mans...


In [92]:
df_final = pd.concat([df_true,df_false],axis=0)
df_final = df_final.reset_index()
df_final.to_csv(os.path.join(data_dir,'df_final.csv'))

In [3]:
df_final = pd.read_csv(os.path.join(data_dir,'df_final.csv'),index_col=0).drop(columns=['index'],axis=1)
df_final.head(10)

Unnamed: 0,Quality,String_1_preprocessed,String_2_preprocessed
0,1,amrozi accused brother called witness delibera...,referring witness amrozi accused brother delib...
1,1,published advertisement internet june offering...,june ship owner published advertisement intern...
2,1,stock rose percent close friday new york stock...,pge corp share jumped percent new york stock e...
3,1,revenue first quarter year dropped percent per...,scandal hanging stewart company revenue first ...
4,1,dvdcca appealed state supreme court,dvd cca appealed decision yous supreme court
5,1,said foodservice pie business fit company long...,foodservice pie business fit longterm growth s...
6,1,added group performance would improve second h...,de sole said result statement group performanc...
7,1,told sun newspaper mr hussein daughter british...,saddam daughter british school hospital mind d...
8,1,sheena young child national infertility suppor...,sheena young spokesman child national infertil...
9,1,new finder put user folder hard drive network ...,panther redesigned finder navigation tool put ...


In [4]:
df_final.dropna(inplace=True)

In [5]:
len(df_final)

2823

In [6]:
x = df_final.iloc[:,1:]
y = df_final.iloc[:,0]

In [7]:
x

Unnamed: 0,String_1_preprocessed,String_2_preprocessed
0,amrozi accused brother called witness delibera...,referring witness amrozi accused brother delib...
1,published advertisement internet june offering...,june ship owner published advertisement intern...
2,stock rose percent close friday new york stock...,pge corp share jumped percent new york stock e...
3,revenue first quarter year dropped percent per...,scandal hanging stewart company revenue first ...
4,dvdcca appealed state supreme court,dvd cca appealed decision yous supreme court
...,...,...
2818,mr mcdevitt granted control three crucial aspe...,mr mcdevitt granted control three aspect polic...
2819,linksys overtook cisco system leading wireless...,rolfe said linksys overtook cisco system last ...
2820,rt jones analyst juli niemann said grant one p...,good reputation rt jones analyst juli niemann ...
2821,martin freed today serving two third fiveyear ...,martin served two third fiveyear sentence mans...


In [104]:
x_train = [(df_final['String_1_preprocessed'][i]+ "<EOS>" + df_final['String_2_preprocessed'][i]) for i in range(len(df_final))]

In [105]:
x_train[:5]

['amrozi accused brother called witness deliberately distorting evidence<EOS>referring witness amrozi accused brother deliberately distorting evidence',
 'published advertisement internet june offering cargo sale added<EOS>june ship owner published advertisement internet offering explosive sale',
 'stock rose percent close friday new york stock exchange<EOS>pge corp share jumped percent new york stock exchange friday',
 'revenue first quarter year dropped percent period year earlier<EOS>scandal hanging stewart company revenue first quarter year dropped percent period year earlier',
 'dvdcca appealed state supreme court<EOS>dvd cca appealed decision yous supreme court']

### USING BOW

In [47]:
vectorizer = CountVectorizer()
vectorizer.fit(x['String_1_preprocessed'],x['String_2_preprocessed'])

In [48]:
X_train1 = vectorizer.transform(x['String_1_preprocessed'])
X_train2 = vectorizer.transform(x['String_2_preprocessed'])

In [51]:
def cosine(data1,data2):
    score = cosine_similarity(data1,data2)[0][0]
    return 1 if score>0.60 else 0

In [56]:
df_final_test.head()
x_test = df_final_test.iloc[:,1:]
y_test = df_final_test.iloc[:,0]

In [54]:
X_test1 = vectorizer.transform(x_test['String_1_preprocessed'])
X_test2 = vectorizer.transform(x_test['String_2_preprocessed'])

In [55]:
y_pred = []
for i in range(X_test1.shape[0]):
    y_pred+=[cosine(X_test1[i],X_test2[i])]

In [57]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.52      0.54      0.53       557
           1       0.78      0.76      0.77      1168

    accuracy                           0.69      1725
   macro avg       0.65      0.65      0.65      1725
weighted avg       0.69      0.69      0.69      1725



### USING TF-IDF

In [106]:
tfidf_vectorizer = TfidfVectorizer()

In [107]:
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)

In [108]:
svc = SVC(C=1000,kernel='rbf')

In [109]:
svc.fit(x_train_tfidf,y)

In [110]:
tfidf_vectorizer.get_feature_names_out()

array(['_enough', 'aan', 'aaron', ..., 'zulfiqar', 'zulifquar', 'µguru'],
      dtype=object)

TEST-DATA

In [111]:
quality = []
ID_1 = []
ID_2 = []
STRING_1 = []
STRING_2 = []

text_lines = []

In [112]:
fe = open(os.path.join(data_dir,'msr_paraphrase_test.txt'),'r',encoding='utf8')
columns = fe.readline().removeprefix('\ufeff').removesuffix('\n').split('\t')
# print(columns)
for line in fe:
    text_lines.append(line.strip().split('/n'))
    

In [113]:
for lines in text_lines:
    line = str(lines[0])
    split_lines = line.split('\t')
    quality.append(split_lines[0])
    STRING_1.append(split_lines[3])
    STRING_2.append(split_lines[4])

fe.close()

In [114]:
df_test = pd.DataFrame(np.column_stack([quality,STRING_1,STRING_2]),columns=['Quality','String-1','String-2'])

In [115]:
df_test['Quality'] = df_test['Quality'].astype('int')

In [116]:
df_test.to_csv(os.path.join(data_dir,'test_data.csv'))

In [117]:
df_test_true = df_test[df_test["Quality"]==1]

In [118]:
df_test_false = df_test[df_test["Quality"]==0]

In [119]:
df_test_true['String_1_preprocessed'] = tqdm(df_test_true['String-1'].apply(preprocess))

100%|██████████| 1147/1147 [00:00<?, ?it/s]


In [120]:
df_test_true['String_2_preprocessed'] = tqdm(df_test_true['String-2'].apply(preprocess))

100%|██████████| 1147/1147 [00:00<00:00, 1155624.96it/s]


In [121]:
df_test_true.columns

Index(['Quality', 'String-1', 'String-2', 'String_1_preprocessed',
       'String_2_preprocessed'],
      dtype='object')

In [122]:
df_test_true = df_test_true.loc[:,['Quality','String_1_preprocessed','String_2_preprocessed']]
df_test_true

Unnamed: 0,Quality,String_1_preprocessed,String_2_preprocessed
0,1,pccws chief operating officer mike butcher ale...,current chief operating officer mike butcher g...
1,1,world two largest automaker said yous sale dec...,domestic sale gm ford motor co declined predic...
2,1,according federal center disease control preve...,center disease control prevention said reporte...
5,1,settling company would also assign possible cl...,agreement settling company also assign potenti...
7,1,washington county man may county first human c...,county first human case west nile year confirm...
...,...,...,...
1716,1,gehring waived extradition monday hearing san ...,gehring waived extradition monday hearing sant...
1717,1,advised certain allegation criminal conduct in...,advised certain allegation criminal conduct in...
1719,1,deal approved company board director expected ...,acquisition approved company board director ex...
1723,1,last week power station u owner aes corp walke...,news come draxs american owner aes corp aesn l...


In [124]:
df_test_true.to_csv(os.path.join(data_dir,'df_test_true_preprocessed_1.csv'))

In [125]:
df_test_false['String_1_preprocessed'] = tqdm(df_test_false['String-1'].apply(preprocess))

100%|██████████| 578/578 [00:00<00:00, 287717.51it/s]


In [126]:
df_test_false['String_2_preprocessed'] = tqdm(df_test_false['String-2'].apply(preprocess))

100%|██████████| 578/578 [00:00<00:00, 577628.71it/s]


In [127]:
df_test_false = df_test_false.loc[:,['Quality','String_1_preprocessed','String_2_preprocessed']]
df_test_false

Unnamed: 0,Quality,String_1_preprocessed,String_2_preprocessed
3,0,tropical storm rapidly developed gulf mexico s...,tropical storm rapidly developed gulf mexico s...
4,0,company detail cost replacement repair,company official expect cost replacement work ...
6,0,air commodore quaife said hornet remained thre...,air commodore john quaife said security operat...
9,0,broader standard poor index spx point lower pe...,technologylaced nasdaq composite index ixic po...
13,0,hong kong flat australia singapore south korea...,australia flat singapore percent midday south ...
...,...,...,...
1713,0,planned stay day river crested forecast late l...,lawyer planned stay river start receding
1718,0,crew worked install new culvert prepare highwa...,crew worked install new culvert repave highway...
1720,0,hughes refused rehire hernandez complained equ...,hernandez filed equal employment opportunity c...
1721,0,democrat assembly republican,democrat dominate assembly republican control ...


In [130]:
df_final_test = pd.concat([df_test_true,df_test_false],axis=0)
df_final_test = df_final_test.reset_index().drop(columns=['index'],axis=1)
df_final_test.to_csv(os.path.join(data_dir,'df_final_test.csv'))

In [8]:
df_final_test = pd.read_csv(os.path.join(data_dir,'df_final_test.csv'),index_col=0)

In [9]:
df_final_test.head()

Unnamed: 0,Quality,String_1_preprocessed,String_2_preprocessed
0,1,pccws chief operating officer mike butcher ale...,current chief operating officer mike butcher g...
1,1,world two largest automaker said yous sale dec...,domestic sale gm ford motor co declined predic...
2,1,according federal center disease control preve...,center disease control prevention said reporte...
3,1,settling company would also assign possible cl...,agreement settling company also assign potenti...
4,1,washington county man may county first human c...,county first human case west nile year confirm...


In [10]:
len(df_final_test)

1725

In [11]:
x_test = df_final_test.iloc[:,1:]
y_test = df_final_test.iloc[:,0]

In [136]:
x_test = [(df_final_test['String_1_preprocessed'][i]+ "<EOS>" + df_final_test['String_2_preprocessed'][i]) for i in range(len(df_final_test))]
x_test_tfidf = tfidf_vectorizer.transform(x_test)
y_pred = svc.predict(x_test_tfidf)

In [138]:
print(classification_report(y_pred=y_pred,y_true=y_test))

              precision    recall  f1-score   support

           0       0.48      0.51      0.49       578
           1       0.74      0.72      0.73      1147

    accuracy                           0.65      1725
   macro avg       0.61      0.61      0.61      1725
weighted avg       0.65      0.65      0.65      1725



#### USING FastText

In [12]:
df_final['String_1_preprocessed_split'] = df_final['String_1_preprocessed'].apply(lambda x : x.split())
df_final['String_2_preprocessed_split'] = df_final['String_2_preprocessed'].apply(lambda x : x.split())

df_final_test['String_1_preprocessed_split'] = df_final_test['String_1_preprocessed'].apply(lambda x : x.split())
df_final_test['String_2_preprocessed_split'] = df_final_test['String_2_preprocessed'].apply(lambda x : x.split())

In [13]:
sentences_train = df_final['String_1_preprocessed_split'].tolist() + df_final['String_2_preprocessed_split'].tolist()
sentences_test = df_final_test['String_1_preprocessed_split'].tolist() + df_final_test['String_2_preprocessed_split'].tolist()

In [14]:
ft_model = FastText(sentences=sentences_train,vector_size=100,epochs=50,window=3,workers=4,sg=1,min_count=3)

In [15]:
def get_word_vector_ft(word):
    if word in ft_model.wv:
        return ft_model.wv[word]
    else:
        return np.zeros(100)
def get_sent_vect_ft(sent,maxlen=10):
    sent_len = len(sent)
    sent_vect = []
    if sent_len<maxlen:
        for word in sent:
            sent_vect.append(get_word_vector_ft(word))
        for i in range(maxlen-len(sent_vect)):
            sent_vect.append(np.zeros(100))
    else:
        for i in range(maxlen):
            sent_vect.append(get_word_vector_ft(sent[i]))
    return np.asarray(sent_vect)

In [20]:
df_final['str_1_vector'] = df_final['String_1_preprocessed'].apply(lambda x : get_sent_vect_ft(x))
df_final['str_2_vector'] = df_final['String_2_preprocessed'].apply(lambda x : get_sent_vect_ft(x))

df_final_test['str_1_vector'] = df_final_test['String_1_preprocessed'].apply(lambda x : get_sent_vect_ft(x))
df_final_test['str_2_vector'] = df_final_test['String_2_preprocessed'].apply(lambda x : get_sent_vect_ft(x))

In [29]:
y_pred=[]
sim_scores = []
for i in range(len(df_final_test)):
    sent_1 = df_final_test['str_1_vector'][i]
    sent_2 = df_final_test['str_2_vector'][i]
    sim_score = cosine_similarity(sent_1,sent_2)[0][0]
    sim_scores.append(sim_score)
    if sim_score>0.5:
        y_pred.append(1)
    else:
        y_pred.append(0)

print(classification_report(y_pred=y_pred,y_true=y_test))


              precision    recall  f1-score   support

           0       0.40      0.32      0.36       578
           1       0.69      0.76      0.72      1147

    accuracy                           0.61      1725
   macro avg       0.55      0.54      0.54      1725
weighted avg       0.59      0.61      0.60      1725



### USING SIMILARITY SCORE

#### SENTENCE EMBEDDING -> DOC2VEC

In [18]:
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

In [19]:
x

Unnamed: 0,String_1_preprocessed,String_2_preprocessed
0,amrozi accused brother called witness delibera...,referring witness amrozi accused brother delib...
1,published advertisement internet june offering...,june ship owner published advertisement intern...
2,stock rose percent close friday new york stock...,pge corp share jumped percent new york stock e...
3,revenue first quarter year dropped percent per...,scandal hanging stewart company revenue first ...
4,dvdcca appealed state supreme court,dvd cca appealed decision yous supreme court
...,...,...
2818,mr mcdevitt granted control three crucial aspe...,mr mcdevitt granted control three aspect polic...
2819,linksys overtook cisco system leading wireless...,rolfe said linksys overtook cisco system last ...
2820,rt jones analyst juli niemann said grant one p...,good reputation rt jones analyst juli niemann ...
2821,martin freed today serving two third fiveyear ...,martin served two third fiveyear sentence mans...


In [20]:
x['String_1_split'] = x['String_1_preprocessed'].apply(lambda x : word_tokenize(x))
x['String_2_split'] = x['String_2_preprocessed'].apply(lambda x : word_tokenize(x))

In [21]:
tokenized_sent = x['String_1_split'].tolist() + x['String_2_split'].tolist()

In [154]:
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_sent)]
tagged_data[:5]

[TaggedDocument(words=['amrozi', 'accused', 'brother', 'called', 'witness', 'deliberately', 'distorting', 'evidence'], tags=[0]),
 TaggedDocument(words=['published', 'advertisement', 'internet', 'june', 'offering', 'cargo', 'sale', 'added'], tags=[1]),
 TaggedDocument(words=['stock', 'rose', 'percent', 'close', 'friday', 'new', 'york', 'stock', 'exchange'], tags=[2]),
 TaggedDocument(words=['revenue', 'first', 'quarter', 'year', 'dropped', 'percent', 'period', 'year', 'earlier'], tags=[3]),
 TaggedDocument(words=['dvdcca', 'appealed', 'state', 'supreme', 'court'], tags=[4])]

In [157]:
model = Doc2Vec(tagged_data, vector_size = 20, window = 2, min_count = 1, epochs = 100)

In [162]:
# model.wv

In [172]:
# test_doc = word_tokenize("I had pizza and pasta".lower())
# test_doc_vector = model.infer_vector(test_doc)
# model.docvecs.most_similar(positive = [test_doc_vector])

def get_sent_vector(sent):
    sent = word_tokenize(sent)
    sent_vector = model.infer_vector(sent)
    return sent_vector

def get_sim_score(sent1,sent2):
    return cosine(get_sent_vector(sent1),get_sent_vector(sent2))

In [173]:
df_final_test.head()

Unnamed: 0,Quality,String_1_preprocessed,String_2_preprocessed
0,1,pccws chief operating officer mike butcher ale...,current chief operating officer mike butcher g...
1,1,world two largest automaker said yous sale dec...,domestic sale gm ford motor co declined predic...
2,1,according federal center disease control preve...,center disease control prevention said reporte...
3,1,settling company would also assign possible cl...,agreement settling company also assign potenti...
4,1,washington county man may county first human c...,county first human case west nile year confirm...


In [177]:
y_pred=[]
sim_scores = []
for i in range(len(df_final_test)):
    sent_1 = df_final_test['String_1_preprocessed'][i]
    sent_2 = df_final_test['String_2_preprocessed'][i]
    sim_score = get_sim_score(sent_1,sent_2)
    sim_scores.append(sim_score)
    if sim_score>0.5:
        y_pred.append(1)
    else:
        y_pred.append(0)

print(classification_report(y_pred=y_pred,y_true=y_test))


              precision    recall  f1-score   support

           0       0.50      0.42      0.46       578
           1       0.73      0.79      0.76      1147

    accuracy                           0.67      1725
   macro avg       0.62      0.61      0.61      1725
weighted avg       0.65      0.67      0.66      1725



### SENTENCE EMBEDDING ->SENTENCE BERT

In [8]:
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [24]:
sentences = x['String_1_preprocessed'].tolist() + x['String_2_preprocessed'].tolist()

In [25]:
sentence_embeddings = sbert_model.encode(sentences)

In [29]:
print(sentences[0])
print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))
print('Sample BERT embedding vector - note includes negative values', sentence_embeddings[0])

amrozi accused brother called witness deliberately distorting evidence
Sample BERT embedding vector - length 768
Sample BERT embedding vector - note includes negative values [-4.84880293e-03  6.70211792e-01  9.93917346e-01  4.23760623e-01
  3.49516392e-01  1.50311485e-01  7.54621506e-01 -2.76908726e-01
  4.35927570e-01  1.13792725e-01 -2.95178175e-01  5.54410756e-01
  7.54509494e-02  9.08544123e-01 -1.15884709e+00  7.80411810e-02
  7.81229064e-02 -9.46962535e-01  1.07690468e-01 -3.94617260e-01
 -1.15400357e-02  5.23896003e-03  9.71267596e-02  2.73131490e-01
  2.96102345e-01  6.90533161e-01 -4.71219361e-01 -9.52201664e-01
 -1.55344808e+00  3.86499017e-01  9.34361741e-02  6.51992917e-01
  2.73635029e-03 -1.51803970e-01 -7.50415742e-01  6.34284496e-01
  9.08018231e-01 -5.41589677e-01 -4.60812189e-02  5.35555780e-01
  5.72790980e-01  6.91431701e-01 -3.84634763e-01  1.51890725e-01
 -5.02098083e-01 -1.57897115e-01  4.89452899e-01  9.82423246e-01
  9.17938113e-01 -6.99086368e-01  2.57459700e-

In [30]:
def get_bert_vector(sent):
    vector = sbert_model.encode([sent])[0]
    return vector

In [31]:
def get_bert_sim_score(sent1,sent2):
    sent_1_vector = sbert_model.encode([sent1])[0]
    sent_2_vector = sbert_model.encode([sent2])[0]

    sim_score = cosine(sent_1_vector,sent_2_vector)
    return sim_score

In [None]:
y_pred = []
bert_sim_scores = []
for i in range(len(df_final_test)):
    sent_1 = df_final_test['String_1_preprocessed'][i]
    sent_2 = df_final_test['String_2_preprocessed'][i]
    sim_score = get_bert_sim_score(sent_1,sent_2)
    bert_sim_scores.append(sim_score)
    if sim_score>0.5:
        y_pred.append(1)
    else:
        y_pred.append(0)        

In [44]:
print(classification_report(y_pred=y_pred,y_true=y_test))

              precision    recall  f1-score   support

           0       0.74      0.03      0.06       578
           1       0.67      0.99      0.80      1147

    accuracy                           0.67      1725
   macro avg       0.70      0.51      0.43      1725
weighted avg       0.69      0.67      0.55      1725

