In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
import os

# Importing Dataset

In [3]:
# Load the text-pair dataset (columns `text1` and `text2` are used below).
# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv('Text_Similarity_Dataset.csv')

In [4]:
df

Unnamed: 0,text1,text2
0,broadband challenges tv viewing the number of ...,gardener wins double in glasgow britain s jaso...
1,rap boss arrested over drug find rap mogul mar...,amnesty chief laments war failure the lack of ...
2,player burn-out worries robinson england coach...,hanks greeted at wintry premiere hollywood sta...
3,hearts of oak 3-2 cotonsport hearts of oak set...,redford s vision of sundance despite sporting ...
4,sir paul rocks super bowl crowds sir paul mcca...,mauresmo opens with victory in la amelie maure...
...,...,...
2995,uk directors guild nominees named martin scors...,steel firm to cut 45 000 jobs mittal steel ...
2996,u2 to play at grammy awards show irish rock ba...,israel looks to us for bank chief israel has a...
2997,pountney handed ban and fine northampton coach...,india and iran in gas export deal india has si...
2998,belle named best scottish band belle & sebas...,mido makes third apology ahmed mido hossam h...


# Importing libraries for text preprocessing

In [5]:
import re
import nltk

In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [7]:
from nltk.stem.porter import PorterStemmer

# Text Preprocessing

In [120]:
# Preprocessing for the first row of the dataset

In [8]:
# First text of row 0: every character that is not a letter (digits,
# punctuation, symbols) becomes a space, the result is lower-cased, and the
# cleaned string is split into word tokens.
X = word_tokenize(
    re.sub('[^a-zA-Z]', ' ', df['text1'][0]).lower()
)

In [9]:
# Second text of row 0, cleaned the same way as the first one: non-letters
# replaced by spaces, lower-cased, then tokenized.
Y = word_tokenize(
    re.sub('[^a-zA-Z]', ' ', df['text2'][0]).lower()
)

In [10]:
# English stop words from the NLTK corpus, used to filter the tokens.
sw = stopwords.words('english')

# Porter stemmer: ps.stem() reduces a word to its stem, e.g. "playing" -> "play".
ps = PorterStemmer()

# Containers for the two binary keyword vectors that are filled further down.
l1, l2 = [], []

In [11]:
# remove stop words from the string
X_set = {ps.stem(w) for w in X if not w in sw} 
Y_set = {ps.stem(w) for w in Y if not w in sw}

In [12]:
X_set

{'abil',
 'access',
 'accord',
 'adapt',
 'ago',
 'almost',
 'alway',
 'analyst',
 'audienc',
 'away',
 'beauvillian',
 'behind',
 'big',
 'biggest',
 'britain',
 'broadband',
 'broken',
 'brows',
 'challeng',
 'chang',
 'choic',
 'close',
 'compani',
 'consum',
 'content',
 'continu',
 'countri',
 'doubl',
 'download',
 'eat',
 'encourag',
 'enhanc',
 'entertain',
 'europ',
 'european',
 'expect',
 'explod',
 'face',
 'fast',
 'favour',
 'file',
 'film',
 'five',
 'found',
 'franc',
 'fuell',
 'gabriel',
 'game',
 'germani',
 'greatest',
 'grow',
 'growth',
 'habit',
 'high',
 'home',
 'hook',
 'household',
 'implic',
 'inform',
 'internet',
 'itali',
 'jump',
 'jupit',
 'keep',
 'less',
 'long',
 'lower',
 'made',
 'major',
 'mani',
 'mark',
 'market',
 'meant',
 'media',
 'million',
 'month',
 'music',
 'need',
 'net',
 'netrat',
 'new',
 'newspap',
 'next',
 'nielsen',
 'number',
 'olivi',
 'one',
 'onlin',
 'page',
 'particular',
 'past',
 'peopl',
 'plan',
 'play',
 'popul',
 'po

In [13]:
Y_set

{'abraham',
 'add',
 'adrift',
 'ahead',
 'alexi',
 'also',
 'american',
 'anoth',
 'appear',
 'athen',
 'back',
 'beat',
 'behind',
 'believ',
 'best',
 'botch',
 'break',
 'brett',
 'britain',
 'british',
 'buckfield',
 'came',
 'catherin',
 'champion',
 'chanc',
 'changeov',
 'chri',
 'claim',
 'claxton',
 'clear',
 'clearanc',
 'close',
 'closer',
 'cm',
 'come',
 'competit',
 'cost',
 'coupl',
 'cruis',
 'daniel',
 'djhone',
 'doubl',
 'edg',
 'end',
 'enjoy',
 'event',
 'excit',
 'expect',
 'fade',
 'feat',
 'fedorova',
 'final',
 'finish',
 'first',
 'fit',
 'fourth',
 'franc',
 'garden',
 'get',
 'glasgow',
 'gold',
 'got',
 'grab',
 'green',
 'hard',
 'held',
 'high',
 'hold',
 'holm',
 'home',
 'hope',
 'hurdl',
 'import',
 'indoor',
 'injuri',
 'intern',
 'invit',
 'irina',
 'itali',
 'jade',
 'jame',
 'jason',
 'jenni',
 'johnson',
 'jump',
 'kallur',
 'kick',
 'lambert',
 'last',
 'leap',
 'led',
 'lesli',
 'long',
 'lot',
 'luca',
 'maduaka',
 'major',
 'make',
 'mark',
 

In [14]:
# form a set containing keywords of both strings 
rvector = X_set.union(Y_set) 
for w in rvector:
    if w in X_set: l1.append(1) # create a vector
    else: l1.append(0)
    if w in Y_set: l2.append(1)
    else: l2.append(0)

In [15]:
len(l1)

302

In [16]:
len(l2)

302

In [17]:
# Cosine similarity between the binary vectors l1 and l2:
#     cos = (l1 . l2) / (|l1| * |l2|)
# For 0/1 vectors the squared norm equals the number of ones, so sum(v) is |v|**2.
c = sum(a * b for a, b in zip(l1, l2))
norm_product = float((sum(l1) * sum(l2)) ** 0.5)

# An empty keyword set yields an all-zero vector; report 0.0 rather than
# failing with ZeroDivisionError.
cosine = c / norm_product if norm_product else 0.0
print("similarity: ", cosine)

similarity:  0.14267836335553657


In [None]:
#Applying the above operations on the whole dataframe

In [18]:
def count_frequency(word_list):
    """Count how often each item occurs in ``word_list``.

    Args:
        word_list: Iterable of hashable items (typically word strings).

    Returns:
        dict mapping each distinct item to its number of occurrences, with
        keys in order of first appearance.
    """
    counts = {}
    for word in word_list:
        # Missing keys start from 0, so the first sighting stores 1.
        counts[word] = counts.get(word, 0) + 1
    return counts

In [19]:
import math

# Vector helpers used to compute the cosine similarity score.

def dotproduct(v1, v2):
    """Return the dot product of two numeric sequences.

    Elements are paired with ``zip``, so trailing elements of the longer
    sequence are ignored when the lengths differ.
    """
    return sum((a*b) for a, b in zip(v1, v2))

def length(v):
    """Return the Euclidean norm of ``v`` (square root of ``v . v``)."""
    return math.sqrt(dotproduct(v, v))

def angle(v1, v2):
    """Return the angle between ``v1`` and ``v2`` in radians.

    The cosine is taken from ``cosine_angle``, which is clamped to [-1, 1], so
    floating-point rounding cannot push ``math.acos`` outside its domain. If
    either vector has zero length the cosine is 0.0 and the result is pi/2.
    """
    return math.acos(cosine_angle(v1, v2))

def cosine_angle(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    Returns:
        float in [-1.0, 1.0]. When either vector has zero length (e.g. it is
        empty or all zeros) the similarity is undefined; 0.0 is returned
        instead of raising ZeroDivisionError.
    """
    norm_product = length(v1) * length(v2)
    if norm_product == 0:
        return 0.0
    # Rounding can yield values such as 1.0000000000000002 for parallel
    # vectors; clamp so the result is always a valid cosine.
    return max(-1.0, min(1.0, dotproduct(v1, v2) / norm_product))

In [19]:
#Creating a for loop for preprocessing and similarity score calculation of whole dataframe

In [20]:
# Stop words and stemmer shared by every row.
sw = stopwords.words('english')
ps = PorterStemmer()
# Set membership is O(1); testing against the list costs O(len(sw)) per token.
stop_word_set = set(sw)


def _stem_set(text):
    """Return the set of stemmed, non-stop-word tokens found in ``text``.

    Non-letter characters are replaced by spaces, the text is lower-cased and
    tokenized, stop words are dropped and the remaining tokens are stemmed.
    """
    tokens = word_tokenize(re.sub('[^a-zA-Z]', ' ', text).lower())
    return {ps.stem(w) for w in tokens if w not in stop_word_set}


# Similarity score of every row, keyed by the row's index label, so the
# results remain available after the loop instead of only being printed.
similarity_scores = {}

# Iterate over the rows that actually exist rather than a hard-coded 3000;
# each tuple is (index label, text1, text2).
for i, text1, text2 in df[['text1', 'text2']].itertuples(name=None):
    X_set = _stem_set(text1)
    Y_set = _stem_set(text2)

    # Binary occurrence vectors over the union vocabulary of both texts.
    rvector = X_set.union(Y_set)
    l1 = [1 if w in X_set else 0 for w in rvector]
    l2 = [1 if w in Y_set else 0 for w in rvector]

    # A text with no keywords left gives an all-zero vector whose cosine is
    # undefined; score it 0.0 instead of dividing by zero.
    if X_set and Y_set:
        similarity = cosine_angle(l1, l2)
    else:
        similarity = 0.0
    similarity_scores[i] = similarity

    print('Row index: ', i)
    print('Similarity: ', similarity)

Row index:  0
Similarity:  0.14267836335553657
Row index:  1
Similarity:  0.0674199862463242
Row index:  2
Similarity:  0.07297211359952455
Row index:  3
Similarity:  0.07644974551833317
Row index:  4
Similarity:  0.134637813499663
Row index:  5
Similarity:  0.11155885718343433
Row index:  6
Similarity:  0.03820455033741685
Row index:  7
Similarity:  0.09363006756991821
Row index:  8
Similarity:  0.04195315943520017
Row index:  9
Similarity:  0.0849250307237733
Row index:  10
Similarity:  0.13705061117171075
Row index:  11
Similarity:  0.09415130835240083
Row index:  12
Similarity:  0.15302805979888587
Row index:  13
Similarity:  0.1563215837502731
Row index:  14
Similarity:  0.08422919413386756
Row index:  15
Similarity:  0.06834861261734088
Row index:  16
Similarity:  0.2393851149226017
Row index:  17
Similarity:  0.2839536610926862
Row index:  18
Similarity:  0.10333567582603505
Row index:  19
Similarity:  0.0894141111609582
Row index:  20
Similarity:  0.10836965077746659
Row index: