In [102]:
import re, math
from collections import Counter
import numpy as np
import pandas as pd

In [103]:
# Load the question-pair training set from the current working directory.
df = pd.read_csv(filepath_or_buffer='./train.csv')

In [104]:
# Raise the column-width limit so long question text is not truncated, then
# preview the first ten rows.
pd.options.display.max_colwidth = 10000
df.head(n=10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?,0
2,2,5,6,How can I increase the speed of my internet connection while using a VPN?,How can Internet speed be increased by hacking through DNS?,0
3,3,7,8,Why am I mentally very lonely? How can I solve it?,"Find the remainder when [math]23^{24}[/math] is divided by 24,23?",0
4,4,9,10,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?,"I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone and video games?,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Motorolla DCX3400?,How do I hack Motorola DCX3400 for free internet?,0


In [105]:
class Similarity():
    """Bag-of-words similarity measures for comparing two pieces of text.

    The class holds no per-instance state; it groups a tokenizer with a
    cosine and a Jaccard similarity measure.
    """

    # Compiled once when the class is defined instead of on every call.
    _WORD_PATTERN = re.compile(r'\w+')

    # Cosine Similarity
    def cosine_similarity(self, vector1, vector2):
        """Return the cosine similarity of two term-weight vectors.

        Args:
            vector1: mapping from token to numeric weight (e.g. a ``Counter``).
            vector2: mapping from token to numeric weight (e.g. a ``Counter``).

        Returns:
            The unrounded similarity as a float, or ``0.0`` when either
            vector has zero magnitude (for example, when it is empty).
        """
        # Only tokens present in both vectors contribute to the dot product.
        intersection = set(vector1) & set(vector2)
        numerator = sum(vector1[x] * vector2[x] for x in intersection)

        # Squared magnitude of each vector.
        sum1 = sum(weight ** 2 for weight in vector1.values())
        sum2 = sum(weight ** 2 for weight in vector2.values())

        # One square root over the product equals the product of the two
        # magnitudes and avoids compounding two separate rounding errors.
        denominator = math.sqrt(sum1 * sum2)
        if not denominator:
            return 0.0
        # Return the real-valued score; rounding here would collapse every
        # result to 0 or 1 and leave callers nothing to threshold on.
        return numerator / denominator

    def text_to_tokens(self, text):
        """Split ``text`` into a list of ``\\w+`` word tokens.

        Args:
            text: the string to tokenize.

        Returns:
            The matched tokens in order of appearance. Non-string input
            (such as ``None`` or a float NaN standing in for missing text)
            yields an empty list instead of raising ``TypeError``.
        """
        if not isinstance(text, str):
            return []
        return self._WORD_PATTERN.findall(text)

    # Jaccard Similarity
    def jaccard_similarity(self, string1, string2):
        """Return the Jaccard similarity ``|A & B| / |A | B|`` of two inputs.

        Args:
            string1: iterable of hashable items, e.g. a list of tokens.
            string2: iterable of hashable items, e.g. a list of tokens.

        Returns:
            A float between 0.0 and 1.0; ``0.0`` when both inputs are empty,
            because the ratio is undefined for an empty union.

        Note:
            Each input is converted with ``set()``, so passing a raw string
            compares its individual characters rather than its words.
        """
        items1, items2 = set(string1), set(string2)
        union = items1 | items2
        if not union:
            return 0.0
        return len(items1 & items2) / float(len(union))

In [106]:
# Per-row score lists that get_questions() appends to, plus the shared scorer.
cosine_similarity, jaccard_similarity = [], []
similarity = Similarity()

def get_questions():
    """Score every question pair in ``df`` into the module-level result lists.

    For each row, both questions are tokenized with
    ``similarity.text_to_tokens``, turned into ``Counter`` term-frequency
    vectors, and compared. The rounded cosine score is appended to
    ``cosine_similarity`` and the rounded Jaccard score to
    ``jaccard_similarity``, one entry per row in row order.

    Returns:
        None. Results are delivered through the two module-level lists.
    """
    # Empty the accumulators in place so that calling this function again
    # does not append a second copy of the scores, which would no longer
    # match len(df) when the lists are assigned as columns.
    cosine_similarity.clear()
    jaccard_similarity.clear()

    # Only two columns are needed, so iterate them directly rather than
    # building a full Series for every row with iterrows().
    for ques1, ques2 in zip(df['question1'], df['question2']):
        # pandas represents a missing question as float NaN, which the regex
        # tokenizer rejects; treat any non-text value as an empty question.
        ques1 = ques1 if isinstance(ques1, str) else ''
        ques2 = ques2 if isinstance(ques2, str) else ''

        # split words into tokens
        token1 = similarity.text_to_tokens(ques1)
        token2 = similarity.text_to_tokens(ques2)

        # vector space
        vector1 = Counter(token1)
        vector2 = Counter(token2)

        # Cosine Similarity
        cosine = similarity.cosine_similarity(vector1, vector2)
        cosine_similarity.append(round(cosine))

        # Jaccard Similarity
        # The measure divides by the size of the token union, which is zero
        # when neither question produced a token; score that pair as 0.
        if token1 or token2:
            jaccard = similarity.jaccard_similarity(token1, token2)
        else:
            jaccard = 0.0
        jaccard_similarity.append(round(jaccard))

# Compute the scores, then attach each result list to the frame as a column.
get_questions()
for column_name, scores in (('Cosine Similarity', cosine_similarity),
                            ('Jaccard Similarity', jaccard_similarity)):
    df[column_name] = scores

In [107]:
# Preview the first 30 rows, now including the two similarity columns.
df.head(n=30)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,Cosine Similarity,Jaccard Similarity
0,0,1,2,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?,0,1,1
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?,0,1,0
2,2,5,6,How can I increase the speed of my internet connection while using a VPN?,How can Internet speed be increased by hacking through DNS?,0,0,0
3,3,7,8,Why am I mentally very lonely? How can I solve it?,"Find the remainder when [math]23^{24}[/math] is divided by 24,23?",0,0,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",Which fish would survive in salt water?,0,0,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?,"I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?",1,1,0
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone and video games?,0,0,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1,1,0
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0,1,1
9,9,19,20,Motorola (company): Can I hack my Charter Motorolla DCX3400?,How do I hack Motorola DCX3400 for free internet?,0,0,0
