In [1]:
import sys
import pandas as pd
import math
import re
import numpy as np
from collections import Counter
import gensim 
import logging
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import brown
from itertools import product
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')



In [2]:
data=pd.read_csv("dataset/rajyasabha_questions_and_answers_2009.csv")

In [3]:
titles=data['question_title']
for i in range(10):
    print('Title No '+str(i+1)+'\n')
    print(titles[i]+'\n')

Title No 1

SPURT IN PRICES OF GOLD .

Title No 2

OPENING OF TRADE CENTRES IN LATIN AMERICAN COUNTRIES .

Title No 3

EARLY EXIT OF CHINESE BUSINESSMEN FROM TRADE FAIR .

Title No 4

DONATION BY STC AND MMTC TO STUDENT WINGS OF POLITICAL PARTIES .

Title No 5

ENVISAGED EXPORT EARNING TARGETS .

Title No 6

REQUESTS FOR DENOTIFYING APPROVED SEZS .

Title No 7

TRADE BETWEEN INDIA AND RUSSIA .

Title No 8

SEZS IN MAHARASHTRA .

Title No 9

DONATION BY STC MMTC TO NSUI .

Title No 10

ANTI DUMPING CASES REGISTERED BY DGAD .



In [4]:
ALPHA = 0.2
BETA = 0.45
ETA = 0.4
PHI = 0.2
DELTA = 0.85

In [5]:
def get_best_synset_pair(word_1, word_2):
    
    max_sim = -1.0
    synsets_1 = wn.synsets(word_1)
    synsets_2 = wn.synsets(word_2)
    if len(synsets_1) == 0 or len(synsets_2) == 0:
        return None, None
    else:
        max_sim = -1.0
        best_pair = None, None
        for synset_1 in synsets_1:
            for synset_2 in synsets_2:
               sim = wn.path_similarity(synset_1, synset_2)
               if sim!=None and sim > max_sim:
                   max_sim = sim
                   best_pair = synset_1, synset_2
        return best_pair

In [6]:
def length_dist(synset_1, synset_2):
    
    l_dist = sys.maxsize
    if synset_1 is None or synset_2 is None: 
        return 0.0
    if synset_1 == synset_2:
        # if synset_1 and synset_2 are the same synset return 0
        l_dist = 0.0
    else:
        wset_1 = set([str(x.name()) for x in synset_1.lemmas()])        
        wset_2 = set([str(x.name()) for x in synset_2.lemmas()])
        if len(wset_1.intersection(wset_2)) > 0:
            # if synset_1 != synset_2 but there is word overlap, return 1.0
            l_dist = 1.0
        else:
            # just compute the shortest path between the two
            l_dist = synset_1.shortest_path_distance(synset_2)
            if l_dist is None:
                l_dist = 0.0
    # normalize path length to the range [0,1]
    return math.exp(-ALPHA * l_dist)

In [7]:
def hierarchy_dist(synset_1, synset_2):
    
    h_dist = sys.maxsize
    if synset_1 is None or synset_2 is None: 
        return h_dist
    if synset_1 == synset_2:
        # return the depth of one of synset_1 or synset_2
        h_dist = max([x[1] for x in synset_1.hypernym_distances()])
    else:
        # find the max depth of least common subsumer
        hypernyms_1 = {x[0]:x[1] for x in synset_1.hypernym_distances()}
        hypernyms_2 = {x[0]:x[1] for x in synset_2.hypernym_distances()}
        lcs_candidates = set(hypernyms_1.keys()).intersection(
            set(hypernyms_2.keys()))
        if len(lcs_candidates) > 0:
            lcs_dists = []
            for lcs_candidate in lcs_candidates:
                lcs_d1 = 0
                if lcs_candidate in hypernyms_1:
                    lcs_d1 = hypernyms_1[lcs_candidate]
                lcs_d2 = 0
                if lcs_candidate in hypernyms_2:
                    lcs_d2 = hypernyms_2[lcs_candidate]
                lcs_dists.append(max([lcs_d1, lcs_d2]))
            h_dist = max(lcs_dists)
        else:
            h_dist = 0
    return ((math.exp(BETA * h_dist) - math.exp(-BETA * h_dist)) / 
        (math.exp(BETA * h_dist) + math.exp(-BETA * h_dist)))
    

In [8]:
def word_similarity(word_1, word_2):
    synset_pair = get_best_synset_pair(word_1, word_2)
    return (length_dist(synset_pair[0], synset_pair[1]) * 
        hierarchy_dist(synset_pair[0], synset_pair[1]))


In [9]:
def most_similar_word(word, word_set):
    max_sim = -1.0
    sim_word = ""
    for ref_word in word_set:
      sim = word_similarity(word, ref_word)
      if sim > max_sim:
          max_sim = sim
          sim_word = ref_word
    return sim_word, max_sim

In [10]:
brown_freqs = dict()
N=0
def info_content(lookup_word):
    
    global N
    if N == 0:
        # poor man's lazy evaluation
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if word not in brown_freqs:
                    brown_freqs[word] = 0
                brown_freqs[word] = brown_freqs[word] + 1
                N = N + 1
    lookup_word = lookup_word.lower()
    n = 0 if lookup_word not in brown_freqs else brown_freqs[lookup_word]
    return 1.0 - (math.log(n + 1) / math.log(N + 1))


In [11]:
def semantic_vector(words, joint_words, info_content_norm):
    sent_set = set(words)
    semvec = np.zeros(len(joint_words))
    i = 0
    for joint_word in joint_words:
        if joint_word in sent_set:
            # if word in union exists in the sentence, s(i) = 1 (unnormalized)
            semvec[i] = 1.0
            if info_content_norm:
                semvec[i] = semvec[i] * math.pow(info_content(joint_word), 2)
        else:
            # find the most similar word in the joint set and set the sim value
            sim_word, max_sim = most_similar_word(joint_word, sent_set)
            semvec[i] = PHI if max_sim > PHI else 0.0
            if info_content_norm:
                semvec[i] = semvec[i] * info_content(joint_word) * info_content(sim_word)
        i = i + 1
    return semvec       

In [14]:
WORD = re.compile(r"\w+")

def get_cosine(sentence_1, sentence_2):
    sentence_1 = re.sub('[^A-Za-z0-9\s]', '', sentence_1).lower()
    sentence_2 = re.sub('[^A-Za-z0-9\s]', '', sentence_2).lower()
    info_content_norm = True
    words_1 = nltk.word_tokenize(sentence_1)
    words_2 = nltk.word_tokenize(sentence_2)
    joint_words = set(words_1).union(set(words_2))
    vec_1 = semantic_vector(words_1, joint_words, info_content_norm)
    vec_2 = semantic_vector(words_2, joint_words, info_content_norm)
    return np.dot(vec_1, vec_2.T) / (np.linalg.norm(vec_1) * np.linalg.norm(vec_2))


In [15]:
text1 = "This is a foo bar sentence ."
text2 = "This sentence is similar to a foo bar sentence ."

#vector1 = text_to_vector(text1)
#vector2 = text_to_vector(text2)

cosine = get_cosine(text1, text2)

print("Cosine:", cosine)

Cosine: 0.9497113451849513


In [16]:
def get_relevant_indices(query,title_list):
    
    cosine_list=np.empty(len(title_list))
    for i in range(len(title_list)):
        cosine_val_i=get_cosine(query,title_list[i])
        cosine_list[i]=cosine_val_i
    
    relevant_indices=cosine_list.argsort()[-10:][::-1]
    return relevant_indices

In [17]:
def get_relevant_titles(query):
    titles=data['question_title'].values.tolist()
    relevant_indices=get_relevant_indices(query,titles)
    relevant_titles=[]
    for i in range(len(relevant_indices)):
        relevant_titles.append(titles[relevant_indices[i]])
    
    relevant_titles_df=pd.DataFrame({'Indices':relevant_indices,
                                    'Titles':relevant_titles})
    return relevant_titles_df

In [18]:
relevant_titles=get_relevant_titles("SEZS IN MAHARASHTRA")
print(relevant_titles)

print('\n\n For better visualization')
print('\n\nRelevant Indices: \n')
print(relevant_titles['Indices'].values)

print('\nRelevant Titles:\n')
print(relevant_titles['Titles'].values)

   Indices                                             Titles
0        7                              SEZS IN MAHARASHTRA .
1       98               PROBLEMS OF FARMERS OF MAHARASHTRA .
2       14          EXTENSION OF TIME TO DEVELOPERS OF SEZS .
3       16  SEZS CONVERTED INTO COMMERCIAL REAL ESTATE OPE...
4        5           REQUESTS FOR DENOTIFYING APPROVED SEZS .
5      171             IMPROVEMENT OF ROAD NETWORK IN GUJARAT
6      161                      SETTING UP OF FAMILY COURTS .
7      240                    SETTING UP OF FAST TRACK COURTS
8       55                           PRICE VARIATION OF DRUGS
9      199  . OPENING OF RETAIL CENTRES BY CORPORATE HOUSE...


 For better visualization


Relevant Indices: 

[  7  98  14  16   5 171 161 240  55 199]

Relevant Titles:

['SEZS IN MAHARASHTRA .' 'PROBLEMS OF FARMERS OF MAHARASHTRA .'
 'EXTENSION OF TIME TO DEVELOPERS OF SEZS .'
 'SEZS CONVERTED INTO COMMERCIAL REAL ESTATE OPERATIONS .'
 'REQUESTS FOR DENOTIFYING APPROVED SEZ