In [2]:
## Import Libraries and Load Data

import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import gensim.downloader as api
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize 
import nltk
import re
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')




In [3]:
# Read the 2009 question/answer CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("dataset/rajyasabha_questions_and_answers_2009.csv")

In [4]:
# Remove every space inside each title so a whole title collapses into one
# unbroken string (the column is overwritten with the stripped version).
data['question_title'] = data['question_title'].str.replace(" ", "")
# Handle on the stripped column, used below to build the training corpus.
descriptions = data['question_title']

<h1>Formatting the Data</h1>
In order to train a word2vec model, all of the description data will need to be concatenated into one giant string.

In [5]:
# Concatenate every title into one string. str.join assembles the result in a
# single pass rather than growing the string with `+=` on each iteration.
# No separator is inserted between consecutive titles.
corpus_raw = "".join(descriptions)
# Load NLTK's pickled English Punkt tokenizer (the resource must already be
# present in the local NLTK data directory).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

Next, we need to tokenize the corpus using NLTK. This step breaks the corpus into an array of sentences; less useful characters such as commas and hyphens are then stripped out when each sentence is split into words. In this way, we are able to train the word2vec model with the context of sentences and relative word placement.

In [6]:
# Split the concatenated corpus into a list of sentence strings.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [7]:
def sentence_to_wordlist(raw):
    """Split a raw sentence into a list of purely alphabetic tokens.

    Every character that is not an ASCII letter (digits, punctuation,
    whitespace, ...) is turned into a space, and the result is split on
    whitespace. Letter case is preserved.

    Args:
        raw: The sentence text to tokenise.

    Returns:
        A list of the maximal runs of ASCII letters found in ``raw``, in
        order of appearance; empty if ``raw`` contains no letters.
    """
    # Blank out non-letters first, then let split() discard the empty gaps.
    return re.sub("[^a-zA-Z]", " ", raw).split()

In [8]:
# Turn each non-empty raw sentence into its list of word tokens.
sentences = [
    sentence_to_wordlist(chunk)
    for chunk in raw_sentences
    if chunk
]
print(sentences)

[['SPURTINPRICESOFGOLD', 'OPENINGOFTRADECENTRESINLATINAMERICANCOUNTRIES', 'EARLYEXITOFCHINESEBUSINESSMENFROMTRADEFAIR', 'DONATIONBYSTCANDMMTCTOSTUDENTWINGSOFPOLITICALPARTIES', 'ENVISAGEDEXPORTEARNINGTARGETS', 'REQUESTSFORDENOTIFYINGAPPROVEDSEZS', 'TRADEBETWEENINDIAANDRUSSIA', 'SEZSINMAHARASHTRA', 'DONATIONBYSTCMMTCTONSUI', 'ANTIDUMPINGCASESREGISTEREDBYDGAD', 'EXPORTOFRICEANDVEGETABLES', 'MECHANISMTOREGULATEIMPORTEDGMPACKAGEDFOODS', 'INDIANPRODUCTSUNCOMPETITIVEINWORLDMARKET', 'RELIEFPACKAGETOEXPORTERSEXTENSIONOFTIMETODEVELOPERSOFSEZS', 'THMINISTERIALCONFERENCEOFWTOATGENEVA', 'SEZSCONVERTEDINTOCOMMERCIALREALESTATEOPERATIONS', 'FINANCIALASSISTANCETOTEAEXPORTERS', 'INTERNATIONALPOLICYNETWORKREPORTSONFAKEMEDICINES', 'NEWFOREIGNTRADEPOLICY', 'MINORITYSTATUSFORCENTRALUNIVERSITIES', 'MODERNIZATIONOFAUTONOMOUSMINORITYCOLLEGES', 'MINIMUMMARKSFORJEE', 'SEATSFORDISADVANTAGEDCHILDRENINMINORITYSCHOOLS', 'VISVABHARATIUNIVERSITY', 'CENTRALUNIVERSITIESINNORTHEASTERNSTATES', 'SETTINGUPOFCENTRALMADARSABO

In [9]:
# Configure an untrained Word2Vec model; no corpus is passed here, so the
# vocabulary is built and training is run explicitly in the following cells.
model = Word2Vec(
    window=1,          # context window of one token on each side
    sg=0,              # 0 selects CBOW rather than skip-gram
    hs=0,              # no hierarchical softmax
    size=10,           # embedding dimensionality
    negative=10,       # for negative sampling
    alpha=0.2,         # initial learning rate
    min_alpha=0.0007,  # learning rate floor
    seed=500,
    min_count=1,       # keep tokens that occur only once
)

In [10]:
# Build the model's vocabulary from the tokenised sentences before training.
model.build_vocab(sentences)

In [11]:
# Show the full token -> Vocab-object mapping held by the model's keyed vectors.
# NOTE(review): this prints every entry; consider showing only a small sample.
print('Word2Vec vocabulary:', model.wv.vocab)

Word2Vec vocabulary: {'SPURTINPRICESOFGOLD': <gensim.models.keyedvectors.Vocab object at 0x121F75B0>, 'OPENINGOFTRADECENTRESINLATINAMERICANCOUNTRIES': <gensim.models.keyedvectors.Vocab object at 0x121F71F0>, 'EARLYEXITOFCHINESEBUSINESSMENFROMTRADEFAIR': <gensim.models.keyedvectors.Vocab object at 0x121F7490>, 'DONATIONBYSTCANDMMTCTOSTUDENTWINGSOFPOLITICALPARTIES': <gensim.models.keyedvectors.Vocab object at 0x121F7610>, 'ENVISAGEDEXPORTEARNINGTARGETS': <gensim.models.keyedvectors.Vocab object at 0x121F7650>, 'REQUESTSFORDENOTIFYINGAPPROVEDSEZS': <gensim.models.keyedvectors.Vocab object at 0x121F7690>, 'TRADEBETWEENINDIAANDRUSSIA': <gensim.models.keyedvectors.Vocab object at 0x121F76B0>, 'SEZSINMAHARASHTRA': <gensim.models.keyedvectors.Vocab object at 0x121F76D0>, 'DONATIONBYSTCMMTCTONSUI': <gensim.models.keyedvectors.Vocab object at 0x121F7630>, 'ANTIDUMPINGCASESREGISTEREDBYDGAD': <gensim.models.keyedvectors.Vocab object at 0x121F7670>, 'EXPORTOFRICEANDVEGETABLES': <gensim.models.keyed

In [12]:
# Look up one title in the vocabulary and print its `.index` attribute.
# NOTE(review): the label says "vocabulary" but the value printed is that
# token's integer index, not the vocabulary itself.
print('Word2Vec vocabulary:', model.wv.vocab['INCOMETAXSLABSFORINDUSTRIES'].index)

Word2Vec vocabulary: 190


In [13]:
# Train on the tokenised sentences. `corpus_count` was recorded by build_vocab;
# `epochs` is the model's configured number of passes (the older `iter`
# attribute is a deprecated alias for it and is absent from gensim 4).
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

(600, 970)

In [15]:
# Query nearest neighbours through the KeyedVectors object (`model.wv`);
# calling most_similar directly on the model is deprecated and is removed in
# gensim 4. The result is a list of (token, cosine similarity) pairs.
similar_titles = model.wv.most_similar('INCOMETAXSLABSFORINDUSTRIES')
print(similar_titles)

[('BANNINGCRIMINALSANDCORRUPTSPERSONSFROMELECTIONS', 0.7694000601768494), ('TRADEBETWEENINDIAANDRUSSIA', 0.7536619901657104), ('DISINVESTMENTSINPSUSPENALTYFORREPAYINGLOANINADVANCE', 0.6195030808448792), ('ENVISAGEDEXPORTEARNINGTARGETS', 0.6084926128387451), ('DONATIONBYSTCANDMMTCTOSTUDENTWINGSOFPOLITICALPARTIES', 0.5980116724967957), ('ISSUEOFVOTERIDENTITYCARDS', 0.5798826217651367), ('REMOVALOFTOXICWASTEGENERATEDBYUNIONCARBIDEPRICEVARIATIONOFDRUGSAIDFORBHOPALGASTRAGEDYCOMMONCODEOFETHICSFORPHARMACEUTICALINDUSTRY', 0.5682671070098877), ('UNHYGIENICPACKAGEDDRINKINGWATERBOTTLES', 0.5594140887260437), ('CORPORATELOANSFACILITYFORRURALPEOPLE', 0.5582634210586548), ('STATUSOFLAWCOMMISSIONSREPORTS', 0.52913498878479)]
