Word embedding (word2vec) with Gensim. Train our own vectors.

In [1]:
#https://radimrehurek.com/gensim/models/word2vec.html

In [2]:
from gensim.test.utils import common_texts

In [3]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [4]:
from gensim.models import Word2Vec
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")
#If you save the model you can continue training it later:
#model = Word2Vec.load("word2vec.model")

In [5]:
vector = model.wv['computer']

In [6]:
sim = model.wv.most_similar('computer', topn=10)

In [7]:
sim

[('system', 0.21617139875888824),
 ('survey', 0.04468922317028046),
 ('interface', 0.015203381888568401),
 ('time', 0.0019510635174810886),
 ('trees', -0.03284316882491112),
 ('human', -0.07424270361661911),
 ('response', -0.09317591041326523),
 ('graph', -0.09575342386960983),
 ('eps', -0.10513808578252792),
 ('user', -0.16911619901657104)]

### Challenge: What are the top ten words mentioned by Biden in the 105 congress (after stopwords removal)? For each most frequent word, find the 10 most simlar words generated using word2vec. Find the most frequent bigrams in the text. Explore some bigrams and figure out if the tokens appear in the list of most similar words.

In [8]:
#https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92
#https://github.com/zhlli1/Genism-word2vec/blob/master/Genism%20Word2Vec%20Tutorial.ipynb
#

In [63]:
def format_congress_text(text_list):
    df = pd.DataFrame(text_list, columns=["text_raw"])

    df["text_raw"] = df["text_raw"].str.split("\n</TEXT>\n</DOC>\n\n<DOC>\n<DOCNO>")

    df = df.explode("text_raw")

    df["text_raw"] = df["text_raw"].str.replace("<DOC>\n<DOCNO>", "")

    df["meta"] = df["text_raw"].str.split("</DOCNO>\n<TEXT>\n").str[0]
    df["congress"] = df["meta"].str[:3]
    df["speaker"] = df["meta"].str.split("-").str[1]
    df["state"] = df["meta"].str.split("-").str[2]
    df["date"] = df["meta"].str.split("-").str[4]

    df["text_raw"] = (
        df["text_raw"]
        .str.split("</DOCNO>\n<TEXT>\n")
        .str[1]
        .str.strip()
        .str.split(".")
        .str[2:]
        .str.join(sep="")
        .str.strip()
    )

    df["text_raw"] = df["text_raw"] + " "

    # now join this back to congress / speaker level

    df = df.groupby(["congress", "speaker"])["text_raw"].sum().reset_index()

    # drop if a congressman did not speek
    df = df.loc[lambda x: x["text_raw"].apply(type) == str]
    return df

In [64]:
import os
import pandas as pd
import string
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize
import warnings
from tqdm.notebook import tqdm
warnings.filterwarnings("ignore")
tqdm.pandas()
droplist = open("ML-for-NLP-main/Inputs/droplist.txt", encoding="utf-8", newline="\n").read()
droplist = [i.replace('"', "") for i in droplist.split("\n")]
stop_words = stopwords.words("english")
stopwords_final = droplist + stop_words + ['s','nt', "n't", "'s", "--"]

In [65]:
text_105 = [
    open("ML-for-NLP-main/Inputs/105-extracted-date/" + i, encoding="latin").read()
    for i in os.listdir("ML-for-NLP-main/Inputs/105-extracted-date/")
]

df = format_congress_text(text_105)

In [75]:
from nltk.tokenize import word_tokenize
from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer('s$|ies$')
def preprocessing_text(text):
    words = word_tokenize(text.lower())
    tokens = [word for word in words if word not in string.punctuation]
    tokens = [token for token in tokens if token not in stopwords_final]
    stemmer = RegexpStemmer('s$|ies$') 
    tokens_lematized = [stemmer.stem(word) for word in tokens]
    preprocessed_text = " ".join(tokens_lematized)
    return preprocessed_text

In [67]:
df

Unnamed: 0,congress,speaker,text_raw
0,105,abraham,"Mr President, during debate on final passage o..."
1,105,akaka,"Mr President, I am pleased that the Senate pas..."
2,105,allard,"Mr President, I rise to make a few remarks con..."
3,105,ashcroft,"Mr President, the Senate is not in order I wou..."
4,105,baucus,I understand that the House has sent the Senat...
...,...,...,...
94,105,thurmond,"Mr President, as the Senate considers HR 2263,..."
95,105,torricelli,"Mr President I thank Senator Snowe, Senator Mc..."
96,105,warner,"During the past two weeks, the Senate Armed Se..."
97,105,wellstone,"Mr President, today, I would like to call atte..."


In [73]:
"""listx = df["text_raw"].values.tolist()"""

In [77]:
"""
import re
sentences = []
all_sentences=[]
for a in range(len(listx)):
    sentences = [re.sub(pattern=r"[\!'#$%&\*+,-./:;<=>?@^_`()|~=]", 
                        repl='', 
                        string=x
                       ).lower().strip().split(' ') for x in listx[a].split('\n')]
    sentences = [x for x in sentences if x != ['']]
    listx[a] = sentences
    all_sentences.append(sentences)
"""

In [78]:
"""listx[-1][-3]"""

['since',
 'oregon',
 'does',
 'not',
 'recognize',
 'common',
 'law',
 'marriage',
 'and',
 'ms',
 'barbeauquinn',
 'was',
 'not',
 'married',
 'the',
 '2',
 'years',
 'required',
 'by',
 'immigration',
 'law',
 'she',
 'has',
 'not',
 'been',
 'able',
 'to',
 'file',
 'for',
 'permanent',
 'residency',
 'in',
 'this',
 'country',
 'while',
 'i',
 'do',
 'not',
 'intend',
 'to',
 'introduce',
 'many',
 'private',
 'relief',
 'bills',
 'because',
 'of',
 'senator',
 'hatfields',
 'involvement',
 'in',
 'this',
 'matter',
 'and',
 'ms',
 'barbeauquinns',
 'compelling',
 'case',
 'i',
 'think',
 'it',
 'is',
 'appropriate',
 'that',
 'the',
 'senate',
 'pass',
 'legislation',
 'to',
 'ensure',
 'that',
 'ms',
 'barbeauquinn',
 'remains',
 'a',
 'member',
 'of',
 'the',
 'portland',
 'community',
 'for',
 'many',
 'years',
 'to',
 'come']

In [110]:
"""listx = df["text_raw"].values.tolist()"""

In [111]:
"""listxx=[]
for a in listx:
    for b in a:
        wordList = re.sub("[^\w]", " ",  preprocessing_text(b)).split()
        listxx.append(wordList)"""

KeyboardInterrupt: 

In [None]:
#len(listxx)

In [84]:
biden=df.loc[df["speaker"] == "biden", "text_raw"].item()

In [85]:
biden



In [89]:
len(biden)

1776118

In [92]:
import re
all_sentences=[]
sentences = [re.sub(pattern=r"[\!'#$%&\*+,-./:;<=>?@^_`()|~=]", 
                        repl='', 
                        string=x
                       ).lower().strip().split(' ') for x in biden.split('\n')]
sentences = [x for x in sentences if x != ['']]
sentences

[['mr',
  'president',
  'i',
  'am',
  'pleased',
  'that',
  'the',
  'senate',
  'today',
  'is',
  'passing',
  'the',
  'hatchbidenlautenberg',
  'substitute',
  'amendment',
  'to',
  'hr',
  '4164',
  'and',
  'i',
  'am',
  'hopeful',
  'that',
  'the',
  'other',
  'body',
  'will',
  'take',
  'up',
  'and',
  'pass',
  'the',
  'measure',
  'before',
  'congress',
  'adjourns',
  'for',
  'the',
  'year'],
 ['what',
  'this',
  'legislation',
  'does',
  'is',
  'simple',
  'under',
  'current',
  'federal',
  'law',
  'states',
  'must',
  'give',
  'full',
  'faith',
  'and',
  'credit',
  'to',
  'the',
  'child',
  'custody',
  'orders',
  'of',
  'another',
  'state',
  'a',
  'custody',
  'order',
  'is',
  'defined',
  'as',
  'including',
  'a',
  'visitation',
  'order',
  'however',
  'as',
  'evidence',
  'from',
  'around',
  'the',
  'country',
  'has',
  'shown',
  'state',
  'courts',
  'often',
  'do',
  'not',
  'automatically',
  'recognize',
  'visitation'

In [162]:
wordList = [re.sub("[^\w]", " ",  preprocessing_text(x)).split() for x in biden.split('\n')]

In [163]:
wordList

[['president',
  'pleased',
  'senate',
  'passing',
  'hatch',
  'biden',
  'lautenberg',
  'substitute',
  'amendment',
  'hr',
  '4164',
  'hopeful',
  'body',
  'pas',
  'measure',
  'congres',
  'adjourn'],
 ['legislation',
  'simple',
  'current',
  'federal',
  'law',
  'faith',
  'credit',
  'child',
  'custody',
  'custody',
  'defined',
  'including',
  'visitation',
  'evidence',
  'country',
  'shown',
  'court',
  'automatically',
  'recognize',
  'visitation',
  'particularly',
  'visitation',
  'child',
  'parent',
  'grandparent',
  'court',
  'supposed',
  'honor',
  'arduou',
  'proces',
  'getting'],
 ['legislation',
  'simply',
  'clarif',
  'faith',
  'credit',
  'law',
  'include',
  'visitation',
  'absolutely',
  'court',
  'visitation',
  'entered',
  'consistently',
  'provision',
  'federal',
  'faith',
  'credit',
  'statute',
  'faith',
  'credit',
  'narrow',
  'legal',
  'sense',
  'current',
  'federal',
  'law',
  'law',
  'explicit',
  'hopefully',
  '

In [100]:
biden=df.loc[df["speaker"] == "biden", "text_raw"].item()
biden = preprocessing_text(biden)

### The top ten words mentioned by Biden in the 105 congress

In [101]:
from collections import Counter
dict_counts = Counter(biden.split())
top10=sorted(dict_counts.items(), key=lambda kv: kv[1], reverse=True)[:10]
top10

[('president', 1545),
 ('senator', 1104),
 ('nato', 941),
 ('time', 689),
 ('united', 653),
 ('bill', 580),
 ('amendment', 531),
 ('nation', 500),
 ('senate', 476),
 ('colleague', 434)]

In [107]:
from gensim.models import Word2Vec
model = Word2Vec(sentences=wordList, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

### 10 most similar words generated for each top word 

In [108]:
most_similar_words=[]
for i in range(10):
    vector = model.wv[top10[i][0]]
    sim = model.wv.most_similar(top10[i][0], topn=10)
    most_similar_words.append(sim)

In [109]:
for i in range(10):
    print(biden_top[i],":", most_similar_words[i], '\n')

president : [('madam', 0.998906672000885), ('time', 0.9988546371459961), ('friend', 0.9988160133361816), ('colleague', 0.998753547668457), ('amendment', 0.998367428779602), ('addressed', 0.9980334043502808), ('quorum', 0.9975225925445557), ('parliamentary', 0.9970330595970154), ('carolina', 0.996863603591919), ('distinguished', 0.9962255954742432)] 

senator : [('minute', 0.9964781999588013), ('floor', 0.995944082736969), ('remainder', 0.9934358596801758), ('chair', 0.988111138343811), ('thank', 0.9877412915229797), ('distinguished', 0.985099196434021), ('unanimou', 0.9842931628227234), ('time', 0.9795732498168945), ('friend', 0.9785414934158325), ('rescinded', 0.9775055050849915)] 

nato : [('european', 0.9987754821777344), ('membership', 0.998211681842804), ('central', 0.9980128407478333), ('poland', 0.9979679584503174), ('economic', 0.9978041052818298), ('military', 0.997718334197998), ('western', 0.9976797103881836), ('germany', 0.9976334571838379), ('stability', 0.9976023435592651

### The most frequent bigrams in the text

In [164]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser
bigram = Phrases(wordList, min_count=1, threshold=2, delimiter=' ')
bigram_phraser = Phraser(bigram)
print(bigram_phraser)

FrozenPhrases<10810 phrases, min_count=1, threshold=2>


In [165]:
all_phrases=[]
for sent in wordList:
    tokens_ = bigram_phraser[sent]
    print(tokens_)
    all_phrases.append(tokens_)

['president pleased', 'senate passing', 'hatch biden', 'lautenberg', 'substitute amendment', 'hr', '4164', 'hopeful', 'body', 'pas', 'measure', 'congres', 'adjourn']
['legislation', 'simple', 'current federal', 'law', 'faith credit', 'child', 'custody', 'custody', 'defined', 'including', 'visitation', 'evidence', 'country', 'shown', 'court', 'automatically', 'recognize', 'visitation', 'particularly', 'visitation', 'child', 'parent', 'grandparent', 'court', 'supposed', 'honor', 'arduou', 'proces', 'getting']
['legislation', 'simply', 'clarif', 'faith credit', 'law include', 'visitation', 'absolutely', 'court', 'visitation', 'entered', 'consistently', 'provision', 'federal', 'faith credit', 'statute', 'faith credit', 'narrow', 'legal', 'sense', 'current federal', 'law law', 'explicit', 'hopefully', 'eliminate', 'hassle', 'obstacle', 'delay', 'confront', 'valid', 'visitation', 'federal law', 'followed']
['president', 'author', 'idea', 'representative', 'rob', 'andrew', 'jersey', 'deserve 

['president', 'senate begin', 'reconsideration', 'resolution ratification', 'enlargement', 'expansion', 'agree', 'strongly believe', 'mistake', 'whatever', 'scheduling', 'reason', 'brought', 'delayed', 'brought', 'hope', 'support', 'expansion oppose', 'communicated', 'respective', 'leader hope', 'intervene', 'consideration', 'debate', 'time', 'attention', 'detail', 'public', 'entitled']
['truth matter', 'folk', 'sitting', 'bar', 'devoted hundred', 'hour', 'deal', 'sudden', 'woke', 'morning', 'feel', 'expansion', 'expand', 'seriou debate', 'nato military', 'type', 'political', 'type', 'national government', 'nato nation', 'country', 'administration']
['nato observer', '25 senate', 'granted', 'unprecedented', 'acces', 'decisionmaking', 'proces', '28', 'bulk', 'participated', 'decision', 'invite invite', 'invite', 'join nato', 'europe', 'occasion', 'bet', 'half dozen', 'trip', 'nato', 'adviser', 'europe meet', 'military', 'existing nato', 'countr', 'aspirant', 'countr']
['spent time', 'co

['discussion', 'night', 'president adviser', 'convinced', 'slovenia', '1', 'candidate membership', 'round nato', 'enlargement', 'short time']
['logic', 'enlargement', 'inescapable', 'issue complex', 'remote', 'daily', 'live american', 'believe', 'critically', 'immediately', 'initiate', 'national debate', 'nato enlargement']
['foreign policy', 'matter', 'well', 'formulated', 'sustained informed', 'consent american', 'people', 'launch', 'national debate', 'explore', 'cost', 'obligation', 'benefit united', 'nato enlargement', 'chairman', 'hold hearing', 'senate foreign', 'relation committee', 'believe', 'essential', 'debate']
['meeting', 'non governmental', 'forum', 'country', 'likewise', 'essential', 'people understand', 'profound', 'importance issue']
['believe', 'examined', 'american people', 'support effort', 'enlarge alliance', 'build', 'european security', 'architecture']
['40', 'world war', 'ii', 'nato', 'bound', 'democrac western', 'europe north', 'america military', 'alliance', '

In [166]:
all_phrases = [item for sublist in all_phrases for item in sublist]
print(all_phrases)




In [167]:
bigrams=[]
for i in all_phrases:
    if len(i.split()) > 1:
        bigrams.append(i)

In [168]:
bigrams

['president pleased',
 'senate passing',
 'hatch biden',
 'substitute amendment',
 'current federal',
 'faith credit',
 'faith credit',
 'law include',
 'faith credit',
 'faith credit',
 'current federal',
 'law law',
 'federal law',
 'deserve credit',
 'issue attention',
 '1997 introduced',
 'finally thank',
 'senator willingnes',
 'bill final',
 'day session',
 'chairman staff',
 'pas bill',
 'omnibu appropriation',
 'piece legislation',
 'foreign policy',
 'produced bipartisan',
 'basi foreign',
 'relation committee',
 'foreign affair',
 'reform restructuring',
 'institutional structure',
 'funding foreign',
 'affair agenc',
 'legislation implement',
 'chemical weapon',
 'approved senate',
 'april 1997',
 'foreign affair',
 'reform restructuring',
 'original bill',
 'approved senate',
 'month ago',
 'unfortunately bill',
 'un arrear',
 'chairman agreed',
 'authorize payment',
 '926 million',
 'arrear united',
 'reform body',
 'senate approved',
 'helms biden',
 'vote 90',
 'voice vo

In [169]:
dict_counts = Counter(bigrams)
top10=sorted(dict_counts.items(), key=lambda kv: kv[1], reverse=True)[:10]
top10

[('chemical weapon', 113),
 ('foreign policy', 112),
 ('united nation', 94),
 ('nato enlargement', 91),
 ('yield floor', 84),
 ('nuclear weapon', 68),
 ('madam president', 67),
 ('foreign relation', 65),
 ('arm control', 60),
 ('world war', 51)]

In [170]:
for i in range(10):
    print(biden_top[i],":", most_similar_words[i], '\n')

president : [('madam', 0.998906672000885), ('time', 0.9988546371459961), ('friend', 0.9988160133361816), ('colleague', 0.998753547668457), ('amendment', 0.998367428779602), ('addressed', 0.9980334043502808), ('quorum', 0.9975225925445557), ('parliamentary', 0.9970330595970154), ('carolina', 0.996863603591919), ('distinguished', 0.9962255954742432)] 

senator : [('minute', 0.9964781999588013), ('floor', 0.995944082736969), ('remainder', 0.9934358596801758), ('chair', 0.988111138343811), ('thank', 0.9877412915229797), ('distinguished', 0.985099196434021), ('unanimou', 0.9842931628227234), ('time', 0.9795732498168945), ('friend', 0.9785414934158325), ('rescinded', 0.9775055050849915)] 

nato : [('european', 0.9987754821777344), ('membership', 0.998211681842804), ('central', 0.9980128407478333), ('poland', 0.9979679584503174), ('economic', 0.9978041052818298), ('military', 0.997718334197998), ('western', 0.9976797103881836), ('germany', 0.9976334571838379), ('stability', 0.9976023435592651

yes, tokens in bigrams appear in the list of most similar words. In "madam president" bigram the words "madam" is similar to word "president", in "united nation" bigram the word "united" is similar to word "nation", etc.