In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.deprecated.doc2vec import LabeledSentence
from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import STOPWORDS
import pandas as pd
import numpy as np
import string
import re

In [2]:
 # load data
# Paragraph-level corpus; the cleaning cell below reads its `text` column.
dat = pd.read_csv('NEW_all_paragraphs.csv')

# Clean Data

In [3]:
# Shared stemmer instance handed to text_cleaner below.
porter = PorterStemmer()

# Map every character in string.punctuation to None so that str.translate() deletes it.
punctuation_dictionary = dict.fromkeys(string.punctuation)
# Disabled alternative: keep hyphens as underscores -> punctuation_dictionary["-"] = "_"

punctuation_translator = str.maketrans(punctuation_dictionary)

def text_cleaner(text, punctuation_translator, stemmer):
    """Normalise one paragraph: strip punctuation, lower-case, and stem every token.

    Parameters
    ----------
    text : object
        Raw paragraph. It is coerced with ``str()``, so non-string cells
        (e.g. a float NaN becomes the token ``"nan"``) do not raise.
    punctuation_translator : dict
        Translation table, as produced by ``str.maketrans``, passed to
        ``str.translate``.
    stemmer : object
        Any object exposing ``stem(word) -> str``.

    Returns
    -------
    str
        The stemmed tokens joined with single spaces. Tokens are produced by
        splitting on the literal space character, so runs of spaces in the
        input survive as empty tokens (i.e. as repeated spaces in the output).
    """
    lowered = str(text).translate(punctuation_translator).lower()
    # Use the stemmer that was passed in. The previous body ignored this
    # argument and silently depended on the module-level `porter` instance.
    return " ".join(stemmer.stem(token) for token in lowered.split(" "))

# Run every raw paragraph through text_cleaner and store the result in `clean_text`;
# the translation table and stemmer are forwarded as extra positional arguments.
dat["clean_text"] = dat["text"].apply(text_cleaner, args=(punctuation_translator, porter))


In [8]:
# Give every paragraph a sequential integer id (0 .. len(dat) - 1); the Doc2Vec cell
# below uses it as the document tag.
# 'Unnamed: 0' (presumably the index column written out by an earlier to_csv — confirm)
# is renamed to 'id' only while 'id' does not exist yet. Re-running this cell therefore
# overwrites 'id' in place instead of appending a second, duplicate 'id' column.
if "id" not in dat.columns:
    dat = dat.rename(columns={"Unnamed: 0": "id"})
dat["id"] = range(len(dat))

In [9]:
# Display the frame to confirm the `id` and `clean_text` columns are present.
dat

Unnamed: 0,id,Speech_id,text,party,term,comp,par_id,clean_text
0,0,2682,"When we start talking about the economy, it's ...",rep,1980,False,1,when we start talk about the economi it best t...
1,1,2682,It's no secret which groups are hit the hardes...,rep,1980,False,2,it no secret which group ar hit the hardest by...
2,2,2682,How can our elderly who have worked so hard to...,rep,1980,True,3,how can our elderli who have work so hard to e...
3,3,2682,I believe that social security is one of this ...,rep,1980,True,4,i believ that social secur is on of thi nation...
4,4,2682,"In contrast, I am committed to an economic pro...",rep,1980,True,5,in contrast i am commit to an econom program t...
...,...,...,...,...,...,...,...,...
72374,72374,2000-08-17-national-convention-los,"I know my own imperfections. For example, I kn...",dem,2000,False,85,i know my own imperfect for exampl i know that...
72375,72375,2000-08-17-national-convention-los,But the presidency... Audience. No.,dem,2000,True,86,but the presid audienc no
72376,72376,2000-08-17-national-convention-los,Vice President Gore. But the presidency is mor...,dem,2000,True,87,vice presid gore but the presid is more than a...
72377,72377,2000-08-17-national-convention-los,There are big choices ahead and our whole futu...,dem,2000,True,88,there ar big choic ahead and our whole futur i...


In [10]:
# Tokenise each cleaned paragraph once and reuse the token lists for both phrase passes.
tokenized_paragraphs = [paragraph.split() for paragraph in dat["clean_text"].tolist()]

phrases1 = Phrases(tokenized_paragraphs)  # bigram
phrases2 = Phrases(phrases1[tokenized_paragraphs])  # trigram

# Apply the bigram model, then the trigram model, and store the re-joined tokens.
dat["phrased_text"] = [
    " ".join(phrases2[phrases1[tokens]]) for tokens in tokenized_paragraphs
]

In [1]:
# Spot-check the phrase-merged text of the first paragraph.
# The phrase-detection cell stores its result in 'phrased_text'; 'phrase_text' does not exist.
dat['phrased_text'].iloc[0]

NameError: name 'dat' is not defined

# Doc2Vec

In [11]:
# Pair each phrased paragraph with its id: [(phrased_text, id), ...].
docs = [
    (phrased, doc_id)
    for phrased, doc_id in zip(dat["phrased_text"].tolist(), dat["id"].tolist())
]


## Define an iterator to feed documents and tags to Doc2Vec
class Sentences(object):
    """Re-iterable corpus wrapper that turns (text, tag) pairs into TaggedDocuments.

    Each iteration starts from the beginning of ``docs``, so the object can be
    traversed more than once.
    """

    def __init__(self, docs):
        # Sequence of indexable pairs: element 0 is the text, element 1 is the tag.
        self.docs = docs

    def __iter__(self):
        for entry in self.docs:
            # str() guards against non-string text; tokens are whitespace-separated.
            tokens = str(entry[0]).split()
            yield TaggedDocument(words=tokens, tags=[entry[1]])

## Train and save models
model = Doc2Vec(
    Sentences(docs),
    vector_size=150,  # dimensionality of the document / word vectors
    window=10,        # context window size
    min_count=5,      # ignore tokens seen fewer than 5 times
    negative=10,      # negative samples per positive example
    epochs=20,        # passes over the corpus
    dm=0,             # per the gensim docs: 0 selects PV-DBOW
    dbow_words=1,     # per the gensim docs: also train word vectors alongside the doc vectors
)

In [12]:
# Persist the trained model under a relative path so it can be reloaded later.
model.save("doc2vec_wordvecs.model")


In [14]:
model.wv.most_similar('trump') # sanity-check the model: nearest word-vector neighbours of the token 'trump'


[('donald_trump', 0.6958023905754089),
 ('hillari_clinton', 0.5239334106445312),
 ('pizza', 0.4897196590900421),
 ('–', 0.48754024505615234),
 ('obama', 0.4846065640449524),
 ('warren_buffett', 0.4736255705356598),
 ('ok', 0.47107744216918945),
 ('stif', 0.4571569561958313),
 ('til', 0.456119179725647),
 ('donald', 0.452963262796402)]

In [None]:
# Reload the model written by the save cell. That file was produced by a Doc2Vec
# instance, so it has to be loaded through Doc2Vec.load rather than Word2Vec.load.
model = Doc2Vec.load("doc2vec_wordvecs.model")

In [45]:

# Encode the annotation labels as integers: 'populist' -> 1, 'not populist' -> 0.
# The result is assigned back to the column instead of calling
# `dat.label.replace(..., inplace=True)`: that form mutates an intermediate Series
# (chained in-place assignment), which pandas warns about and which leaves `dat`
# unchanged under Copy-on-Write. `replace` matches whole values, so mapping both
# labels in one call is equivalent to the two sequential calls.
dat["label"] = dat["label"].replace({'populist': 1, 'not populist': 0})
dat["label"].value_counts()

0    481
1     19
Name: label, dtype: int64

In [46]:
# NOTE(review): the frame displayed here carries `label` / `annotated_at` columns and only
# a few hundred rows, which no cell above produces — presumably `dat` was re-loaded from an
# annotated sample in a cell that is no longer in the notebook; confirm and restore that step.
dat

Unnamed: 0.1,Unnamed: 0,id_column,text,label,annotated_at,par_id,Speech_id,party,term,comp,clean_text,phrased_text
0,0,0,The fact of the matter is that we find in the ...,0,2020-11-02 19:47:00,0,2192,rep,1968,True,the fact of the matter is that we find in the ...,the fact of the matter is that we find in the_...
1,1,1,The first thing you do Tuesday morning is get ...,0,2020-11-02 19:47:35,1,1987,dem,1964,False,the first thing you do tuesdai morn is get up ...,the first thing you do tuesdai morn is get up ...
2,2,2,"Governor Clinton is talking about ""Well, we re...",0,2020-11-02 19:50:18,2,3156,rep,1992,False,governor clinton is talk about well we realli ...,governor_clinton is talk_about well we realli ...
3,3,3,"In the meantime, wages have been raised and th...",0,2020-11-02 20:00:33,3,1402,rep,1956,False,in the meantim wage have been rais and the cos...,in the meantim wage have_been rais and the cos...
4,4,4,"And when people need it, they're smart enough ...",0,2020-11-02 20:02:31,4,3226,dem,1996,False,and when peopl need it theyr smart enough to f...,and when peopl need it theyr smart enough to f...
...,...,...,...,...,...,...,...,...,...,...,...,...
495,498,495,You cannot be pro-doctor and pro-patient and p...,0,2020-11-03 11:24:00,495,2004-10-26-dubuque-iowa-0,rep,2004,False,you cannot be prodoctor and propati and proper...,you cannot be prodoctor and propati and proper...
496,499,496,"We are not only, as I say, a moral countr...",0,2020-11-03 11:24:39,496,1076,dem,1952,False,we ar not onli as i sai a moral countri ...,we_ar not_onli as i sai a moral countri thi is...
497,500,497,The housing--if there is any state in the unio...,0,2020-11-03 11:25:13,497,1659,dem,1960,False,the housingif there is ani state in the union ...,the housingif there_is ani state in the union ...
498,501,498,Don't believe for a second this election is ov...,0,2020-11-03 11:25:42,498,2008-11-03-jacksonville-florida-1,dem,2008,False,dont believ for a second thi elect is over don...,dont believ for a second thi_elect is over don...
