In [94]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.deprecated.doc2vec import LabeledSentence
from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import STOPWORDS

import pandas as pd
import numpy as np
import string
import re
import random 

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn import metrics
#from sklearn.grid_search import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix, precision_recall_curve
#from sklearn.metrics import plot_precision_recall_curve
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils.multiclass import unique_labels

In [57]:
 # load data
dat = pd.read_csv('NEW_all_paragraphs.csv')
label_dat = pd.read_csv('annotated_par.csv')

# merge and clean up the annotated data 
merged = label_dat.merge(dat, how = "left", on = "text")
# merged = merged[merged['Speech_id_x'] == merged['Speech_id_y']] 
del merged['Speech_id_y']
del merged['par_id_x']

merged = merged.drop_duplicates(subset = ["Speech_id_x", "par_id_y"])
len(merged['text'].unique())

duplicate = merged[merged.duplicated('text')] 
merged = merged.drop_duplicates(subset = ["text"])

del merged["Unnamed: 0_x"]
del merged["Unnamed: 0_y"]
del merged["party_y"]
del merged["term_y"]
del merged["comp_y"]

pd.options.display.max_colwidth = 200
merged[merged['text'].str.contains('Audience') ] # check where the audience reaction is 
merged = merged[~merged['id_column'].isin([51, 350])] # delete audience reactions

merged.to_csv("annotated_par.csv")

# merged = label_dat[['text','label','annotated_at', 'Speech_id','party','term','comp', 'par_id']]
# merged.to_csv("annotated_par.csv")
# merged

# Clean Data

In [3]:
porter = PorterStemmer()
punctuation_dictionary = {s:None for s in list(string.punctuation)}
#punctuation_dictionary["-"] = "_"

punctuation_translator = str.maketrans(punctuation_dictionary)

def text_cleaner(text, punctuation_translator, stemmer):
    text = str(text).translate(punctuation_translator)
    text = text.lower()
    text = re.split(' ',text)
    text = [porter.stem(word) for word in text]
    text = " ".join(text)
    return(text)

dat["clean_text"] = dat["text"].apply(lambda x: text_cleaner(x, punctuation_translator, porter))


In [8]:
dat['Unnamed: 0'] = range(0,len(dat))
#dat.columns.values[0] = 'id'
dat = dat.rename(columns={"Unnamed: 0": "id"})

In [9]:
dat

Unnamed: 0,id,Speech_id,text,party,term,comp,par_id,clean_text
0,0,2682,"When we start talking about the economy, it's ...",rep,1980,False,1,when we start talk about the economi it best t...
1,1,2682,It's no secret which groups are hit the hardes...,rep,1980,False,2,it no secret which group ar hit the hardest by...
2,2,2682,How can our elderly who have worked so hard to...,rep,1980,True,3,how can our elderli who have work so hard to e...
3,3,2682,I believe that social security is one of this ...,rep,1980,True,4,i believ that social secur is on of thi nation...
4,4,2682,"In contrast, I am committed to an economic pro...",rep,1980,True,5,in contrast i am commit to an econom program t...
...,...,...,...,...,...,...,...,...
72374,72374,2000-08-17-national-convention-los,"I know my own imperfections. For example, I kn...",dem,2000,False,85,i know my own imperfect for exampl i know that...
72375,72375,2000-08-17-national-convention-los,But the presidency... Audience. No.,dem,2000,True,86,but the presid audienc no
72376,72376,2000-08-17-national-convention-los,Vice President Gore. But the presidency is mor...,dem,2000,True,87,vice presid gore but the presid is more than a...
72377,72377,2000-08-17-national-convention-los,There are big choices ahead and our whole futu...,dem,2000,True,88,there ar big choic ahead and our whole futur i...


In [10]:
phrases1 = Phrases(map(lambda x: x.split(), dat["clean_text"].tolist())) #bigram
phrases2 = Phrases(phrases1[map(lambda x: x.split(), dat["clean_text"].tolist())]) #trigram
dat["phrased_text"] = dat["clean_text"].apply(lambda x: " ".join(phrases2[phrases1[x.split()]]))

In [1]:
dat['phrase_text'][0] #check result

NameError: name 'dat' is not defined

# Doc2Vec

In [11]:
docs = list(zip(dat["phrased_text"].tolist(), dat["id"].tolist()))


## Define an iterator to feed documents and tags to Doc2Vec
class Sentences(object):
    def __init__(self, docs):
        self.docs = docs
    def __iter__(self):
        for doc in self.docs:
            yield TaggedDocument(words=str(doc[0]).split(), tags=[doc[1]])

## Train and save models
model = Doc2Vec(Sentences(docs), vector_size=150, window=10, min_count=5, negative=10, epochs=20, dm=0, dbow_words=1)

In [12]:
model.save("doc2vec_wordvecs.model")


In [14]:
model.wv.most_similar('trump') #check the model


[('donald_trump', 0.6958023905754089),
 ('hillari_clinton', 0.5239334106445312),
 ('pizza', 0.4897196590900421),
 ('–', 0.48754024505615234),
 ('obama', 0.4846065640449524),
 ('warren_buffett', 0.4736255705356598),
 ('ok', 0.47107744216918945),
 ('stif', 0.4571569561958313),
 ('til', 0.456119179725647),
 ('donald', 0.452963262796402)]

In [None]:
model = Word2Vec.load("doc2vec_wordvecs.model") 

# Classifier 


In [106]:
# 
annot_dat = pd.read_csv('annotated_par.csv')
annot_dat2= pd.read_csv('annotated_par_populism_bart.csv')
annot_dat2 = annot_dat2.rename(columns = {"Speech_id_x":"Speech_id", "party_x":'party', "term_x":"term", "comp_x":"comp", "par_id_y":"par_id"})
all_dat = pd.read_csv('20201115_all_paragraphs.csv')

annot_dat = annot_dat.append(annot_dat2, ignore_index = True)
annot_dat.label.replace('populist',1, inplace=True)
annot_dat.label.replace('not populist',0, inplace=True)
#label_dat.label.value_counts()

# generate test and training set 

annot_dat['test'] = np.random.choice([0, 1], size = len(annot_dat), p = [0.8,0.2])
clas_dat = pd.merge(all_dat, annot_dat[['Speech_id', 'par_id','label', 'test']], how='left', on=['Speech_id','par_id'])

clas_dat

Unnamed: 0.1,Unnamed: 0,Speech_id,text,party,term,comp,populist_old_keywords,par_id,label,test
0,0,2682,"When we start talking about the economy, it's best that we get right to the point. Jimmy Carter promised the American people he would give them an inflation rate of four percent and an unemploymen...",rep,1980,False,True,1,,
1,1,2682,"It's no secret which groups are hit the hardest by Mr. Carter's inflationary policies. The elderly in America have become prisoners of his totally inadequate leadership. Day by day, they are remin...",rep,1980,False,False,2,,
2,2,2682,How can our elderly who have worked so hard to enjoy their retirement years be expected to survive on limited incomes while this president and this administration do all they can to run away from ...,rep,1980,True,False,3,,
3,3,2682,I believe that social security is one of this nation's most vital commitments to our senior citizens. I will preserve and strengthen this fundamental contract between the American people and their...,rep,1980,True,True,4,,
4,4,2682,"In contrast, I am committed to an economic program to reduce inflation and put people back to work. And I would veto any attempt to tax social security benefits. But there is another injustice don...",rep,1980,True,False,5,,
...,...,...,...,...,...,...,...,...,...,...
71822,72341,2000-08-17-national-convention-los,"And I ask all of you, my fellow citizens, from this city, that marked both the end of America's journey westward and the beginning of the New Frontier, let us set out on a new journey to the best ...",dem,2000,True,False,84,,
71823,72342,2000-08-17-national-convention-los,"I know my own imperfections. For example, I know that sometimes people say I'm too serious, that I talk too much substance and policy. Maybe I've done that tonight.",dem,2000,False,False,85,,
71824,72344,2000-08-17-national-convention-los,"Vice President Gore. But the presidency is more than a popularity contest, it's a day-by-day fight for people. Sometimes you have to choose to do what's difficult or unpopular. Sometimes you have ...",dem,2000,True,False,87,,
71825,72345,2000-08-17-national-convention-los,"There are big choices ahead and our whole future is at stake. And I do have strong beliefs about it. If you entrust me with the presidency, I know I won't always be the most exciting politician. B...",dem,2000,True,False,88,,


In [109]:
clas_dat['total_id'] = clas_dat['Speech_id'].astype(str)  + '_' + clas_dat['par_id'].astype(str) 

label_dat = clas_dat[["total_id","label","test"]]


In [110]:
X_test = np.asarray([model.docvecs[i] for i in label_dat[label_dat.test==1].index.tolist()])
Y_test = np.asarray(label_dat[label_dat.test==True].label.tolist(), dtype="int")

print(X_test.shape)
print(Y_test)

(200, 150)
[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0]


In [111]:
X_training = np.asarray([model.docvecs[i] for i in label_dat[label_dat.test==False].index.tolist()])
Y_training = np.asarray(label_dat[label_dat.test==False].label.tolist(), dtype="int")
print(X_training.shape)
print(Y_training)

(791, 150)
[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0

In [112]:
gbc = RandomForestClassifier(n_estimators=5000, max_depth=10, random_state=0, class_weight="balanced")
gbc = CalibratedClassifierCV(gbc, cv=5, method="sigmoid")
gbc.fit(X_training, Y_training)
# gbc.fit(X, Y)

preds = gbc.predict_proba(X_test)
fpr_d2v, tpr_d2v, thresholds_d2v = metrics.roc_curve(Y_test, preds[:,1], pos_label=1)
accuracy_d2v = metrics.accuracy_score(Y_test, gbc.predict(X_test), normalize=True)

print("AUC: "+ str(metrics.auc(fpr_d2v, tpr_d2v)))
print("Accuracy: " + str(accuracy_d2v))

AUC: 0.6892307692307693
Accuracy: 0.975


Unnamed: 0.1,Unnamed: 0,text,label,annotated_at,Speech_id,party,term,comp,par_id
0,0,"The future is my responsibility and it's yours. I've only got one life on this earth to live. My integrity, my honesty, my word of honor--it's precious to me. And there would be no way that I coul...",not populist,11/4/20 15:28,2600,dem,1980,False,8.0
1,1,"I ask you to help us make new history and I ask you to help build on the record that we have already established, to build better, to build broader, to build more meaningfully, and I am going to d...",not populist,11/4/20 15:29,2005,dem,1968,True,27.0
2,2,"But, getting back to the issues before us, I suggest that we consider them tonight in terms of what you want from your government in Washington, D.C. I think the first thing that everybody would s...",not populist,11/4/20 15:29,1871,rep,1960,True,3.0
3,3,"It used to be in America all the racial issues were black and white. Now, like everything else in life, it's hard to see black and white. That's another reason we need to show up in church, to be ...",not populist,11/4/20 15:29,3256,dem,1996,False,9.0
4,4,"Mr. Lamb. Yes, it's a legal term. The President. Legal term, yes. You and I aren't lawyers.",not populist,11/4/20 15:30,2004-09-07-discussion-sedalia-missouri,rep,2004,True,47.0
...,...,...,...,...,...,...,...,...,...
492,766,"Ten months ago, in a tragic moment, I was called upon to assume the awesome responsibilities of president of this country. And I told you that afternoon, after I took the oath of office in Air For...",not populist,11/13/20 9:49,1922,dem,1964,True,21.0
493,767,"He says he cares about the middle class, but he boasts, ""I have consistently supported legislation, time after time, which increases taxes on my own constituents."" Doesn't that make you just want ...",not populist,11/13/20 9:49,2868,rep,1984,False,14.0
494,768,"We have got to have a tough plan to bring health care cost in line with inflation and provide basic health care to all of our people, and the only way you'll ever get that is to vote for Bill Clin...",not populist,11/13/20 9:49,3004,dem,1992,True,31.0
495,769,"The constant partisan rancor that stops us from solving these problems in Washington isn't a cause, it's a symptom. It's what happens when people go to Washington to work for themselves and not yo...",populist,11/13/20 9:50,2008-10-01-remarks-independence-missouri,rep,2008,True,16.0


Unnamed: 0.1,Unnamed: 0,text,label,annotated_at,Speech_id,party,term,comp,par_id,test
0,0,"The fact of the matter is that we find in the United States today the wage earners, looking over the past three years, have had an increase of between $11 and $14 a week in their pay checks, but a...",0,2020-11-02 19:47:00,2192,rep,1968,True,27.0,0
1,1,"The first thing you do Tuesday morning is get up and go and exercise the privilege that so many people in the world don't have--a private, a secret ballot. The citizens all over America will be d...",0,2020-11-02 19:47:35,1987,dem,1964,False,25.0,0
2,2,"Governor Clinton is talking about ""Well, we really need change."" He wants to put the White House in the same hands of the big spenders in Congress. The last time we had this, do you remember what ...",0,2020-11-02 19:50:18,3156,rep,1992,False,11.0,0
3,3,"In the meantime, wages have been raised and the cost of living has been extraordinarily stable in the last three years. It is true that you can say technically the cost of living is high. It has g...",0,2020-11-02 20:00:33,1402,rep,1956,False,18.0,0
4,4,"And when people need it, they're smart enough to figure out what they need, I believe; that's the reason these institutions are open. If nobody had enough sense to come to them, they would shut do...",0,2020-11-02 20:02:31,3226,dem,1996,False,41.0,0
...,...,...,...,...,...,...,...,...,...,...
987,766,"Ten months ago, in a tragic moment, I was called upon to assume the awesome responsibilities of president of this country. And I told you that afternoon, after I took the oath of office in Air For...",0,11/13/20 9:49,1922,dem,1964,True,21.0,0
988,767,"He says he cares about the middle class, but he boasts, ""I have consistently supported legislation, time after time, which increases taxes on my own constituents."" Doesn't that make you just want ...",0,11/13/20 9:49,2868,rep,1984,False,14.0,0
989,768,"We have got to have a tough plan to bring health care cost in line with inflation and provide basic health care to all of our people, and the only way you'll ever get that is to vote for Bill Clin...",0,11/13/20 9:49,3004,dem,1992,True,31.0,0
990,769,"The constant partisan rancor that stops us from solving these problems in Washington isn't a cause, it's a symptom. It's what happens when people go to Washington to work for themselves and not yo...",1,11/13/20 9:50,2008-10-01-remarks-independence-missouri,rep,2008,True,16.0,0
