In [202]:
import pandas as pd
import numpy as np
import string
import re
import random 
import os
import csv

from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from gensim.parsing.porter import PorterStemmer

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline


from imblearn.ensemble import BalancedRandomForestClassifier

import nltk
# nltk.download('wordnet')

## for word embedding
import gensim
import gensim.downloader as gensim_api
from numpy import asarray
from numpy import savetxt
from numpy import loadtxt


# Tf-idf RF

In [203]:
# Tf-idf + random-forest baseline, scored on every held-out test split.
auc_scores = []  # ROC AUC, one entry per test split
pr_auc = []      # precision-recall AUC, one entry per test split

# Directory containing one CSV per held-out test split.
test_dir = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_auth/"

dat = pd.read_csv("20220115_all_paragraphs_2020_added.csv")
annotated1 = pd.read_csv("All_annotated_data_round_1_auth.csv")
annotated2 = pd.read_csv("All_annotated_data_round_2_auth.csv")
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
annotated = pd.concat([annotated1, annotated2])
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")

# os.listdir returns entries in arbitrary order; sorting keeps the order of the
# per-split scores stable across runs.
for filename in sorted(os.listdir(test_dir)):
    ## load the test set
    file = os.path.join(test_dir, filename)
    test = pd.read_csv(file)
    # train on every annotated paragraph that is not part of this test split
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    # make sure annotated data and the original df match
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])]
    # pipeline BOW, tfidf and RF
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', RandomForestClassifier(n_estimators=5000, max_depth=3, max_features='sqrt', random_state=0, class_weight="balanced")),
                         ])
    ## Fit the model to the training set (labels come from `pred_class`)
    text_clf = text_clf.fit(train['text'], train['pred_class'])
    ## Predict out-of-sample on the test set (labels come from `true_class`) and compute AUC
    preds = text_clf.predict_proba(test['text'])
    fpr, tpr, _ = metrics.roc_curve(test['true_class'], preds[:, 1], pos_label=1)
    roc_auc = metrics.auc(fpr, tpr)
    auc_scores.append(roc_auc)
    # PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(test['true_class'], preds[:, 1])
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc.append(lr_auc)
    print("AUC: " + str(roc_auc))
    print("PR AUC: " + str(lr_auc))

print("Mean AUC: " + str(np.mean(auc_scores)))
print("Mean PR AUC: " + str(np.mean(pr_auc)))

# `test` and `text_clf` are whatever the final loop iteration left behind, so
# this is the confusion matrix of the last split only.
confusion = confusion_matrix(test['true_class'], text_clf.predict(test['text']))
print(confusion)

  interactivity=interactivity, compiler=compiler, result=result)


AUC: 0.9931972789115646
PR AUC: 0.5674242424242424
AUC: 0.8945578231292517
PR AUC: 0.15192972141472239
AUC: 0.9965986394557823
PR AUC: 0.5041666666666667
AUC: 0.8090909090909092
PR AUC: 0.575508716251412
AUC: 0.8497732426303855
PR AUC: 0.08748819396240921
AUC: 0.9642857142857143
PR AUC: 0.19059068333832482
AUC: 0.9778911564625851
PR AUC: 0.19095441595441595
AUC: 0.9778911564625851
PR AUC: 0.4058531746031746
AUC: 0.8441043083900227
PR AUC: 0.2768570150075726
AUC: 0.9937641723356009
PR AUC: 0.5583333333333333
AUC: 0.973922902494331
PR AUC: 0.20149711399711398
AUC: 0.9801587301587301
PR AUC: 0.28397177419354835
AUC: 0.9897727272727272
PR AUC: 0.3092948717948718
AUC: 0.764172335600907
PR AUC: 0.088080600293397
AUC: 0.8180272108843537
PR AUC: 0.32822180208272617
AUC: 0.884920634920635
PR AUC: 0.2718755787963547
AUC: 0.8477272727272727
PR AUC: 0.07983657473672641
AUC: 0.9909297052154196
PR AUC: 0.31117424242424246
AUC: 0.9931972789115646
PR AUC: 0.5354166666666667
AUC: 0.8985260770975056
PR 

In [204]:
# Persist the per-split scores of the tf-idf model, one value per row.
# newline="" is what the csv module requires of files handed to csv.writer;
# without it, platforms that translate "\n" produce an extra blank row after
# every record.
with open("auth_pr_auc_tfidf.csv", "w", newline="") as csvfile:
    csv.writer(csvfile).writerows([score] for score in pr_auc)

with open("auth_auc_tfidf.csv", "w", newline="") as csvfile:
    csv.writer(csvfile).writerows([score] for score in auc_scores)

# Word2Vec RF

In [143]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Normalise a piece of text and return it as a single space-joined string.

    Steps, in order: cast to ``str``, lower-case, strip, drop every character
    that is neither a word character nor whitespace, split on whitespace,
    optionally remove stop words, optionally Porter-stem, optionally
    WordNet-lemmatise.

    Parameters
    ----------
    text : object
        Input text; it is passed through ``str()`` first, so non-string values
        (e.g. ``None``) are processed as their string representation.
    flg_stemm : bool
        Apply NLTK's ``PorterStemmer`` to every token.
    flg_lemm : bool
        Apply NLTK's ``WordNetLemmatizer`` to every token (after stemming when
        both flags are set).
    lst_stopwords : iterable of str, optional
        Tokens to drop. ``None`` disables stop-word removal.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## remove Stopwords
    if lst_stopwords is not None:
        # A set gives constant-time membership tests and, unlike a bare
        # `in` on the argument, also behaves correctly for one-shot iterables.
        # NOTE(review): punctuation is stripped before this comparison, so
        # stop words that contain an apostrophe (e.g. "don't") can never match
        # the cleaned token ("dont") — confirm whether that is intended.
        stopwords = frozenset(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    return " ".join(lst_text)

# English stop-word list shipped with NLTK.
lst_stopwords = nltk.corpus.stopwords.words("english")

# Clean every paragraph: stop words removed, then stemmed and lemmatised.
dat["text_clean"] = [
    utils_preprocess_text(raw_text, flg_stemm=True, flg_lemm=True, lst_stopwords=lst_stopwords)
    for raw_text in dat["text"]
]


In [144]:
# Sanity check: number of cleaned paragraphs.
dat["text_clean"].shape

(75978,)

In [145]:
# Whitespace-tokenise every cleaned paragraph once; both phrase passes reuse it.
token_lists = [paragraph.split() for paragraph in dat["text_clean"].tolist()]

# First pass learns two-token collocations, the second pass runs on the
# bigram-merged stream so it can join a bigram with a further token.
phrases1 = Phrases(token_lists)  # bigram
phrases2 = Phrases(phrases1[token_lists])  # trigram

# Apply both models to each paragraph and store the result as a string again.
dat["phrased_text"] = [" ".join(phrases2[phrases1[tokens]]) for tokens in token_lists]

In [146]:
# Preview the phrase-merged text (detected phrases are joined with "_").
dat['phrased_text'].head()

0    start talk economi best get right point jimmi_...
1    secret group hit_hardest mr_carter inflationar...
2    elderli work_hard enjoy retir year expect surv...
3    believ social_secur one nation vital commit se...
4    contrast commit econom program reduc inflat pu...
Name: phrased_text, dtype: object

In [147]:
# Tokenise the phrase-merged text; each cell of the new column is a list of tokens.
dat['phrased_tokens'] = dat['phrased_text'].map(lambda phrased: nltk.word_tokenize(phrased))

# Train a Word2Vec (averaged within paragraphs)

In [72]:
## fit Word2Vec model
# Trained on the phrase-merged token lists with 150-dimensional vectors and a
# context window of 10; min_count=10 and negative=10 are passed through to
# gensim unchanged.
# NOTE(review): `size=` is the gensim < 4.0 keyword (renamed `vector_size` in
# gensim 4) — confirm the installed version before re-running.
# NOTE(review): no seed/workers are set, so retraining may not reproduce the
# saved vectors exactly — confirm whether that matters.
nlp = gensim.models.word2vec.Word2Vec(dat['phrased_tokens'], size=150, window=10, min_count=10, negative=10)

In [113]:
# check model
# NOTE: this is not the last expression of the cell, so the neighbours of
# 'tax' are computed but not displayed.
nlp.wv.most_similar('tax') 

# save model (reloaded by the embedding-averaging cell)
nlp.save("word2vec_wordvecs.model")

In [175]:
# Build one embedding per paragraph by averaging the Word2Vec vectors of its
# tokens. Result: matrix of shape (number of paragraphs, embedding_dim).
embedding_dim = 150  # must match the dimensionality the model was trained with

# load model
nlp = Word2Vec.load("word2vec_wordvecs.model")

avg_embeddings = np.zeros((len(dat['phrased_tokens']), embedding_dim))

# Iterate by position so that row i of the matrix is always the i-th paragraph,
# independent of the DataFrame's index labels.
for position, tokens in enumerate(dat['phrased_tokens']):
    if len(tokens) == 0:
        # Averaging zero rows would produce NaN (and a RuntimeWarning); keep
        # the all-zero vector for empty paragraphs instead.
        continue
    # number of tokens x embedding_dim
    tokens_embedding = np.zeros((len(tokens), embedding_dim))
    for i, token in enumerate(tokens):
        try:
            # `nlp.wv[...]` is the supported lookup; indexing the model object
            # directly is deprecated in gensim 3 and removed in gensim 4.
            tokens_embedding[i] = nlp.wv[token]
        except KeyError:
            # Token is not in the model vocabulary: its row stays at zero and
            # still counts towards the average, as in the original analysis.
            pass
    avg_embeddings[position] = np.average(tokens_embedding, axis=0)  # average within para


        



In [178]:
# NOTE: the shape is not the last expression of the cell, so it is not displayed.
avg_embeddings.shape
# Persist the paragraph embeddings; the random-forest cell reloads this file.
savetxt('avg_embeddings.csv', avg_embeddings, delimiter=',')


# Local Word2Vec RF

In [197]:
# Random forest on the locally trained, paragraph-averaged Word2Vec embeddings,
# scored on every held-out test split.
auc_scores = []  # ROC AUC, one entry per test split
pr_auc = []      # precision-recall AUC, one entry per test split

# Directory containing one CSV per held-out test split.
test_dir = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_auth/"

dat = pd.read_csv("20220115_all_paragraphs_2020_added.csv")

annotated1 = pd.read_csv("All_annotated_data_round_1_auth.csv")
annotated2 = pd.read_csv("All_annotated_data_round_2_auth.csv")
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
annotated = pd.concat([annotated1, annotated2])
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
avg_embeddings = loadtxt('avg_embeddings.csv', delimiter=',')
# Embedding rows are looked up by dat's index labels below, so both must
# describe the same rows (a merge that duplicated rows would shift them).
assert len(dat) == len(avg_embeddings), "dat and avg_embeddings have different numbers of rows"

# os.listdir returns entries in arbitrary order; sorting keeps the order of the
# per-split scores stable across runs.
for filename in sorted(os.listdir(test_dir)):
    ## load the test set
    file = os.path.join(test_dir, filename)
    test = pd.read_csv(file)
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    # make sure annotated data and the original df match
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])]
    # attach train/test set to original df
    train_id = train['speech_par_id']
    test_id = test['speech_par_id']
    dat['test'] = np.where(dat['speech_par_id'].isin(test_id), 1, 0)
    dat['train'] = np.where(dat['speech_par_id'].isin(train_id), 1, 0)
    # Select features and labels from the same rows of dat so their order matches.
    train_rows = dat[dat['train'] == 1]
    test_rows = dat[dat['test'] == 1]
    train_vec = avg_embeddings[train_rows.index.tolist()]
    test_vec = avg_embeddings[test_rows.index.tolist()]
    ## Fit the model to the training set
    text_clf = RandomForestClassifier(n_estimators=500, max_depth=3, random_state=0, max_features = "sqrt", class_weight="balanced")
    text_clf.fit(train_vec, train_rows.pred_class)
    ## Predict out-of-sample on the test set and compute AUC
    # NOTE(review): labels here come from the merged `pred_class` column, while
    # the tf-idf cell scores against `true_class` of the split file — confirm
    # the two are interchangeable.
    preds = text_clf.predict_proba(test_vec)
    fpr, tpr, _ = metrics.roc_curve(test_rows.pred_class, preds[:, 1], pos_label=1)
    roc_auc = metrics.auc(fpr, tpr)
    auc_scores.append(roc_auc)
    # PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(test_rows.pred_class, preds[:, 1])
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc.append(lr_auc)
    print("AUC: " + str(roc_auc))
    print("PR AUC: " + str(lr_auc))

print("Mean AUC: " + str(np.mean(auc_scores)))
print("Mean PR AUC: " + str(np.mean(pr_auc)))

# Uses the state left by the final loop iteration: confusion matrix of the last split only.
confusion = confusion_matrix(dat[dat['test']==1].pred_class, text_clf.predict(test_vec))
print(confusion)

  interactivity=interactivity, compiler=compiler, result=result)


AUC: 0.9903628117913832
PR AUC: 0.6388392857142857
AUC: 0.9489795918367346
PR AUC: 0.07771938374686052
AUC: 0.949546485260771
PR AUC: 0.1594644083085864
AUC: 0.990909090909091
PR AUC: 0.48080357142857144
AUC: 0.9087301587301587
PR AUC: 0.32587926183382987
AUC: 0.8667800453514739
PR AUC: 0.05765155020653751
AUC: 0.9302721088435375
PR AUC: 0.3713685277821604
AUC: 0.8843537414965986
PR AUC: 0.06412006176042784
AUC: 0.8168934240362812
PR AUC: 0.3571861456409572
AUC: 0.9648526077097506
PR AUC: 0.16079801980964772
AUC: 0.9098639455782314
PR AUC: 0.5138514051429026
AUC: 0.8985260770975056
PR AUC: 0.039183049985077475
AUC: 0.9761363636363636
PR AUC: 0.15021653398201384
AUC: 0.9178004535147393
PR AUC: 0.10697286100684232
AUC: 0.7522675736961452
PR AUC: 0.02790658664850475
AUC: 0.903061224489796
PR AUC: 0.07837903929101751
AUC: 0.9267045454545455
PR AUC: 0.41942040396638586
AUC: 0.9659863945578231
PR AUC: 0.5451526357833073
AUC: 0.9965986394557823
PR AUC: 0.6889880952380952
AUC: 0.87755102040816

In [198]:
# Persist the per-split scores of the local Word2Vec model, one value per row.
# newline="" is what the csv module requires of files handed to csv.writer;
# without it, platforms that translate "\n" produce an extra blank row after
# every record.
with open("auth_pr_auc_w2v.csv", "w", newline="") as csvfile:
    csv.writer(csvfile).writerows([score] for score in pr_auc)

with open("auth_auc_w2v.csv", "w", newline="") as csvfile:
    csv.writer(csvfile).writerows([score] for score in auc_scores)

# Pretrained Word2Vec

In [191]:
# nlp = gensim_api.load("word2vec-google-news-300")

# dat["text_clean_pretrain"] = dat["text"].apply(lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=False,lst_stopwords=lst_stopwords))
# dat['text_pretrain_token'] = dat.apply(lambda row: nltk.word_tokenize(row['text_clean_pretrain']), axis=1)
# # start empty matrix that hosts embeddings for each para, averaged from all words in the para
# # number of para x length of embedding (300)
# avg_embeddings = np.zeros((len(dat['text_pretrain_token']), 300))

# #iterate through each para (rows of the df)
# for index, row in dat.iterrows():
#     # each row is a paragraph of tokens
#     tokens = row["text_pretrain_token"]
#     # start an empty embedding matrix for all tokens in a para
#     # number of tokens x 300
#     tokens_embedding = np.zeros((len(tokens), 300))
#     # loop through each token to delete non embedded tokens
#     for i in range(0, len(tokens)):
#         try:
#             tokens_embedding[i] = nlp[tokens[i]] # fill the matrix with word embedding
#         except:
#             pass # leave as 0 if the token is not in the model
#     avg_embedding = np.average(tokens_embedding, axis = 0) # average within para
#     avg_embeddings[index] = avg_embedding # fill the main matrix

In [192]:
# savetxt('avg_embeddings_pretrained.csv', avg_embeddings, delimiter=',')


In [195]:
# Random forest on paragraph-averaged pretrained Word2Vec embeddings, scored on
# every held-out test split.
auc_scores = []  # ROC AUC, one entry per test split
pr_auc = []      # precision-recall AUC, one entry per test split

# Directory containing one CSV per held-out test split.
test_dir = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_auth/"

dat = pd.read_csv("20220115_all_paragraphs_2020_added.csv")
annotated1 = pd.read_csv("All_annotated_data_round_1_auth.csv")
annotated2 = pd.read_csv("All_annotated_data_round_2_auth.csv")
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
annotated = pd.concat([annotated1, annotated2])
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
avg_embeddings = loadtxt('avg_embeddings_pretrained.csv', delimiter=',')
# Embedding rows are looked up by dat's index labels below, so both must
# describe the same rows (a merge that duplicated rows would shift them).
assert len(dat) == len(avg_embeddings), "dat and avg_embeddings have different numbers of rows"

# os.listdir returns entries in arbitrary order; sorting keeps the order of the
# per-split scores stable across runs.
for filename in sorted(os.listdir(test_dir)):
    ## load the test set
    file = os.path.join(test_dir, filename)
    test = pd.read_csv(file)
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    # make sure annotated data and the original df match
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])]
    # attach train/test set to original df
    # These ids must be rebound on every iteration: the previous code assigned
    # them to differently named variables and then read `train_id`/`test_id`,
    # i.e. stale values from another cell (or a NameError on a fresh kernel).
    train_id = train['speech_par_id']
    test_id = test['speech_par_id']
    dat['test'] = np.where(dat['speech_par_id'].isin(test_id), 1, 0)
    dat['train'] = np.where(dat['speech_par_id'].isin(train_id), 1, 0)
    # Select features and labels from the same rows of dat so their order matches.
    train_rows = dat[dat['train'] == 1]
    test_rows = dat[dat['test'] == 1]
    train_vec = avg_embeddings[train_rows.index.tolist()]
    test_vec = avg_embeddings[test_rows.index.tolist()]
    ## Fit the model to the training set
    text_clf = RandomForestClassifier(n_estimators=500, max_depth=3, random_state=0, max_features = "sqrt", class_weight="balanced")
    text_clf.fit(train_vec, train_rows.pred_class)
    ## Predict out-of-sample on the test set and compute AUC
    preds = text_clf.predict_proba(test_vec)
    fpr, tpr, _ = metrics.roc_curve(test_rows.pred_class, preds[:, 1], pos_label=1)
    roc_auc = metrics.auc(fpr, tpr)
    auc_scores.append(roc_auc)
    # PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(test_rows.pred_class, preds[:, 1])
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc.append(lr_auc)
    print("AUC: " + str(roc_auc))
    print("PR AUC: " + str(lr_auc))

print("Mean AUC: " + str(np.mean(auc_scores)))
print("Mean PR AUC: " + str(np.mean(pr_auc)))

# Uses the state left by the final loop iteration: confusion matrix of the last split only.
confusion = confusion_matrix(dat[dat['test']==1].pred_class, text_clf.predict(test_vec))
print(confusion)

  interactivity=interactivity, compiler=compiler, result=result)


AUC: 0.9841269841269841
PR AUC: 0.2761257763975155
AUC: 0.9546485260770975
PR AUC: 0.13719918074756784
AUC: 0.9926303854875282
PR AUC: 0.528422619047619
AUC: 0.9840909090909091
PR AUC: 0.7057123655913978
AUC: 0.9427437641723357
PR AUC: 0.4257922535211267
AUC: 0.9342403628117913
PR AUC: 0.357935601425036
AUC: 0.9081632653061225
PR AUC: 0.2072977761485826
AUC: 0.8849206349206349
PR AUC: 0.36784782056054405
AUC: 0.9580498866213152
PR AUC: 0.503129076973255
AUC: 0.9750566893424035
PR AUC: 0.13950216450216452
AUC: 0.9586167800453516
PR AUC: 0.40967228252676013
AUC: 0.977891156462585
PR AUC: 0.2195487382987383
AUC: 0.99375
PR AUC: 0.5827380952380952
AUC: 0.9240362811791383
PR AUC: 0.5363298632241802
AUC: 0.9365079365079365
PR AUC: 0.11648044525883386
AUC: 0.971655328798186
PR AUC: 0.6298039215686275
AUC: 0.952840909090909
PR AUC: 0.17932349513231866
AUC: 0.9858276643990929
PR AUC: 0.5272916666666667
AUC: 0.988095238095238
PR AUC: 0.5739177489177489
AUC: 0.8594104308390024
PR AUC: 0.321359288

In [196]:
# Persist the per-split scores of the pretrained Word2Vec model, one value per row.
# newline="" is what the csv module requires of files handed to csv.writer;
# without it, platforms that translate "\n" produce an extra blank row after
# every record.
with open("auth_pr_auc_pretrained.csv", "w", newline="") as csvfile:
    csv.writer(csvfile).writerows([score] for score in pr_auc)

with open("auth_auc_pretrained.csv", "w", newline="") as csvfile:
    csv.writer(csvfile).writerows([score] for score in auc_scores)

# Doc2Vec Random Forest

In [3]:
# Random forest on Doc2Vec paragraph vectors, scored on every held-out test split.
auc_scores_d2v = []       # ROC AUC, one entry per test split
pr_auc = []               # precision-recall AUC, one entry per test split
accuracy_scores_d2v = []  # accuracy, one entry per test split

# Directory containing one CSV per held-out test split.
test_dir = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_auth/"

dat = pd.read_csv("20220115_all_paragraphs_2020_added.csv")
# The saved model is queried through `.docvecs`, i.e. it is a Doc2Vec model, so
# load it with Doc2Vec.load; gensim 4 rejects loading it via Word2Vec.load.
model = gensim.models.doc2vec.Doc2Vec.load("doc2vec_wordvecs.model")
annotated1 = pd.read_csv("All_annotated_data_round_1_auth.csv")
annotated2 = pd.read_csv("All_annotated_data_round_2_auth.csv")
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
annotated = pd.concat([annotated1, annotated2])
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
# NOTE(review): document vectors are looked up by dat's index labels below;
# this assumes the merge did not duplicate rows and that the model was trained
# on the paragraphs in this row order — confirm.

# os.listdir returns entries in arbitrary order; sorting keeps the order of the
# per-split scores stable across runs.
for filename in sorted(os.listdir(test_dir)):
    ## load the test set
    file = os.path.join(test_dir, filename)
    test = pd.read_csv(file)
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    # make sure annotated data and the original df match
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])]
    # merge test and train sets back onto the total df
    dat['test'] = np.where(dat['speech_par_id'].isin(test["speech_par_id"]), 1, 0)
    dat['train'] = np.where(dat['speech_par_id'].isin(train["speech_par_id"]), 1, 0)
    # Select features and labels from the same rows of dat so their order matches.
    test_rows = dat[dat['test'] == 1]
    train_rows = dat[dat['train'] == 1]
    test_set = np.asarray([model.docvecs[i] for i in test_rows.index.tolist()])
    train_set = np.asarray([model.docvecs[i] for i in train_rows.index.tolist()])
    ## Initialize a random forest classifier (the name `gbc` is historical)
    gbc = RandomForestClassifier(n_estimators=500, max_depth=3, random_state=0, max_features = "sqrt", class_weight="balanced")
    ## Fit the model to the training set
    gbc.fit(train_set, train_rows.pred_class)
    ## Predict out-of-sample on the test set and compute AUC
    preds = gbc.predict_proba(test_set)
    fpr_d2v, tpr_d2v, thresholds_d2v = metrics.roc_curve(test_rows.pred_class, preds[:, 1], pos_label=1)
    roc_auc = metrics.auc(fpr_d2v, tpr_d2v)
    auc_scores_d2v.append(roc_auc)
    # PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(test_rows.pred_class, preds[:, 1])
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc.append(lr_auc)
    print("AUC: " + str(roc_auc))
    accuracy_d2v = metrics.accuracy_score(test_rows.pred_class, gbc.predict(test_set))
    accuracy_scores_d2v.append(accuracy_d2v)
    print("Accuracy: " + str(accuracy_d2v))
    print("PR_AUC: " + str(lr_auc))

print("Mean AUC: " + str(np.mean(auc_scores_d2v)))
print("Mean Accuracy: " + str(np.mean(accuracy_scores_d2v)))

# Uses the state left by the final loop iteration: confusion matrix of the last split only.
confusion = confusion_matrix(dat[dat['test']==1].pred_class, gbc.predict(test_set))
print(confusion)

  interactivity=interactivity, compiler=compiler, result=result)


AUC: 0.8520408163265306
Accuracy: 0.9887640449438202
PR_AUC: 0.416724380541544
AUC: 0.945578231292517
Accuracy: 0.9865168539325843
PR_AUC: 0.11628207124774949
AUC: 0.9982993197278911
Accuracy: 0.9977528089887641
PR_AUC: 0.7666666666666666
AUC: 0.9403409090909092
Accuracy: 0.9932432432432432
PR_AUC: 0.6852176358601592
AUC: 0.9160997732426305
Accuracy: 0.9887640449438202
PR_AUC: 0.2910415517106079
AUC: 0.9501133786848073
Accuracy: 0.9865168539325843
PR_AUC: 0.3275834242589562
AUC: 0.8100907029478458
Accuracy: 0.9910112359550561
PR_AUC: 0.3067897987808168
AUC: 0.782312925170068
Accuracy: 0.9865168539325843
PR_AUC: 0.0628975343945541
AUC: 0.9336734693877551
Accuracy: 0.9887640449438202
PR_AUC: 0.28680396643783374
AUC: 0.8412698412698414
Accuracy: 0.9887640449438202
PR_AUC: 0.5111293142050152
AUC: 0.9801587301587302
Accuracy: 0.9842696629213483
PR_AUC: 0.30987903225806446
AUC: 0.935374149659864
Accuracy: 0.9910112359550561
PR_AUC: 0.17162490899955907
AUC: 1.0
Accuracy: 0.9954954954954955
PR

In [14]:
# Persist the per-split scores of the Doc2Vec random forest, one value per row.
# newline="" is what the csv module requires of files handed to csv.writer;
# without it, platforms that translate "\n" produce an extra blank row after
# every record.
with open("auth_pr_auc.csv", "w", newline="") as csvfile:
    csv.writer(csvfile).writerows([score] for score in pr_auc)

with open("auth_auc.csv", "w", newline="") as csvfile:
    csv.writer(csvfile).writerows([score] for score in auc_scores_d2v)

In [4]:
# Averages over all test splits for the Doc2Vec random forest.
for label, scores in (
    ("Mean ROC-AUC: ", auc_scores_d2v),
    ("Mean Accuracy: ", accuracy_scores_d2v),
    ("Mean PR_AUC: ", pr_auc),
):
    print(label + str(np.mean(scores)))


Mean ROC-AUC: 0.8979038342609771
Mean Accuracy: 0.9891209636602895
Mean PR_AUC: 0.41920697036186366


# Balanced Random Forest classifier (Doc2Vec features)

In [7]:
# Balanced random forest (imblearn) on Doc2Vec paragraph vectors, scored on
# every held-out test split.
auc_scores_balanced = []       # ROC AUC, one entry per test split
pr_auc_balanced = []           # precision-recall AUC, one entry per test split
accuracy_scores_balanced = []  # accuracy, one entry per test split

np.random.seed(234)
random.seed(234)

# Directory containing one CSV per held-out test split.
test_dir = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_auth/"

dat = pd.read_csv("20220115_all_paragraphs_2020_added.csv")
# The saved model is queried through `.docvecs`, i.e. it is a Doc2Vec model, so
# load it with Doc2Vec.load; gensim 4 rejects loading it via Word2Vec.load.
model = gensim.models.doc2vec.Doc2Vec.load("doc2vec_wordvecs.model")
# Load the annotations BEFORE merging them: the merge used to run first and
# therefore depended on an `annotated` frame left over from an earlier cell.
annotated1 = pd.read_csv("All_annotated_data_round_1_auth.csv")
annotated2 = pd.read_csv("All_annotated_data_round_2_auth.csv")
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
annotated = pd.concat([annotated1, annotated2])
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
# NOTE(review): document vectors are looked up by dat's index labels below;
# this assumes the merge did not duplicate rows and that the model was trained
# on the paragraphs in this row order — confirm.

# os.listdir returns entries in arbitrary order; sorting keeps the order of the
# per-split scores stable across runs.
for filename in sorted(os.listdir(test_dir)):
    ## load the test set
    file = os.path.join(test_dir, filename)
    test = pd.read_csv(file)
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    # make sure annotated data and the original df match
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])]
    # merge test and train sets back onto the total df
    dat['test'] = np.where(dat['speech_par_id'].isin(test["speech_par_id"]), 1, 0)
    dat['train'] = np.where(dat['speech_par_id'].isin(train["speech_par_id"]), 1, 0)
    # Select features and labels from the same rows of dat so their order matches.
    test_rows = dat[dat['test'] == 1]
    train_rows = dat[dat['train'] == 1]
    test_set = np.asarray([model.docvecs[i] for i in test_rows.index.tolist()])
    train_set = np.asarray([model.docvecs[i] for i in train_rows.index.tolist()])
    ## Initialize a balanced random forest classifier
    brfc = BalancedRandomForestClassifier(n_estimators=5000, max_depth=10, random_state=0, max_features = "sqrt")
    ## Fit the model to the training set
    brfc.fit(train_set, train_rows.pred_class)
    ## Predict out-of-sample on the test set and compute AUC
    preds = brfc.predict_proba(test_set)
    fpr, tpr, thresholds = metrics.roc_curve(test_rows.pred_class, preds[:, 1], pos_label=1)
    roc_auc = metrics.auc(fpr, tpr)
    auc_scores_balanced.append(roc_auc)
    # PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(test_rows.pred_class, preds[:, 1])
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc_balanced.append(lr_auc)
    print("AUC: " + str(roc_auc))
    accuracy_balanced = metrics.accuracy_score(test_rows.pred_class, brfc.predict(test_set))
    accuracy_scores_balanced.append(accuracy_balanced)
    print("Accuracy: " + str(accuracy_balanced))
    print("PR_AUC: " + str(lr_auc))

print("Mean AUC: " + str(np.mean(auc_scores_balanced)))
print("Mean Accuracy: " + str(np.mean(accuracy_scores_balanced)))

  interactivity=interactivity, compiler=compiler, result=result)


AUC: 0.8820861678004536
Accuracy: 0.9191011235955057
PR_AUC: 0.4159594249378409
AUC: 0.95578231292517
Accuracy: 0.9325842696629213
PR_AUC: 0.12815554959250913
AUC: 0.9994331065759636
Accuracy: 0.946067415730337
PR_AUC: 0.94375
AUC: 0.9357954545454545
Accuracy: 0.9572072072072072
PR_AUC: 0.757506262894194
AUC: 0.9212018140589568
Accuracy: 0.9258426966292135
PR_AUC: 0.29208665044965354
AUC: 0.9631519274376418
Accuracy: 0.9370786516853933
PR_AUC: 0.34020008115402856
AUC: 0.7922335600907029
Accuracy: 0.9191011235955057
PR_AUC: 0.4898935174788532
AUC: 0.757936507936508
Accuracy: 0.9303370786516854
PR_AUC: 0.09778652101382089
AUC: 0.9257369614512472
Accuracy: 0.9101123595505618
PR_AUC: 0.3275864639236732
AUC: 0.8815192743764173
Accuracy: 0.9258426966292135
PR_AUC: 0.4124296584099125
AUC: 0.9784580498866214
Accuracy: 0.9438202247191011
PR_AUC: 0.28761052166224577
AUC: 0.9512471655328797
Accuracy: 0.9213483146067416
PR_AUC: 0.17970954306899703
AUC: 1.0
Accuracy: 0.9234234234234234
PR_AUC: 1.0


In [15]:
# Persist the per-split scores of the balanced random forest, one value per row.
# newline="" is what the csv module requires of files handed to csv.writer;
# without it, platforms that translate "\n" produce an extra blank row after
# every record.
with open("auth_pr_auc_balanced.csv", "w", newline="") as csvfile:
    csv.writer(csvfile).writerows([score] for score in pr_auc_balanced)

with open("auth_auc_balanced.csv", "w", newline="") as csvfile:
    csv.writer(csvfile).writerows([score] for score in auc_scores_balanced)

In [363]:
# Averages over all test splits for the balanced random forest.
for label, scores in (
    ("Mean ROC-AUC: ", auc_scores_balanced),
    ("Mean Accuracy: ", accuracy_scores_balanced),
    ("Mean PR_AUC: ", pr_auc_balanced),
):
    print(label + str(np.mean(scores)))

Mean ROC-AUC: 0.9065323644609359
Mean Accuracy: 0.9329209434153255
Mean PR_AUC: 0.436326711684498


In [10]:
# Confusion matrix of the balanced random forest. `dat['test']`, `brfc` and
# `test_set` are whatever the final loop iteration of the cell above left
# behind, so this describes the last test split only.
confusion = confusion_matrix(dat[dat['test']==1].pred_class, brfc.predict(test_set))
print(confusion)

[[421  20]
 [  3   1]]


In [298]:
# Rebind `dat` to the 20220131 "missings_added" paragraph file; the train/test
# columns and merged labels from the cells above are discarded.
dat = pd.read_csv('20220131_all_paragraphs_2020_added_missings_added.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [441]:
# Spot check: show the row(s) whose speech_par_id is "3369_11".
dat[dat["speech_par_id"] == "3369_11"]

Unnamed: 0.1,Unnamed: 0,Speech_id,text,party,term,comp,populist_old_keywords,par_id,speech_par_id
49468,49469,3369,"So, that was a strong economy that Bill Clinto...",rep,1996.0,False,False,11,3369_11


In [452]:
# Reload the corpus and the annotation file of each frame.
dat = pd.read_csv("20220131_all_paragraphs_2020_added_missings_added.csv")
pop = pd.read_csv("All_annotated_data_populism.csv")
low_pr = pd.read_csv("All_annotated_data_round_1_low_pride.csv")
high_pr = pd.read_csv("All_annotated_data_round_1_high_pride.csv")
auth = pd.read_csv("All_annotated_data_round_1_auth.csv")
exc = pd.read_csv("All_annotated_data_exclusion_LO_recodedd.csv")
inc = pd.read_csv("All_annotated_data_inclusion.csv")

# Left-join each frame's `pred_class` onto the corpus and rename it to the
# frame's name right away, so the next merge adds a fresh `pred_class` column.
frame_annotations = (
    ("pop", pop),
    ("low_pr", low_pr),
    ("high_pr", high_pr),
    ("auth", auth),
    ("exc", exc),
    ("inc", inc),
)
for frame_name, frame_df in frame_annotations:
    frame_labels = frame_df[["speech_par_id", "pred_class"]]
    dat = dat.merge(frame_labels, how = "left", on = "speech_par_id")
    dat = dat.rename({'pred_class': frame_name}, axis=1)



  interactivity=interactivity, compiler=compiler, result=result)


In [457]:
# Export the paragraphs labelled positive in more than one frame.
# The six label columns come from left merges, so a paragraph that was not
# annotated for some frame holds NaN there. Adding the columns with `+` turns
# the whole sum into NaN and silently drops such paragraphs; a row-wise sum
# skips the missing labels instead. Rows annotated in all six frames are
# selected exactly as before.
frame_cols = ['pop', 'low_pr', 'high_pr', 'auth', 'exc', 'inc']
dat[dat[frame_cols].sum(axis=1) > 1].to_csv("multiframe_pars.csv")

In [472]:
# Load the prediction file of each frame (one CSV per frame).
# NOTE(review): the auth file is "Iteration_4" while the other five are
# "Iteration_3" — confirm this is intentional.
pop_pred = pd.read_csv("pop_Iteration_3_predictions10239_1e-05_5_4_1_missings_merged.csv")
auth_pred = pd.read_csv("auth_Iteration_4_predictions10239_1e-05_5_4_1_missings_merged.csv")
exc_pred = pd.read_csv("exclusion_Iteration_3_predictions10239_1e-05_5_4_1_missings_merged.csv")
inc_pred = pd.read_csv("inclusion_Iteration_3_predictions10239_1e-05_5_4_1_missings_merged.csv")
high_pr_pred = pd.read_csv("high_pride_Iteration_3_predictions10239_1e-05_5_4_1_missings_merged.csv")
low_pr_pred = pd.read_csv("low_pride_Iteration_3_predictions10239_1e-05_5_4_1_missings_merged.csv")

In [482]:
# Collect the positive-class probability of every frame in one table keyed by
# speech_par_id, starting from the populism predictions.
prob_cols = ["speech_par_id", "Predictions_prob_1"]

# Both sides of the first merge carry `Predictions_prob_1`, so pandas suffixes
# them with _x (populism) and _y (low pride); rename both to the frame names.
pred_dat = pop_pred.merge(low_pr_pred[prob_cols], how = "left", on = "speech_par_id")
pred_dat = pred_dat.rename({'Predictions_prob_1_x': 'pop', 'Predictions_prob_1_y': 'low_pr'}, axis=1)

# After the rename no `Predictions_prob_1` column is left, so each further
# merge adds it unsuffixed and it is renamed to the frame's name immediately.
remaining_frames = (
    ("high_pr", high_pr_pred),
    ("auth", auth_pred),
    ("exc", exc_pred),
    ("inc", inc_pred),
)
for frame_name, frame_pred in remaining_frames:
    pred_dat = pred_dat.merge(frame_pred[prob_cols], how = "left", on = "speech_par_id")
    pred_dat = pred_dat.rename({'Predictions_prob_1': frame_name}, axis=1)


In [483]:
# Preview the combined per-frame probabilities.
pred_dat.head()

Unnamed: 0.1,Unnamed: 0,pop,Predictions_prob_0,text,speech_par_id,uncertainty,low_pr,high_pr,auth,exc,inc
0,1,0.999835,0.000165,"The truth is, for twenty-six years in Washingt...",2008-09-28-remarks-detroit-michigan_10,0.999671,0.000284,0.000236,5.2e-05,8e-05,0.000317
1,2,0.999835,0.000165,And last minute conversions aren't going to hi...,2854_5,0.999669,0.000207,0.000222,5.6e-05,7.7e-05,0.000428
2,3,0.999834,0.000166,My opponent's first reaction to this crisis on...,2008-09-18-espanola-new-mexico_4,0.999669,0.000207,0.000575,5.1e-05,8.1e-05,0.000321
3,4,0.999834,0.000166,We saw an abandonment of the commitment to bet...,2382_1,0.999667,0.999762,0.000529,5e-05,9.2e-05,0.997079
4,5,0.999833,0.000167,"Now, I certainly don't fault Senator McCain fo...",2008-09-17-remarks-elko-nevada_10,0.999666,0.000262,0.000208,5.1e-05,8.1e-05,0.000317
