In [202]:
import pandas as pd
import numpy as np
import string
import re
import random 
import os
import csv

from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from gensim.parsing.porter import PorterStemmer

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline


from imblearn.ensemble import BalancedRandomForestClassifier

import nltk
# nltk.download('wordnet')

## for word embedding
import gensim
import gensim.downloader as gensim_api
from numpy import asarray
from numpy import savetxt
from numpy import loadtxt


# Tf-idf RF

In [203]:
auc_scores = []
pr_auc = []

dat = pd.read_csv("20220115_all_paragraphs_2020_added.csv")
# dat['speech_par_id'] = dat['Speech_id'].astype(str) + "_" + dat['par_id'].astype(str)
# X = np.asarray([model.docvecs[i] for i in label_dat.index.tolist()])
# Y = np.asarray(label_dat['label'].tolist(), dtype="int")
annotated1 = pd.read_csv("All_annotated_data_round_1_auth.csv")
annotated2 = pd.read_csv("All_annotated_data_round_2_auth.csv")
annotated = annotated1.append(annotated2)
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")


for filename in os.listdir("/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_auth/"):
    ## load the test set 
    file = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_auth/" + "/" + filename
    test = pd.read_csv(file)
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])] # make sure annotated data and the original df match
    # pipeline BOW, tfidf and RF
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', RandomForestClassifier(n_estimators=5000, max_depth=3,max_features='sqrt', random_state=0, class_weight="balanced")),
                                                 ])
    ## Fit the model to the training set
    text_clf = text_clf.fit(train['text'], train['pred_class'])
    ## Predict out-of-sample on the test set and compute AUC
    preds = text_clf.predict_proba(test['text'])
    fpr_d2v, tpr_d2v, thresholds_d2v = metrics.roc_curve(test['true_class'], preds[:,1], pos_label=1)
    auc_scores = auc_scores + [metrics.auc(fpr_d2v, tpr_d2v)]
    #PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(test['true_class'],  preds[:,1].tolist())
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc = pr_auc + [lr_auc]
    print("AUC: "+str(metrics.auc(fpr_d2v, tpr_d2v)))
    print("PR AUC: " + str(lr_auc))
    
print("Mean AUC: " + str(np.mean(auc_scores)))
print("Mean PR AUC: " + str(np.mean(pr_auc)))

confusion = confusion_matrix(test['true_class'],text_clf.predict(test['text']) )
print(confusion)

  interactivity=interactivity, compiler=compiler, result=result)


AUC: 0.9931972789115646
PR AUC: 0.5674242424242424
AUC: 0.8945578231292517
PR AUC: 0.15192972141472239
AUC: 0.9965986394557823
PR AUC: 0.5041666666666667
AUC: 0.8090909090909092
PR AUC: 0.575508716251412
AUC: 0.8497732426303855
PR AUC: 0.08748819396240921
AUC: 0.9642857142857143
PR AUC: 0.19059068333832482
AUC: 0.9778911564625851
PR AUC: 0.19095441595441595
AUC: 0.9778911564625851
PR AUC: 0.4058531746031746
AUC: 0.8441043083900227
PR AUC: 0.2768570150075726
AUC: 0.9937641723356009
PR AUC: 0.5583333333333333
AUC: 0.973922902494331
PR AUC: 0.20149711399711398
AUC: 0.9801587301587301
PR AUC: 0.28397177419354835
AUC: 0.9897727272727272
PR AUC: 0.3092948717948718
AUC: 0.764172335600907
PR AUC: 0.088080600293397
AUC: 0.8180272108843537
PR AUC: 0.32822180208272617
AUC: 0.884920634920635
PR AUC: 0.2718755787963547
AUC: 0.8477272727272727
PR AUC: 0.07983657473672641
AUC: 0.9909297052154196
PR AUC: 0.31117424242424246
AUC: 0.9931972789115646
PR AUC: 0.5354166666666667
AUC: 0.8985260770975056
PR 

In [204]:
with open("auth_pr_auc_tfidf.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in pr_auc:
        csvwriter.writerow([row])    
    
with open("auth_auc_tfidf.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in auc_scores:
        csvwriter.writerow([row])

# Word2Vec RF

In [143]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

lst_stopwords = nltk.corpus.stopwords.words("english")

dat["text_clean"] = dat["text"].apply(lambda x: utils_preprocess_text(x, flg_stemm=True, flg_lemm=True,lst_stopwords=lst_stopwords))


In [144]:
dat["text_clean"].shape

(75978,)

In [145]:
phrases1 = Phrases(map(lambda x: x.split(), dat["text_clean"].tolist())) #bigram
phrases2 = Phrases(phrases1[map(lambda x: x.split(), dat["text_clean"].tolist())]) #trigram
dat["phrased_text"] = dat["text_clean"].apply(lambda x: " ".join(phrases2[phrases1[x.split()]]))

In [146]:
dat['phrased_text'].head()

0    start talk economi best get right point jimmi_...
1    secret group hit_hardest mr_carter inflationar...
2    elderli work_hard enjoy retir year expect surv...
3    believ social_secur one nation vital commit se...
4    contrast commit econom program reduc inflat pu...
Name: phrased_text, dtype: object

In [147]:
dat['phrased_tokens'] = dat.apply(lambda row: nltk.word_tokenize(row['phrased_text']), axis=1)

# Train a Word2Vec (averaged within paragraphs)

In [72]:
## fit Word2Vec model
nlp = gensim.models.word2vec.Word2Vec(dat['phrased_tokens'], size=150, window=10, min_count=10, negative=10)

In [113]:
# check model
nlp.wv.most_similar('tax') 

# save model
nlp.save("word2vec_wordvecs.model")

In [175]:
# start empty matrix that hosts embeddings for each para, averaged from all words in the para
# number of para x length of embedding (150)
avg_embeddings = np.zeros((len(dat['phrased_tokens']), 150))

# load model
nlp = Word2Vec.load("word2vec_wordvecs.model") 
#iterate through each para (rows of the df)
for index, row in dat.iterrows():
    # each row is a paragraph of tokens
    tokens = row["phrased_tokens"]
    # start an empty embedding matrix for all tokens in a para
    # number of tokens x 150
    tokens_embedding = np.zeros((len(tokens), 150))
    # loop through each token to delete non embedded tokens
    for i in range(0, len(tokens)):
        try:
            tokens_embedding[i] = nlp[tokens[i]] # fill the matrix with word embedding
        except:
            pass # leave as 0 if the token is not in the model
    avg_embedding = np.average(tokens_embedding, axis = 0) # average within para
    avg_embeddings[index] = avg_embedding # fill the main matrix


        



In [178]:
avg_embeddings.shape
savetxt('avg_embeddings.csv', avg_embeddings, delimiter=',')


# Local Word2Vec RF

In [501]:
auc_scores = []
pr_auc = []

dat = pd.read_csv("20220115_all_paragraphs_2020_added.csv")

annotated1 = pd.read_csv("All_annotated_data_round_1_auth_20220209.csv")
annotated2 = pd.read_csv("All_annotated_data_round_2_auth.csv")
annotated = annotated1.append(annotated2)
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
avg_embeddings = loadtxt('avg_embeddings.csv', delimiter=',')

for filename in os.listdir("/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_auth/"):
    ## load the test set 
    file = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_auth/" + "/" + filename
    test = pd.read_csv(file)
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])] # make sure annotated data and the original df match
    # attach train/test set to original df
    train_id = train['speech_par_id']
    test_id = test['speech_par_id']    
    dat['test'] = np.where(dat['speech_par_id'].isin(test_id), 1, 0)
    dat['train'] = np.where(dat['speech_par_id'].isin(train_id), 1, 0)
    # get the embeddings to train/test set
    train_vec = np.asarray([avg_embeddings[i] for i in dat[dat['speech_par_id'].isin(train_id)].index.tolist()])
    test_vec = np.asarray([avg_embeddings[i] for i in dat[dat['speech_par_id'].isin(test_id)].index.tolist()])
    ## Fit the model to the training set
    text_clf = RandomForestClassifier(n_estimators=500, max_depth=3, random_state=0, max_features = "sqrt", class_weight="balanced")
    text_clf.fit(train_vec,  dat[dat['train']==1].pred_class) # use the original df to match the order
    ## Predict out-of-sample on the test set and compute AUC
    preds = text_clf.predict_proba(test_vec)
    fpr_d2v, tpr_d2v, thresholds_d2v = metrics.roc_curve(dat[dat['test']==1].pred_class, preds[:,1], pos_label=1)
    auc_scores = auc_scores + [metrics.auc(fpr_d2v, tpr_d2v)]
    #PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(dat[dat['test']==1].pred_class,  preds[:,1].tolist())
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc = pr_auc + [lr_auc]
    print("AUC: "+str(metrics.auc(fpr_d2v, tpr_d2v)))
    print("PR AUC: " + str(lr_auc))
    
print("Mean AUC: " + str(np.mean(auc_scores)))
print("Mean PR AUC: " + str(np.mean(pr_auc)))

confusion = confusion_matrix(dat[dat['test']==1].pred_class,text_clf.predict(test_vec) )
print(confusion)

  interactivity=interactivity, compiler=compiler, result=result)


AUC: 0.8134920634920635
PR AUC: 0.06880285045563078
AUC: 0.9670454545454544
PR AUC: 0.3824015022675737
AUC: 0.9484126984126984
PR AUC: 0.07441325014854426
AUC: 0.9336734693877551
PR AUC: 0.14153649498076742
AUC: 0.8931818181818181
PR AUC: 0.3174445552289614
AUC: 0.9931972789115646
PR AUC: 0.6822115384615385
AUC: 0.75
PR AUC: 0.02583622166394559
AUC: 0.8741496598639457
PR AUC: 0.05100745836153926
AUC: 0.9019274376417233
PR AUC: 0.04129261852899963
AUC: 0.8713151927437642
PR AUC: 0.06722602286607539
AUC: 0.9818181818181819
PR AUC: 0.179229797979798
AUC: 0.8798185941043084
PR AUC: 0.5124690750842051
AUC: 0.9142045454545454
PR AUC: 0.07708309665622583
AUC: 0.9223356009070296
PR AUC: 0.16904357304697168
AUC: 0.9625850340136054
PR AUC: 0.15420747730530338
AUC: 0.8015873015873015
PR AUC: 0.34568201591105896
AUC: 0.9965986394557823
PR AUC: 0.6889880952380952
AUC: 0.9255681818181818
PR AUC: 0.5247532766080263
AUC: 0.9710884353741496
PR AUC: 0.5552009246088194
AUC: 0.9756235827664399
PR AUC: 0.1

In [502]:
with open("auth_pr_auc_w2v.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in pr_auc:
        csvwriter.writerow([row])    
    
with open("auth_auc_w2v.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in auc_scores:
        csvwriter.writerow([row])

# Pretrained Word2Vec

In [191]:
# nlp = gensim_api.load("word2vec-google-news-300")

# dat["text_clean_pretrain"] = dat["text"].apply(lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=False,lst_stopwords=lst_stopwords))
# dat['text_pretrain_token'] = dat.apply(lambda row: nltk.word_tokenize(row['text_clean_pretrain']), axis=1)
# # start empty matrix that hosts embeddings for each para, averaged from all words in the para
# # number of para x length of embedding (150)
# avg_embeddings = np.zeros((len(dat['text_pretrain_token']), 300))

# #iterate through each para (rows of the df)
# for index, row in dat.iterrows():
#     # each row is a paragraph of tokens
#     tokens = row["text_pretrain_token"]
#     # start an empty embedding matrix for all tokens in a para
#     # number of tokens x 150
#     tokens_embedding = np.zeros((len(tokens), 300))
#     # loop through each token to delete non embedded tokens
#     for i in range(0, len(tokens)):
#         try:
#             tokens_embedding[i] = nlp[tokens[i]] # fill the matrix with word embedding
#         except:
#             pass # leave as 0 if the token is not in the model
#     avg_embedding = np.average(tokens_embedding, axis = 0) # average within para
#     avg_embeddings[index] = avg_embedding # fill the main matrix

In [192]:
# savetxt('avg_embeddings_pretrained.csv', avg_embeddings, delimiter=',')


In [499]:
auc_scores = []
pr_auc = []

dat = pd.read_csv("20220131_all_paragraphs_2020_added_missings_added.csv")
annotated1 = pd.read_csv("All_annotated_data_round_1_auth_20220209.csv")
annotated2 = pd.read_csv("All_annotated_data_round_2_auth.csv")
annotated = annotated1.append(annotated2)
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
avg_embeddings = loadtxt('avg_embeddings_pretrained.csv', delimiter=',')

for filename in os.listdir("/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_auth/"):
    ## load the test set 
    file = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_auth/" + "/" + filename
    test = pd.read_csv(file)
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])] # make sure annotated data and the original df match
    # attach train/test set to original df
    train_text = train['speech_par_id']
    test_text = test['speech_par_id']    
    dat['test'] = np.where(dat['speech_par_id'].isin(test_id), 1, 0)
    dat['train'] = np.where(dat['speech_par_id'].isin(train_id), 1, 0)
    # get the embeddings to train/test set
    train_vec = np.asarray([avg_embeddings[i] for i in dat[dat['speech_par_id'].isin(train_id)].index.tolist()])
    test_vec = np.asarray([avg_embeddings[i] for i in dat[dat['speech_par_id'].isin(test_id)].index.tolist()])
    ## Fit the model to the training set
    text_clf = RandomForestClassifier(n_estimators=500, max_depth=3, random_state=0, max_features = "sqrt", class_weight="balanced")
    text_clf.fit(train_vec,  dat[dat['train']==1].pred_class) # use the original df to match the order
    ## Predict out-of-sample on the test set and compute AUC
    preds = text_clf.predict_proba(test_vec)
    fpr_d2v, tpr_d2v, thresholds_d2v = metrics.roc_curve(dat[dat['test']==1].pred_class, preds[:,1], pos_label=1)
    auc_scores = auc_scores + [metrics.auc(fpr_d2v, tpr_d2v)]
    #PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(dat[dat['test']==1].pred_class,  preds[:,1].tolist())
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc = pr_auc + [lr_auc]
    print("AUC: "+str(metrics.auc(fpr_d2v, tpr_d2v)))
    print("PR AUC: " + str(lr_auc))
    
print("Mean AUC: " + str(np.mean(auc_scores)))
print("Mean PR AUC: " + str(np.mean(pr_auc)))

confusion = confusion_matrix(dat[dat['test']==1].pred_class,text_clf.predict(test_vec) )
print(confusion)

  interactivity=interactivity, compiler=compiler, result=result)


AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.9920634920634921
PR AUC: 0.37387716450216446
AUC: 0.99206

In [500]:
with open("auth_pr_auc_pretrained.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in pr_auc:
        csvwriter.writerow([row])    
    
with open("auth_auc_pretrained.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in auc_scores:
        csvwriter.writerow([row])

# Doc2Vec Random Forest

In [503]:
auc_scores_d2v = []
pr_auc = []
accuracy_scores_d2v = []

dat = pd.read_csv("20201115_all_paragraphs.csv")
dat['speech_par_id'] = dat['Speech_id'].astype(str) + "_" + dat['par_id'].astype(str)
annotated1 = pd.read_csv("All_annotated_data_round_1_auth_20220209.csv")
annotated2 = pd.read_csv("All_annotated_data_round_2_auth.csv")
annotated = annotated1.append(annotated2)

model = Word2Vec.load("doc2vec_wordvecs.model") 
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")

for filename in os.listdir("/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_auth/"):
    ## load the test set 
    file = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_auth/" + "/" + filename
    test = pd.read_csv(file)
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])] # make sure annotated data and the original df match
    # merge test and train sets back onto the total df
    dat['test'] = np.where(dat['speech_par_id'].isin(test["speech_par_id"]), 1, 0)
    dat['train'] = np.where(dat['speech_par_id'].isin(train["speech_par_id"]), 1, 0)
    # so that teh order of rows are the same with the embeddings
    test_set = np.asarray([model.docvecs[i] for i in dat[dat['test'] == 1].index.tolist()])
    train_set = np.asarray([model.docvecs[i] for i in dat[dat['train']== 1].index.tolist()])
    ## Initialize a random forest classifier
    gbc = RandomForestClassifier(n_estimators=500, max_depth=10, random_state=0, max_features = "sqrt", class_weight="balanced")
    ## Fit the model to the training set
    gbc.fit(train_set, dat[dat['train']==1].pred_class)
    ## Predict out-of-sample on the test set and compute AUC
    preds = gbc.predict_proba(test_set)
    fpr_d2v, tpr_d2v, thresholds_d2v = metrics.roc_curve(dat[dat['test']==1].pred_class, preds[:,1], pos_label=1)
    auc_scores_d2v = auc_scores_d2v + [metrics.auc(fpr_d2v, tpr_d2v)]
    #PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(dat[dat['test']==1].pred_class,  preds[:,1].tolist())
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc = pr_auc + [lr_auc]
    print("AUC: "+str(metrics.auc(fpr_d2v, tpr_d2v)))
    accuracy_d2v = metrics.accuracy_score(dat[dat['test']==1].pred_class, gbc.predict(test_set))
    accuracy_scores_d2v = accuracy_scores_d2v + [accuracy_d2v]
    print("Accuracy: " + str(accuracy_d2v))
    print("PR_AUC: " + str(lr_auc))

print("Mean AUC: " + str(np.mean(auc_scores_d2v)))
print("Mean Accuracy: " + str(np.mean(accuracy_scores_d2v)))

confusion = confusion_matrix(dat[dat['test']==1].pred_class, gbc.predict(test_set))
print(confusion)

AUC: 0.9278409090909091
Accuracy: 0.990990990990991
PR_AUC: 0.33866672462333497
AUC: 0.9083143507972666
Accuracy: 0.9909706546275395
PR_AUC: 0.7553168883961567
AUC: 0.9552154195011338
Accuracy: 0.9887640449438202
PR_AUC: 0.12715698393902056
AUC: 0.9994331065759637
Accuracy: 0.9910112359550561
PR_AUC: 0.94375
AUC: 0.9107954545454545
Accuracy: 0.990990990990991
PR_AUC: 0.29620811466055585
AUC: 0.9937641723356009
Accuracy: 0.9910112359550561
PR_AUC: 0.8101190476190476
AUC: 0.9246031746031745
Accuracy: 0.9910112359550561
PR_AUC: 0.30775305068100794
AUC: 0.9573863636363638
Accuracy: 0.990990990990991
PR_AUC: 0.39699561403508765
AUC: 0.9478458049886621
Accuracy: 0.9910112359550561
PR_AUC: 0.2409233977070789
AUC: 0.8704545454545454
Accuracy: 0.990990990990991
PR_AUC: 0.5318037259405952
AUC: 1.0
Accuracy: 0.9909706546275395
PR_AUC: 1.0
AUC: 0.98125
Accuracy: 0.990990990990991
PR_AUC: 0.22192852437417654
AUC: 0.9390660592255125
Accuracy: 0.9909706546275395
PR_AUC: 0.30090376220140286
AUC: 0.984

In [505]:
with open("auth_pr_auc_d2v.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in pr_auc:
        csvwriter.writerow([row])    
    
with open("auth_auc_d2v.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in auc_scores_d2v:
        csvwriter.writerow([row])

In [504]:
print("Mean ROC-AUC: " + str(np.mean(auc_scores_d2v)))
print("Mean Accuracy: " + str(np.mean(accuracy_scores_d2v)))
print("Mean PR_AUC: " + str(np.mean(pr_auc)))


Mean ROC-AUC: 0.9546219929056734
Mean Accuracy: 0.9909059475180154
Mean PR_AUC: 0.49501126777827403


# balanced RFC

In [508]:
auc_scores_balanced = []
pr_auc_balanced = []
accuracy_scores_balanced = []

np.random.seed(234) 
random.seed(234)

dat = pd.read_csv("20201115_all_paragraphs.csv")
dat['speech_par_id'] = dat['Speech_id'].astype(str) + "_" + dat['par_id'].astype(str)
# X = np.asarray([model.docvecs[i] for i in label_dat.index.tolist()])
# Y = np.asarray(label_dat['label'].tolist(), dtype="int")
model = Word2Vec.load("doc2vec_wordvecs.model") 
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
annotated1 = pd.read_csv("All_annotated_data_round_1_auth_20220209.csv")
annotated2 = pd.read_csv("All_annotated_data_round_2_auth.csv")
annotated = annotated1.append(annotated2)

for filename in os.listdir("/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_auth/"):
    ## load the test set 
    file = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_auth/" + "/" + filename
    test = pd.read_csv(file)
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])] # make sure annotated data and the original df match
    # merge test and train sets back onto the total df
    dat['test'] = np.where(dat['speech_par_id'].isin(test["speech_par_id"]), 1, 0)
    dat['train'] = np.where(dat['speech_par_id'].isin(train["speech_par_id"]), 1, 0)
    # so that teh order of rows are the same with the embeddings
    test_set = np.asarray([model.docvecs[i] for i in dat[dat['test'] == 1].index.tolist()])
    train_set = np.asarray([model.docvecs[i] for i in dat[dat['train']== 1].index.tolist()])
    ## Initialize a random forest classifier
    brfc = BalancedRandomForestClassifier(n_estimators=5000, max_depth=10, random_state=0, max_features = "sqrt")
    ## Fit the model to the training set
    brfc.fit(train_set, dat[dat['train']==1].pred_class)
    ## Predict out-of-sample on the test set and compute AUC
    preds = brfc.predict_proba(test_set)
    fpr, tpr, thresholds = metrics.roc_curve(dat[dat['test']==1].pred_class, preds[:,1], pos_label=1)
    auc_scores_balanced = auc_scores_balanced + [metrics.auc(fpr, tpr)]
    #PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(dat[dat['test']==1].pred_class,  preds[:,1].tolist())
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc_balanced = pr_auc_balanced + [lr_auc]
    print("AUC: "+str(metrics.auc(fpr, tpr)))
    accuracy_balanced = metrics.accuracy_score(dat[dat['test']==1].pred_class, brfc.predict(test_set))
    accuracy_scores_balanced = accuracy_scores_balanced + [accuracy_balanced]
    print("Accuracy: " + str(accuracy_balanced))
    print("PR_AUC: " + str(lr_auc))

print("Mean AUC: " + str(np.mean(auc_scores_balanced)))
print("Mean Accuracy: " + str(np.mean(accuracy_scores_balanced)))

AUC: 0.9494318181818182
Accuracy: 0.9301801801801802
PR_AUC: 0.315367724012682
AUC: 0.9555808656036446
Accuracy: 0.963882618510158
PR_AUC: 0.7607271906052394
AUC: 0.9671201814058957
Accuracy: 0.946067415730337
PR_AUC: 0.1567598505098505
AUC: 1.0
Accuracy: 0.946067415730337
PR_AUC: 1.0
AUC: 0.9357954545454545
Accuracy: 0.9301801801801802
PR_AUC: 0.3079714398711176
AUC: 0.985827664399093
Accuracy: 0.9280898876404494
PR_AUC: 0.780634236453202
AUC: 0.917233560090703
Accuracy: 0.9483146067415731
PR_AUC: 0.2986927382339074
AUC: 0.9693181818181819
Accuracy: 0.9436936936936937
PR_AUC: 0.4014520202020202
AUC: 0.953514739229025
Accuracy: 0.9303370786516854
PR_AUC: 0.2435840033994862
AUC: 0.9022727272727273
Accuracy: 0.9481981981981982
PR_AUC: 0.5236968723764499
AUC: 1.0
Accuracy: 0.9322799097065463
PR_AUC: 1.0
AUC: 0.9818181818181818
Accuracy: 0.9504504504504504
PR_AUC: 0.2951505016722408
AUC: 0.9518792710706151
Accuracy: 0.9390519187358917
PR_AUC: 0.3274140211640212
AUC: 0.9812500000000001
Accu

In [510]:
with open("auth_pr_auc_balanced.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in pr_auc_balanced:
        csvwriter.writerow([row])    

with open("auth_auc_balanced.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in auc_scores_balanced:
        csvwriter.writerow([row])

In [509]:
print("Mean ROC-AUC: " + str(np.mean(auc_scores_balanced)))
print("Mean Accuracy: " + str(np.mean(accuracy_scores_balanced)))
print("Mean PR_AUC: " + str(np.mean(pr_auc_balanced)))

Mean ROC-AUC: 0.9632317001073916
Mean Accuracy: 0.9402098388240371
Mean PR_AUC: 0.5157884783929002


In [10]:
confusion = confusion_matrix(dat[dat['test']==1].pred_class, brfc.predict(test_set))
print(confusion)

[[421  20]
 [  3   1]]


In [515]:
dat = pd.read_csv('20220131_all_paragraphs_2020_added_missings_added.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [516]:
dat[dat["speech_par_id"] == "1013_6"]

Unnamed: 0.1,Unnamed: 0,Speech_id,text,party,term,comp,populist_old_keywords,par_id,speech_par_id
10590,10591,1013,"Well, the best answer I know to Republican fea...",dem,1952.0,False,False,6,1013_6


In [517]:
missing_ids = []
for id in annotated['speech_par_id']:
    if id not in dat['speech_par_id']:
        missing_ids = missing_ids + [id]
        
len(missing_ids)

2423

In [452]:
dat = pd.read_csv("20220131_all_paragraphs_2020_added_missings_added.csv")
pop = pd.read_csv("All_annotated_data_populism.csv")
low_pr = pd.read_csv("All_annotated_data_round_1_low_pride.csv")
high_pr = pd.read_csv("All_annotated_data_round_1_high_pride.csv")
auth = pd.read_csv("All_annotated_data_round_1_auth.csv")
exc = pd.read_csv("All_annotated_data_exclusion_LO_recodedd.csv")
inc = pd.read_csv("All_annotated_data_inclusion.csv")

dat = dat.merge(pop[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
dat = dat.rename({'pred_class': 'pop'}, axis=1)
dat = dat.merge(low_pr[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
dat = dat.rename({'pred_class': 'low_pr'}, axis=1)
dat = dat.merge(high_pr[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
dat = dat.rename({'pred_class': 'high_pr'}, axis=1)
dat = dat.merge(auth[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
dat = dat.rename({'pred_class': 'auth'}, axis=1)
dat = dat.merge(exc[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
dat = dat.rename({'pred_class': 'exc'}, axis=1)
dat = dat.merge(inc[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
dat = dat.rename({'pred_class': 'inc'}, axis=1)



  interactivity=interactivity, compiler=compiler, result=result)


In [457]:
dat[dat['pop'] + dat['low_pr'] + dat['high_pr']+ dat["auth"] +dat["exc"] + dat["inc"] >1].to_csv("multiframe_pars.csv")

In [472]:
pop_pred = pd.read_csv("pop_Iteration_3_predictions10239_1e-05_5_4_1_missings_merged.csv")
auth_pred = pd.read_csv("auth_Iteration_4_predictions10239_1e-05_5_4_1_missings_merged.csv")
exc_pred = pd.read_csv("exclusion_Iteration_3_predictions10239_1e-05_5_4_1_missings_merged.csv")
inc_pred = pd.read_csv("inclusion_Iteration_3_predictions10239_1e-05_5_4_1_missings_merged.csv")
high_pr_pred = pd.read_csv("high_pride_Iteration_3_predictions10239_1e-05_5_4_1_missings_merged.csv")
low_pr_pred = pd.read_csv("low_pride_Iteration_3_predictions10239_1e-05_5_4_1_missings_merged.csv")

In [482]:
pred_dat = pop_pred.merge(low_pr_pred[["speech_par_id", "Predictions_prob_1"]], how = "left", on = "speech_par_id")
pred_dat = pred_dat.rename({'Predictions_prob_1_x': 'pop'}, axis=1)
pred_dat = pred_dat.rename({'Predictions_prob_1_y': 'low_pr'}, axis=1)
pred_dat = pred_dat.merge(high_pr_pred[["speech_par_id", "Predictions_prob_1"]], how = "left", on = "speech_par_id")
pred_dat = pred_dat.rename({'Predictions_prob_1': 'high_pr'}, axis=1)
pred_dat = pred_dat.merge(auth_pred[["speech_par_id", "Predictions_prob_1"]], how = "left", on = "speech_par_id")
pred_dat = pred_dat.rename({'Predictions_prob_1': 'auth'}, axis=1)
pred_dat = pred_dat.merge(exc_pred[["speech_par_id", "Predictions_prob_1"]], how = "left", on = "speech_par_id")
pred_dat = pred_dat.rename({'Predictions_prob_1': 'exc'}, axis=1)
pred_dat = pred_dat.merge(inc_pred[["speech_par_id", "Predictions_prob_1"]], how = "left", on = "speech_par_id")
pred_dat = pred_dat.rename({'Predictions_prob_1': 'inc'}, axis=1)


In [485]:
pred_dat[pred_dat['pop'] + pred_dat['low_pr'] + pred_dat['high_pr']+ pred_dat["auth"] + pred_dat["exc"] + pred_dat["inc"] >2].to_csv("multiframes_BERT.csv")