In [24]:
import pandas as pd
import numpy as np
import string
import re
import random 
import os
import csv

from gensim.models.word2vec import Word2Vec

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from imblearn.ensemble import BalancedRandomForestClassifier

import gensim
import gensim.downloader as gensim_api
from numpy import asarray
from numpy import savetxt
from numpy import loadtxt

# Tf-idf RF

In [27]:
auc_scores = []
pr_auc = []

dat = pd.read_csv("20220115_all_paragraphs_2020_added.csv")
annotated = pd.read_csv("All_annotated_data_inclusion.csv")
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")

for filename in os.listdir("/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_inclusion/"):
    ## load the test set 
    file = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_inclusion/" + "/" + filename
    test = pd.read_csv(file)
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])] # make sure annotated data and the original df match
    # pipeline BOW, tfidf and RF
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', RandomForestClassifier(n_estimators=5000, max_depth=3,max_features='sqrt', random_state=0, class_weight="balanced")),
                                                 ])
      ## Fit the model to the training set
    text_clf = text_clf.fit(train['text'], train['pred_class'])
    ## Predict out-of-sample on the test set and compute AUC
    preds = text_clf.predict_proba(test['text'])
    fpr_d2v, tpr_d2v, thresholds_d2v = metrics.roc_curve(test['true_class'], preds[:,1], pos_label=1)
    auc_scores = auc_scores + [metrics.auc(fpr_d2v, tpr_d2v)]
    #PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(test['true_class'],  preds[:,1].tolist())
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc = pr_auc + [lr_auc]
    print("AUC: "+str(metrics.auc(fpr_d2v, tpr_d2v)))
    print("PR AUC: " + str(lr_auc))
    
print("Mean AUC: " + str(np.mean(auc_scores)))
print("Mean PR AUC: " + str(np.mean(pr_auc)))

confusion = confusion_matrix(test['true_class'],text_clf.predict(test['text']) )
print(confusion)

  interactivity=interactivity, compiler=compiler, result=result)


AUC: 0.8008760107816711
PR AUC: 0.3769929657730592
AUC: 0.8194070080862534
PR AUC: 0.4423001129680347
AUC: 0.8432961837217156
PR AUC: 0.34822579834716283
AUC: 0.8692722371967655
PR AUC: 0.3395916838914933
AUC: 0.8365902964959568
PR AUC: 0.2654298378987005
AUC: 0.8700889339187211
PR AUC: 0.38275414391609686
AUC: 0.8176100628930817
PR AUC: 0.2812387616868936
AUC: 0.8460242587601078
PR AUC: 0.3919382258010612
AUC: 0.7820080862533693
PR AUC: 0.3299094982329371
AUC: 0.8190700808625336
PR AUC: 0.31370830668970834
AUC: 0.8605200945626479
PR AUC: 0.34683880123571453
AUC: 0.8222447371383541
PR AUC: 0.43622051156480424
AUC: 0.8490566037735848
PR AUC: 0.3136191888081996
AUC: 0.7574123989218329
PR AUC: 0.34711870937300077
AUC: 0.7756064690026954
PR AUC: 0.2751365413994735
AUC: 0.7687556154537287
PR AUC: 0.26890974772573195
AUC: 0.8548966756513926
PR AUC: 0.40202110023195003
AUC: 0.9156558849955075
PR AUC: 0.46912516237637475
AUC: 0.7930143755615453
PR AUC: 0.24599736419801382
AUC: 0.79997753818508

In [28]:
with open("inclusion_pr_auc_tfidf.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in pr_auc:
        csvwriter.writerow([row])    
    
with open("inclusion_auc_tfidf.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in auc_scores:
        csvwriter.writerow([row])

# Local Word2Vec RF

In [129]:
auc_scores = []
pr_auc = []

dat = pd.read_csv("20220131_all_paragraphs_2020_added_missings_added.csv")
annotated = pd.read_csv("All_annotated_data_round_1_inclusion_20220209.csv")
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
avg_embeddings = loadtxt('avg_embeddings.csv', delimiter=',')

for filename in os.listdir("/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_inclusion/"):
    ## load the test set 
    file = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_inclusion/" + "/" + filename
    test = pd.read_csv(file)
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])] # make sure annotated data and the original df match
    # attach train/test set to original df
    train_id = train['speech_par_id']
    test_id = test['speech_par_id']    
    dat['test'] = np.where(dat['speech_par_id'].isin(test_id), 1, 0)
    dat['train'] = np.where(dat['speech_par_id'].isin(train_id), 1, 0)
    # get the embeddings to train/test set
    train_vec = np.asarray([avg_embeddings[i] for i in dat[dat['speech_par_id'].isin(train_id)].index.tolist()])
    test_vec = np.asarray([avg_embeddings[i] for i in dat[dat['speech_par_id'].isin(test_id)].index.tolist()])
    ## Fit the model to the training set
    text_clf = RandomForestClassifier(n_estimators=5000, max_depth=10, random_state=0, max_features = 0.2, class_weight="balanced")
    text_clf.fit(train_vec,  dat[dat['train']==1].pred_class) # use the original df to match the order
    ## Predict out-of-sample on the test set and compute AUC
    preds = text_clf.predict_proba(test_vec)
    fpr_d2v, tpr_d2v, thresholds_d2v = metrics.roc_curve(dat[dat['test']==1].pred_class, preds[:,1], pos_label=1)
    auc_scores = auc_scores + [metrics.auc(fpr_d2v, tpr_d2v)]
    #PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(dat[dat['test']==1].pred_class,  preds[:,1].tolist())
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc = pr_auc + [lr_auc]
    print("AUC: "+str(metrics.auc(fpr_d2v, tpr_d2v)))
    print("PR AUC: " + str(lr_auc))
    
print("Mean AUC: " + str(np.mean(auc_scores)))
print("Mean PR AUC: " + str(np.mean(pr_auc)))

confusion = confusion_matrix(dat[dat['test']==1].pred_class,text_clf.predict(test_vec) )
print(confusion)

  interactivity=interactivity, compiler=compiler, result=result)


AUC: 0.6692558820218395
PR AUC: 0.08835575939315282
AUC: 0.7770664869721473
PR AUC: 0.14093415898329587
AUC: 0.7492138364779874
PR AUC: 0.19642432709541502
AUC: 0.6686834733893557
PR AUC: 0.126081337609296
AUC: 0.6676750700280112
PR AUC: 0.10438285687734145
AUC: 0.7645797598627787
PR AUC: 0.1535680754926754
AUC: 0.7619815173006662
PR AUC: 0.11619906060024518
AUC: 0.5785207700101317
PR AUC: 0.08032107636866795
AUC: 0.76258020938872
PR AUC: 0.16309325601911492
AUC: 0.6670028011204482
PR AUC: 0.20604149006221517
AUC: 0.7413522012578617
PR AUC: 0.15093338333603548
AUC: 0.701707097933513
PR AUC: 0.1528667389561688
AUC: 0.521344537815126
PR AUC: 0.08596608329849231
AUC: 0.7509648370497428
PR AUC: 0.12541543027869212
AUC: 0.7574690567648314
PR AUC: 0.10932070052897928
AUC: 0.7625668449197861
PR AUC: 0.14144319116966866
AUC: 0.6786273194723899
PR AUC: 0.12810965515660594
AUC: 0.7306828391734052
PR AUC: 0.1562987203493045
AUC: 0.8515537670467248
PR AUC: 0.21218947550572068
AUC: 0.73347593582887

In [130]:
with open("inclusion_pr_auc_w2v.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in pr_auc:
        csvwriter.writerow([row])    
    
with open("inclusion_auc_w2v.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in auc_scores:
        csvwriter.writerow([row])

# Pretrained Word2Vec


In [131]:
auc_scores = []
pr_auc = []

dat = pd.read_csv("20220131_all_paragraphs_2020_added_missings_added.csv")
annotated = pd.read_csv("All_annotated_data_round_1_inclusion_20220209.csv")
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
avg_embeddings = loadtxt('avg_embeddings_pretrained.csv', delimiter=',')

for filename in os.listdir("/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_inclusion/"):
    ## load the test set 
    file = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_inclusion/" + "/" + filename
    test = pd.read_csv(file)
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])] # make sure annotated data and the original df match
# attach train/test set to original df
    train_id = train['speech_par_id']
    test_id = test['speech_par_id']    
    dat['test'] = np.where(dat['speech_par_id'].isin(test_id), 1, 0)
    dat['train'] = np.where(dat['speech_par_id'].isin(train_id), 1, 0)
    # get the embeddings to train/test set
    train_vec = np.asarray([avg_embeddings[i] for i in dat[dat['speech_par_id'].isin(train_id)].index.tolist()])
    test_vec = np.asarray([avg_embeddings[i] for i in dat[dat['speech_par_id'].isin(test_id)].index.tolist()])
    ## Fit the model to the training set
    text_clf = RandomForestClassifier(n_estimators=5000, max_depth=10, random_state=0, max_features = "sqrt", class_weight="balanced")
    text_clf.fit(train_vec,  dat[dat['train']==1].pred_class) # use the original df to match the order
    ## Predict out-of-sample on the test set and compute AUC
    preds = text_clf.predict_proba(test_vec)
    fpr_d2v, tpr_d2v, thresholds_d2v = metrics.roc_curve(dat[dat['test']==1].pred_class, preds[:,1], pos_label=1)
    auc_scores = auc_scores + [metrics.auc(fpr_d2v, tpr_d2v)]
    #PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(dat[dat['test']==1].pred_class,  preds[:,1].tolist())
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc = pr_auc + [lr_auc]
    print("AUC: "+str(metrics.auc(fpr_d2v, tpr_d2v)))
    print("PR AUC: " + str(lr_auc))
    
print("Mean AUC: " + str(np.mean(auc_scores)))
print("Mean PR AUC: " + str(np.mean(pr_auc)))

confusion = confusion_matrix(dat[dat['test']==1].pred_class,text_clf.predict(test_vec) )
print(confusion)

  interactivity=interactivity, compiler=compiler, result=result)


AUC: 0.6637397275695148
PR AUC: 0.08615386104540723
AUC: 0.7759433962264151
PR AUC: 0.20542096200643092
AUC: 0.7288858939802336
PR AUC: 0.24590209849945247
AUC: 0.6886274509803921
PR AUC: 0.1474433526939402
AUC: 0.6804481792717086
PR AUC: 0.18419200431771193
AUC: 0.7480703259005146
PR AUC: 0.16501373794986982
AUC: 0.708575112830432
PR AUC: 0.13712901679071246
AUC: 0.5725543172351683
PR AUC: 0.15288261273322176
AUC: 0.703253405381065
PR AUC: 0.09610424827566443
AUC: 0.6431372549019608
PR AUC: 0.29340541948044613
AUC: 0.6851976639712488
PR AUC: 0.1537888440145659
AUC: 0.6115229110512129
PR AUC: 0.07494482884463063
AUC: 0.5845378151260503
PR AUC: 0.05847761815393647
AUC: 0.6740994854202401
PR AUC: 0.11914535176395663
AUC: 0.7373026034997866
PR AUC: 0.13217424365986336
AUC: 0.7188235294117648
PR AUC: 0.10872932471306232
AUC: 0.6825396825396826
PR AUC: 0.12440212854308672
AUC: 0.6331985624438454
PR AUC: 0.18370575610973305
AUC: 0.8057232282584396
PR AUC: 0.14675381414223052
AUC: 0.809411764

In [132]:
with open("inclusion_pr_auc_pretrained.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in pr_auc:
        csvwriter.writerow([row])    
    
with open("inclusion_auc_pretrained.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in auc_scores:
        csvwriter.writerow([row])

# RFC

In [133]:
auc_scores_d2v = []
pr_auc = []
accuracy_scores_d2v = []

dat = pd.read_csv("20201115_all_paragraphs.csv")
dat['speech_par_id'] = dat['Speech_id'].astype(str) + "_" + dat['par_id'].astype(str)

model = Word2Vec.load("doc2vec_wordvecs.model") 
annotated = pd.read_csv("All_annotated_data_round_1_inclusion_20220209.csv")
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")

for filename in os.listdir("/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_inclusion/"):
    ## load the test set 
    file = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_inclusion/" + "/" + filename
    test = pd.read_csv(file)
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])] # make sure annotated data and the original df match
    # merge test and train sets back onto the total df
    dat['test'] = np.where(dat['speech_par_id'].isin(test["speech_par_id"]), 1, 0)
    dat['train'] = np.where(dat['speech_par_id'].isin(train["speech_par_id"]), 1, 0)
    # so that teh order of rows are the same with the embeddings
    test_set = np.asarray([model.docvecs[i] for i in dat[dat['test'] == 1].index.tolist()])
    train_set = np.asarray([model.docvecs[i] for i in dat[dat['train']== 1].index.tolist()])
    ## Initialize a random forest classifier
    gbc = RandomForestClassifier(n_estimators=500, max_depth=10, random_state=0, max_features = "sqrt", class_weight="balanced")
    ## Fit the model to the training set
    gbc.fit(train_set, dat[dat['train']==1].pred_class)
    ## Predict out-of-sample on the test set and compute AUC
    preds = gbc.predict_proba(test_set)
    fpr_d2v, tpr_d2v, thresholds_d2v = metrics.roc_curve(dat[dat['test']==1].pred_class, preds[:,1], pos_label=1)
    auc_scores_d2v = auc_scores_d2v + [metrics.auc(fpr_d2v, tpr_d2v)]
    #PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(dat[dat['test']==1].pred_class,  preds[:,1].tolist())
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc = pr_auc + [lr_auc]
    print("AUC: "+str(metrics.auc(fpr_d2v, tpr_d2v)))
    accuracy_d2v = metrics.accuracy_score(dat[dat['test']==1].pred_class, gbc.predict(test_set))
    accuracy_scores_d2v = accuracy_scores_d2v + [accuracy_d2v]
    print("Accuracy: " + str(accuracy_d2v))
    print("PR_AUC: " + str(lr_auc))

print("Mean AUC: " + str(np.mean(auc_scores_d2v)))
print("Mean Accuracy: " + str(np.mean(accuracy_scores_d2v)))

confusion = confusion_matrix(dat[dat['test']==1].pred_class, gbc.predict(test_set))
print(confusion)

AUC: 0.7757514353259034
Accuracy: 0.9527027027027027
PR_AUC: 0.1381569452361355
AUC: 0.762915543575921
Accuracy: 0.952808988764045
PR_AUC: 0.1589216940398971
AUC: 0.7470799640610961
Accuracy: 0.952808988764045
PR_AUC: 0.19291021921282245
AUC: 0.7892436974789916
Accuracy: 0.952914798206278
PR_AUC: 0.23916942696900031
AUC: 0.6732773109243697
Accuracy: 0.952914798206278
PR_AUC: 0.15138945534655146
AUC: 0.7162307032590052
Accuracy: 0.9506726457399103
PR_AUC: 0.12315999166886088
AUC: 0.7295293359123146
Accuracy: 0.950561797752809
PR_AUC: 0.09889346703706488
AUC: 0.7066306428008555
Accuracy: 0.9527027027027027
PR_AUC: 0.12702033388004305
AUC: 0.7893729595857255
Accuracy: 0.9527027027027027
PR_AUC: 0.13986321264331655
AUC: 0.6994957983193277
Accuracy: 0.952914798206278
PR_AUC: 0.13389499152107892
AUC: 0.798292902066487
Accuracy: 0.952808988764045
PR_AUC: 0.21966865073839412
AUC: 0.7046043003489812
Accuracy: 0.9527027027027027
PR_AUC: 0.10486898147303236
AUC: 0.5930532212885153
Accuracy: 0.952

In [134]:
with open("inclusion_pr_auc_d2v.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in pr_auc:
        csvwriter.writerow([row])
    
    
with open("inclusion_auc_d2v.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in auc_scores_d2v:
        csvwriter.writerow([row])

In [135]:
print("Mean ROC-AUC: " + str(np.mean(auc_scores_d2v)))
print("Mean Accuracy: " + str(np.mean(accuracy_scores_d2v)))
print("Mean PR_AUC: " + str(np.mean(pr_auc)))


Mean ROC-AUC: 0.7381259827826955
Mean Accuracy: 0.952155519103859
Mean PR_AUC: 0.15170157347425703


# balanced RFC

In [136]:
auc_scores_balanced = []
pr_auc_balanced = []
accuracy_scores_balanced = []

dat = pd.read_csv("20201115_all_paragraphs.csv")
dat['speech_par_id'] = dat['Speech_id'].astype(str) + "_" + dat['par_id'].astype(str)
# X = np.asarray([model.docvecs[i] for i in label_dat.index.tolist()])
# Y = np.asarray(label_dat['label'].tolist(), dtype="int")
model = Word2Vec.load("doc2vec_wordvecs.model") 
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how = "left", on = "speech_par_id")
annotated = pd.read_csv("All_annotated_data_round_1_inclusion_20220209.csv")


for filename in os.listdir("/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_inclusion/"):
    ## load the test set 
    file = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_inclusion/" + "/" + filename
    test = pd.read_csv(file)
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])] # make sure annotated data and the original df match
    # merge test and train sets back onto the total df
    dat['test'] = np.where(dat['speech_par_id'].isin(test["speech_par_id"]), 1, 0)
    dat['train'] = np.where(dat['speech_par_id'].isin(train["speech_par_id"]), 1, 0)
    # so that teh order of rows are the same with the embeddings
    test_set = np.asarray([model.docvecs[i] for i in dat[dat['test'] == 1].index.tolist()])
    train_set = np.asarray([model.docvecs[i] for i in dat[dat['train']== 1].index.tolist()])
    ## Initialize a random forest classifier
    brfc = BalancedRandomForestClassifier(n_estimators=5000, max_depth=3, random_state=0, max_features = "sqrt")
    ## Fit the model to the training set
    brfc.fit(train_set, dat[dat['train']==1].pred_class)
    ## Predict out-of-sample on the test set and compute AUC
    preds = brfc.predict_proba(test_set)
    fpr, tpr, thresholds = metrics.roc_curve(dat[dat['test']==1].pred_class, preds[:,1], pos_label=1)
    auc_scores_balanced = auc_scores_balanced + [metrics.auc(fpr, tpr)]
    #PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(dat[dat['test']==1].pred_class,  preds[:,1].tolist())
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc_balanced = pr_auc_balanced + [lr_auc]
    print("AUC: "+str(metrics.auc(fpr, tpr)))
    accuracy_balanced = metrics.accuracy_score(dat[dat['test']==1].pred_class, brfc.predict(test_set))
    accuracy_scores_balanced = accuracy_scores_balanced + [accuracy_balanced]
    print("Accuracy: " + str(accuracy_balanced))
    print("PR_AUC: " + str(lr_auc))

print("Mean AUC: " + str(np.mean(auc_scores_balanced)))
print("Mean Accuracy: " + str(np.mean(accuracy_scores_balanced)))

AUC: 0.7368006304176516
Accuracy: 0.7364864864864865
PR_AUC: 0.10946197371301054
AUC: 0.7926774483378256
Accuracy: 0.7146067415730337
PR_AUC: 0.21245099477727236
AUC: 0.761455525606469
Accuracy: 0.750561797752809
PR_AUC: 0.24573932862484185
AUC: 0.7759103641456582
Accuracy: 0.7219730941704036
PR_AUC: 0.17982567705172448
AUC: 0.6899719887955182
Accuracy: 0.6860986547085202
PR_AUC: 0.16593029797321024
AUC: 0.7749785591766724
Accuracy: 0.7107623318385651
PR_AUC: 0.17328563813972384
AUC: 0.7303889963464432
Accuracy: 0.7191011235955056
PR_AUC: 0.1288904752354525
AUC: 0.6697061803444782
Accuracy: 0.7252252252252253
PR_AUC: 0.1267287972071411
AUC: 0.7981537768771811
Accuracy: 0.7364864864864865
PR_AUC: 0.15972674685399302
AUC: 0.7247058823529411
Accuracy: 0.7354260089686099
PR_AUC: 0.27719100662275503
AUC: 0.7955974842767297
Accuracy: 0.7213483146067415
PR_AUC: 0.22564895013186698
AUC: 0.713272543059777
Accuracy: 0.740990990990991
PR_AUC: 0.10572932722377369
AUC: 0.630812324929972
Accuracy: 0

In [138]:
with open("inclusion_pr_auc_balanced.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in pr_auc_balanced:
        csvwriter.writerow([row])    
    
with open("inclusion_auc_balanced.csv", "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in auc_scores_balanced:
        csvwriter.writerow([row])

In [137]:
print("Mean ROC-AUC: " + str(np.mean(auc_scores_balanced)))
print("Mean Accuracy: " + str(np.mean(accuracy_scores_balanced)))
print("Mean PR_AUC: " + str(np.mean(pr_auc_balanced)))


Mean ROC-AUC: 0.7518112694595251
Mean Accuracy: 0.723710646130607
Mean PR_AUC: 0.17996602265290093


In [126]:
## check the annotation data
dat = pd.read_csv("20220131_all_paragraphs_2020_added_missings_added.csv")
not_aligned = pd.DataFrame(columns = dat.columns)
for id in annotated['speech_par_id']:
    if id in dat['speech_par_id'].tolist():
        if dat.loc[dat['speech_par_id']== id,'text'].values[0] != annotated.loc[annotated["speech_par_id"] == id, 'text'].values[0]:
            print(dat[dat['speech_par_id']== id])
            print(annotated[annotated["speech_par_id"] == id])
            not_aligned = not_aligned.append(dat[dat['speech_par_id']== id])

  interactivity=interactivity, compiler=compiler, result=result)


       Unnamed: 0 Speech_id  \
36012       36013      1092   

                                                    text party    term   comp  \
36012  I don't know how thick your hide is.  But it's...   rep  1952.0  False   

      populist_old_keywords  par_id speech_par_id  
36012                 False      14       1092_14  
                                                 text  pred_class  \
38  I don't know how thick your hide is.  But it's...           0   

   speech_par_id speech_par_id_old  
38       1092_14           1092_14  
       Unnamed: 0 Speech_id  \
25704       25705      1108   

                                                    text party    term  comp  \
25704  Mr. Chairman, ladies and gentlemen:  If I cann...   rep  1952.0  True   

      populist_old_keywords  par_id speech_par_id  
25704                 False       0        1108_0  
                                                 text  pred_class  \
45  Mr. Chairman, ladies and gentlemen:  If I cann...       

       Unnamed: 0 Speech_id  \
42053       42054      1424   

                                                    text party    term  comp  \
42053  Now, permit me two observations. First, I offe...   rep  1956.0  True   

      populist_old_keywords  par_id speech_par_id  
42053                  True       5        1424_5  
                                                  text  pred_class  \
192  Now, permit me two observations. First, I offe...           0   

    speech_par_id speech_par_id_old  
192        1424_5            2952_9  
      Unnamed: 0 Speech_id                                               text  \
2236        2237      1605  The only war that would make less sense would ...   

     party    term   comp populist_old_keywords  par_id speech_par_id  
2236   dem  1960.0  False                  True      16       1605_16  
                                                  text  pred_class  \
237  The only war that would make less sense would ...           0   

    spe

       Unnamed: 0 Speech_id  \
34005       34006      1906   

                                                    text party    term   comp  \
34005  The statue atop the State House of Rhode Islan...   dem  1964.0  False   

      populist_old_keywords  par_id speech_par_id  
34005                 False      16       1906_16  
                                                  text  pred_class  \
384  The statue atop the State House of Rhode Islan...           1   

    speech_par_id speech_par_id_old  
384       1906_16           1906_16  
       Unnamed: 0 Speech_id  \
32675       32676      1907   

                                                    text party    term   comp  \
32675  For 175 years, through more than forty nationa...   dem  1964.0  False   

      populist_old_keywords  par_id speech_par_id  
32675                  True      16       1907_16  
                                                  text  pred_class  \
386  For 175 years, through more than forty nationa..

       Unnamed: 0                          Speech_id  \
59708       59709  2004-09-17-victory-2004-reception   

                                                    text party    term   comp  \
59708  And secondly, we've heard the rhetoric before,...   rep  2004.0  False   

      populist_old_keywords  par_id                         speech_par_id  
59708                  True      21  2004-09-17-victory-2004-reception_21  
                                                  text  pred_class  \
540  And secondly, we've heard the rhetoric before,...           0   

                            speech_par_id speech_par_id_old  
540  2004-09-17-victory-2004-reception_21            1853_7  
       Unnamed: 0                             Speech_id  \
61378       61379  2004-09-22-king-prussia-pennsylvania   

                                                    text party    term   comp  \
61378  You want this economy to continue to grow, we'...   rep  2004.0  False   

      populist_old_keywor

       Unnamed: 0                      Speech_id  \
60030       60031  2004-10-28-saginaw-michigan-0   

                                                    text party    term   comp  \
60030  Several times during the course of this campai...   rep  2004.0  False   

      populist_old_keywords  par_id                     speech_par_id  
60030                 False      22  2004-10-28-saginaw-michigan-0_22  
                                                  text  pred_class  \
712  Several times during the course of this campai...           0   

                        speech_par_id                 speech_par_id_old  
712  2004-10-28-saginaw-michigan-0_22  2004-10-28-saginaw-michigan-0_22  
       Unnamed: 0                      Speech_id  \
60054       60055  2004-10-28-saginaw-michigan-0   

                                                    text party    term   comp  \
60054  When I ran—when I was running for President 4 ...   rep  2004.0  False   

      populist_old_keywords  pa

       Unnamed: 0                         Speech_id  \
57501       57502  2012-10-05-st-petersburg-florida   

                                                    text party    term  comp  \
57501  I asked him, for instance, why with 23 million...   rep  2012.0  True   

      populist_old_keywords  par_id                       speech_par_id  
57501                 False       7  2012-10-05-st-petersburg-florida_7  
                                                  text  pred_class  \
916  I asked him, for instance, why with 23 million...           0   

                          speech_par_id                    speech_par_id_old  
916  2012-10-05-st-petersburg-florida_7  2016-10-25-coconut-creek-florida_39  
       Unnamed: 0                        Speech_id  \
56826       56827  2012-10-09-event-cuyahoga-falls   

                                                    text party    term   comp  \
56826  And number five — and number five, we're going...   rep  2012.0  False   

      pop

       Unnamed: 0 Speech_id  \
40485       40486      2073   

                                                    text party    term   comp  \
40485  I am sorry.  I may have hurt his pitching arm....   dem  1968.0  False   

      populist_old_keywords  par_id speech_par_id  
40485                 False      16       2073_16  
                                                   text  pred_class  \
1159  I am sorry.  I may have hurt his pitching arm....           0   

     speech_par_id speech_par_id_old  
1159       2073_16           2073_16  
       Unnamed: 0 Speech_id  \
46475       46476      2075   

                                                    text party    term  comp  \
46475  Now that the Detroit Tigers won the World Seri...   dem  1968.0  True   

      populist_old_keywords  par_id speech_par_id  
46475                 False      21       2075_21  
                                                   text  pred_class  \
1161  Now that the Detroit Tigers won the World Se

       Unnamed: 0 Speech_id  \
24833       24834      2197   

                                                    text party    term   comp  \
24833  I cannot presume to explain why such peculiar ...   rep  1968.0  False   

      populist_old_keywords  par_id speech_par_id  
24833                 False      17       2197_17  
                                                   text  pred_class  \
1284  I cannot presume to explain why such peculiar ...           0   

     speech_par_id speech_par_id_old  
1284       2197_17            2771_6  
       Unnamed: 0 Speech_id  \
20990       20991      2221   

                                                    text party    term   comp  \
20990  Not only have the grain exporters apparently b...   dem  1972.0  False   

      populist_old_keywords  par_id speech_par_id  
20990                 False       9        2221_9  
                                                   text  pred_class  \
1294  Not only have the grain exporters apparent

      Unnamed: 0 Speech_id                                               text  \
4900        4901      2737  A few days later, Dr. Giordano wrote an articl...   

     party    term   comp populist_old_keywords  par_id speech_par_id  
4900   dem  1984.0  False                 False      32       2737_32  
                                                   text  pred_class  \
1573  A few days later, Dr. Giordano wrote an articl...           0   

     speech_par_id speech_par_id_old  
1573       2737_32            1427_3  
      Unnamed: 0 Speech_id                                               text  \
4904        4905      2737  By the end of this decade, I want to point to ...   

     party    term  comp populist_old_keywords  par_id speech_par_id  
4904   dem  1984.0  True                 False      36       2737_36  
                                                   text  pred_class  \
1574  By the end of this decade, I want to point to ...           1   

     speech_par_id speec

       Unnamed: 0 Speech_id  \
11436       11437      2902   

                                                    text party    term  comp  \
11436  """"It is not a disgrace not to reach the star...   dem  1988.0  True   

      populist_old_keywords  par_id speech_par_id  
11436                 False       9        2902_9  
                                                   text  pred_class  \
1719  ""It is not a disgrace not to reach the stars,...           0   

     speech_par_id speech_par_id_old  
1719        2902_9            2902_9  
       Unnamed: 0 Speech_id  \
30499       30500      2952   

                                                    text party    term  comp  \
30499  Deep differences on values--my opponent vetoed...   rep  1988.0  True   

      populist_old_keywords  par_id speech_par_id  
30499                 False       9        2952_9  
                                                   text  pred_class  \
1756  Deep differences on values--my opponent vetoed

       Unnamed: 0 Speech_id  \
49099       49100      3143   

                                                    text party    term   comp  \
49099  I have laid out this agenda for America's rene...   rep  1992.0  False   

      populist_old_keywords  par_id speech_par_id  
49099                 False       9        3143_9  
                                                   text  pred_class  \
1920  I have laid out this agenda for America's rene...           0   

     speech_par_id speech_par_id_old  
1920        3143_9            3143_9  
       Unnamed: 0 Speech_id  \
50263       50264      3156   

                                                    text party    term   comp  \
50263  Governor Clinton is talking about """"Well, we...   rep  1992.0  False   

      populist_old_keywords  par_id speech_par_id  
50263                 False      11       3156_11  
                                                   text  pred_class  \
1921  Governor Clinton is talking about ""Well, 

In [127]:
not_aligned = not_aligned.merge(dat[["speech_par_id", "text"]], how = "left", on = "speech_par_id")
#not_aligned_df = pd.DataFrame(not_aligned)
not_aligned

Unnamed: 0.1,Unnamed: 0,Speech_id,text_x,party,term,comp,populist_old_keywords,par_id,speech_par_id,text_y
0,36013,1092,I don't know how thick your hide is. But it's...,rep,1952.0,False,False,14,1092_14,I don't know how thick your hide is. But it's...
1,25705,1108,"Mr. Chairman, ladies and gentlemen: If I cann...",rep,1952.0,True,False,0,1108_0,"Mr. Chairman, ladies and gentlemen: If I cann..."
2,7064,1170,"Now, that is the kind of recommendation that i...",rep,1952.0,False,False,2,1170_2,"Now, that is the kind of recommendation that i..."
3,42243,1183,Ladies and gentlemen: there is not much relati...,rep,1952.0,False,False,0,1183_0,Ladies and gentlemen: there is not much relati...
4,40295,1234,"Governor Barrett, I see there are some people ...",rep,1952.0,True,False,0,1234_0,"Governor Barrett, I see there are some people ..."
...,...,...,...,...,...,...,...,...,...,...
160,81,906,There is the federal-state-local health progra...,dem,1952.0,True,False,9,906_9,There is the federal-state-local health progra...
161,44909,921,"Well, I don't deserve any of the things you ar...",dem,1952.0,True,False,2,921_2,"Well, I don't deserve any of the things you ar..."
162,25264,944,3. By retraining men who are replaced by machi...,dem,1952.0,True,False,18,944_18,3. By retraining men who are replaced by machi...
163,18987,977,My opponent said the other day that he is one ...,dem,1952.0,False,False,11,977_11,My opponent said the other day that he is one ...


In [124]:
not_aligned[not_aligned['text_x'] == not_aligned['text_y']]


Unnamed: 0.1,Unnamed: 0,Speech_id,text_x,party,term,comp,populist_old_keywords,par_id,speech_par_id,text_y
0,36013,1092,I don't know how thick your hide is. But it's...,rep,1952.0,False,False,14,1092_14,I don't know how thick your hide is. But it's...
1,25705,1108,"Mr. Chairman, ladies and gentlemen: If I cann...",rep,1952.0,True,False,0,1108_0,"Mr. Chairman, ladies and gentlemen: If I cann..."
2,7064,1170,"Now, that is the kind of recommendation that i...",rep,1952.0,False,False,2,1170_2,"Now, that is the kind of recommendation that i..."
3,42243,1183,Ladies and gentlemen: there is not much relati...,rep,1952.0,False,False,0,1183_0,Ladies and gentlemen: there is not much relati...
4,40295,1234,"Governor Barrett, I see there are some people ...",rep,1952.0,True,False,0,1234_0,"Governor Barrett, I see there are some people ..."
...,...,...,...,...,...,...,...,...,...,...
160,81,906,There is the federal-state-local health progra...,dem,1952.0,True,False,9,906_9,There is the federal-state-local health progra...
161,44909,921,"Well, I don't deserve any of the things you ar...",dem,1952.0,True,False,2,921_2,"Well, I don't deserve any of the things you ar..."
162,25264,944,3. By retraining men who are replaced by machi...,dem,1952.0,True,False,18,944_18,3. By retraining men who are replaced by machi...
163,18987,977,My opponent said the other day that he is one ...,dem,1952.0,False,False,11,977_11,My opponent said the other day that he is one ...


In [128]:
not_aligned = []
for filename in os.listdir("/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_inclusion/"):
    ## load the test set 
    file = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_inclusion/" + "/" + filename
    test = pd.read_csv(file)
    for id in test['speech_par_id']:
        if id in dat['speech_par_id']:
            if dat[dat['speech_par_id']== id]['text'].values[0] != test[test["speech_par_id"] == id]['text'].values[0]:
                print(dat[dat['speech_par_id']== id])
                print(test[test["speech_par_id"] == id])
                not_aligned.append(dat[dat['speech_par_id']== id])

In [87]:
dat.loc[dat['speech_par_id']== annotated["speech_par_id"][1],'text'].values[0]

'We are opposing Communism abroad, where its relentless pressure seeks further to narrow the area of freedom. We are opposing it at home where its agents and converts seek to undermine our society and corrupt our government. As I have repeatedly said, the federal government must use all its resources to expose and identify Communistic activity, to keep Communists out of places of responsibility in our society, and to protect our institutions from Communist espionage, sabotage and subversion. '

In [81]:
missing_ids = []
for id in annotated['speech_par_id']:
    if id not in dat['speech_par_id'].tolist():
        missing_ids = missing_ids + [id]
        
len(missing_ids)

1

In [71]:
dat = pd.read_csv("20220131_all_paragraphs_2020_added_missings_added.csv")

In [90]:
dat[dat['speech_par_id']== "1009_30"]['text'].values[0] != annotated[annotated["speech_par_id"] == "1009_30"]['text'].values[0]

False

In [75]:
dat[dat["speech_par_id"] == "1009_30"]

Unnamed: 0.1,Unnamed: 0,Speech_id,text,party,term,comp,populist_old_keywords,par_id,speech_par_id
43817,43818,1009,"We are opposing Communism abroad, where its re...",dem,1952.0,False,False,30,1009_30


In [80]:
annotated['speech_par_id'][1] in dat['speech_par_id'].tolist()

True

In [78]:
annotated['speech_par_id'][1] 

'1009_30'