In [2]:
import csv
import os
import random
import re
import string

import numpy as np
import pandas as pd

from gensim.models.word2vec import Word2Vec

from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline

from imblearn.ensemble import BalancedRandomForestClassifier

# Tf-idf RF


In [None]:
# Tf-idf random forest: for every pre-built test split, fit a
# CountVectorizer -> TfidfTransformer -> RandomForestClassifier pipeline on the
# annotated paragraphs that are NOT in that split, then score the split.

# Directory holding one CSV per held-out test split.
TEST_DIR = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_high_pride/"

auc_scores = []  # ROC AUC, one entry per test split
pr_auc = []      # area under the precision-recall curve, one entry per test split

dat = pd.read_csv("20220115_all_paragraphs_2020_added.csv")
annotated = pd.read_csv("All_annotated_data_exclusion_LO_recodedd.csv")
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how="left", on="speech_par_id")

# os.listdir() returns names in arbitrary order and also lists hidden entries
# (.DS_Store, .ipynb_checkpoints, ...). Sorting makes the order of the per-split
# scores reproducible; skipping hidden/non-file entries keeps read_csv from
# failing on something that is not a test split.
test_files = sorted(
    name for name in os.listdir(TEST_DIR)
    if not name.startswith(".") and os.path.isfile(os.path.join(TEST_DIR, name))
)

for filename in test_files:
    ## load the test set
    file = os.path.join(TEST_DIR, filename)
    test = pd.read_csv(file)
    # everything annotated that is not part of this test split is training data
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    # make sure annotated data and the original df match
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])]

    # pipeline BOW, tfidf and RF
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(n_estimators=500, max_depth=3, max_features='sqrt',
                                       random_state=0, class_weight="balanced")),
    ])
    ## Fit the model to the training set
    text_clf = text_clf.fit(train['text'], train['pred_class'])

    ## Predict out-of-sample on the test set and compute AUC
    preds = text_clf.predict_proba(test['text'])
    fpr_d2v, tpr_d2v, thresholds_d2v = metrics.roc_curve(test['true_class'], preds[:, 1], pos_label=1)
    roc_auc = metrics.auc(fpr_d2v, tpr_d2v)  # computed once, reused for the list and the printout
    auc_scores.append(roc_auc)

    # PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(test['true_class'], preds[:, 1])
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc.append(lr_auc)

    print("AUC: " + str(roc_auc))
    print("PR AUC: " + str(lr_auc))

print("Mean AUC: " + str(np.mean(auc_scores)))
print("Mean PR AUC: " + str(np.mean(pr_auc)))

# NOTE: `test` and `text_clf` still refer to the LAST split processed above, so
# this confusion matrix describes that single split only, not all splits.
confusion = confusion_matrix(test['true_class'], text_clf.predict(test['text']))
print(confusion)

In [None]:
# Save the per-split scores, one value per CSV row.
# The csv module expects its file object to be opened with newline=""; without
# it, platforms that translate "\n" to "\r\n" write "\r\r\n" row endings, which
# show up as blank rows between records.
with open("high_pride_pr_auc_tfidf.csv", "w", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows([score] for score in pr_auc)

with open("high_pride_auc_tfidf.csv", "w", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows([score] for score in auc_scores)

# Doc2Vec RF

In [3]:
# Doc2Vec random forest: for every pre-built test split, fit a random forest on
# the document vectors of the annotated paragraphs that are NOT in that split,
# then score the split.

# Directory holding one CSV per held-out test split.
TEST_DIR = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_high_pride/"

auc_scores_d2v = []       # ROC AUC, one entry per test split
pr_auc = []               # area under the precision-recall curve, one entry per test split
accuracy_scores_d2v = []  # accuracy of the hard predictions, one entry per test split

dat = pd.read_csv("20220115_all_paragraphs_2020_added.csv")
model = Word2Vec.load("doc2vec_wordvecs.model")
annotated = pd.read_csv("All_annotated_data_round_1_high_pride.csv")
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how="left", on="speech_par_id")
# NOTE(review): the row index of `dat` is used below as the key into
# model.docvecs. That presumes the merge above neither adds nor reorders rows
# (i.e. `speech_par_id` is unique in `annotated`) -- TODO confirm.

# os.listdir() returns names in arbitrary order and also lists hidden entries
# (.DS_Store, .ipynb_checkpoints, ...). Sorting makes the order of the per-split
# scores reproducible; skipping hidden/non-file entries keeps read_csv from
# failing on something that is not a test split.
test_files = sorted(
    name for name in os.listdir(TEST_DIR)
    if not name.startswith(".") and os.path.isfile(os.path.join(TEST_DIR, name))
)

for filename in test_files:
    ## load the test set
    file = os.path.join(TEST_DIR, filename)
    test = pd.read_csv(file)
    # everything annotated that is not part of this test split is training data
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    # make sure annotated data and the original df match
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])]

    # flag the test and train rows on the full df so that the order of the
    # selected rows is the same as the order of the embeddings
    dat['test'] = np.where(dat['speech_par_id'].isin(test["speech_par_id"]), 1, 0)
    dat['train'] = np.where(dat['speech_par_id'].isin(train["speech_par_id"]), 1, 0)
    test_rows = dat[dat['test'] == 1]
    train_rows = dat[dat['train'] == 1]
    test_set = np.asarray([model.docvecs[i] for i in test_rows.index.tolist()])
    train_set = np.asarray([model.docvecs[i] for i in train_rows.index.tolist()])

    ## Initialize a random forest classifier (the name `gbc` is historical)
    gbc = RandomForestClassifier(n_estimators=500, max_depth=3, random_state=0,
                                 max_features="sqrt", class_weight="balanced")
    ## Fit the model to the training set
    gbc.fit(train_set, train_rows.pred_class)

    ## Predict out-of-sample on the test set and compute AUC
    preds = gbc.predict_proba(test_set)
    test_labels = test_rows.pred_class
    test_pred = gbc.predict(test_set)  # hard labels, reused for accuracy and the confusion matrix
    fpr_d2v, tpr_d2v, thresholds_d2v = metrics.roc_curve(test_labels, preds[:, 1], pos_label=1)
    roc_auc = metrics.auc(fpr_d2v, tpr_d2v)
    auc_scores_d2v.append(roc_auc)

    # PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(test_labels, preds[:, 1])
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc.append(lr_auc)

    print("AUC: " + str(roc_auc))
    accuracy_d2v = metrics.accuracy_score(test_labels, test_pred)
    accuracy_scores_d2v.append(accuracy_d2v)
    print("Accuracy: " + str(accuracy_d2v))
    print("PR_AUC: " + str(lr_auc))

print("Mean AUC: " + str(np.mean(auc_scores_d2v)))
print("Mean Accuracy: " + str(np.mean(accuracy_scores_d2v)))

# NOTE: `test_labels` and `test_pred` still hold the LAST split processed above,
# so this confusion matrix describes that single split only, not all splits.
confusion = confusion_matrix(test_labels, test_pred)
print(confusion)

  interactivity=interactivity, compiler=compiler, result=result)


AUC: 0.7762988407041649
Accuracy: 0.9078651685393259
PR_AUC: 0.3284236937986967
AUC: 0.737655646200086
Accuracy: 0.9078651685393259
PR_AUC: 0.32778078494025126
AUC: 0.8115786460569628
Accuracy: 0.9123595505617977
PR_AUC: 0.31321313789989236
AUC: 0.7523257478173752
Accuracy: 0.8808988764044944
PR_AUC: 0.20165313726440898
AUC: 0.7974094747388006
Accuracy: 0.9056179775280899
PR_AUC: 0.285555874824839
AUC: 0.8130559540889527
Accuracy: 0.9099099099099099
PR_AUC: 0.3724817284873316
AUC: 0.7287820237584084
Accuracy: 0.9056179775280899
PR_AUC: 0.22774335172755625
AUC: 0.755689136968656
Accuracy: 0.9011235955056179
PR_AUC: 0.2284835341128127
AUC: 0.7840274796049806
Accuracy: 0.8831460674157303
PR_AUC: 0.20866919803414807
AUC: 0.8068555889509088
Accuracy: 0.8921348314606742
PR_AUC: 0.2800463428579879
AUC: 0.7043795620437956
Accuracy: 0.9078651685393259
PR_AUC: 0.30219831358264604
AUC: 0.8262487476742523
Accuracy: 0.9168539325842696
PR_AUC: 0.3954022476457173
AUC: 0.7789466151424074
Accuracy: 0.9

In [8]:
# Save the per-split scores, one value per CSV row.
# NOTE: `pr_auc` is also the name used by the tf-idf cell, so this writes
# whichever list the most recently executed evaluation cell filled in.
# The csv module expects its file object to be opened with newline=""; without
# it, platforms that translate "\n" to "\r\n" write "\r\r\n" row endings, which
# show up as blank rows between records.
with open("high_pride_pr_auc.csv", "w", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows([score] for score in pr_auc)

with open("high_pride_auc.csv", "w", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows([score] for score in auc_scores_d2v)

In [4]:
# Report the average of each score list collected by the Doc2Vec evaluation loop.
for label, scores in (
    ("Mean ROC-AUC: ", auc_scores_d2v),
    ("Mean Accuracy: ", accuracy_scores_d2v),
    ("Mean PR_AUC: ", pr_auc),
):
    print(label + str(np.mean(scores)))


Mean ROC-AUC: 0.7852865775115458
Mean Accuracy: 0.9056744609778319
Mean PR_AUC: 0.298276579832519


# balanced RFC

In [6]:
# Balanced random forest on Doc2Vec vectors: for every pre-built test split, fit
# a BalancedRandomForestClassifier on the document vectors of the annotated
# paragraphs that are NOT in that split, then score the split.

# Directory holding one CSV per held-out test split.
TEST_DIR = "/Users/yuchenluo/Desktop/Measure_RadicalDiscourse/codes/Prediction_of_test_high_pride/"

auc_scores_balanced = []       # ROC AUC, one entry per test split
pr_auc_balanced = []           # area under the precision-recall curve, one entry per test split
accuracy_scores_balanced = []  # accuracy of the hard predictions, one entry per test split

# Seed both global RNGs; the classifier itself is additionally pinned with random_state=0.
np.random.seed(234)
random.seed(234)

dat = pd.read_csv("20220115_all_paragraphs_2020_added.csv")
model = Word2Vec.load("doc2vec_wordvecs.model")
annotated = pd.read_csv("All_annotated_data_round_1_high_pride.csv")
dat = dat.merge(annotated[["speech_par_id", "pred_class"]], how="left", on="speech_par_id")
# NOTE(review): the row index of `dat` is used below as the key into
# model.docvecs. That presumes the merge above neither adds nor reorders rows
# (i.e. `speech_par_id` is unique in `annotated`) -- TODO confirm.

# os.listdir() returns names in arbitrary order and also lists hidden entries
# (.DS_Store, .ipynb_checkpoints, ...). Sorting makes the order of the per-split
# scores reproducible; skipping hidden/non-file entries keeps read_csv from
# failing on something that is not a test split.
test_files = sorted(
    name for name in os.listdir(TEST_DIR)
    if not name.startswith(".") and os.path.isfile(os.path.join(TEST_DIR, name))
)

for filename in test_files:
    ## load the test set
    file = os.path.join(TEST_DIR, filename)
    test = pd.read_csv(file)
    # everything annotated that is not part of this test split is training data
    train = annotated[~annotated['speech_par_id'].isin(test["speech_par_id"])]
    # make sure annotated data and the original df match
    test = test[test['speech_par_id'].isin(dat['speech_par_id'])]
    train = train[train['speech_par_id'].isin(dat['speech_par_id'])]

    # flag the test and train rows on the full df so that the order of the
    # selected rows is the same as the order of the embeddings
    dat['test'] = np.where(dat['speech_par_id'].isin(test["speech_par_id"]), 1, 0)
    dat['train'] = np.where(dat['speech_par_id'].isin(train["speech_par_id"]), 1, 0)
    test_rows = dat[dat['test'] == 1]
    train_rows = dat[dat['train'] == 1]
    test_set = np.asarray([model.docvecs[i] for i in test_rows.index.tolist()])
    train_set = np.asarray([model.docvecs[i] for i in train_rows.index.tolist()])

    ## Initialize a balanced random forest classifier
    brfc = BalancedRandomForestClassifier(n_estimators=5000, max_depth=10, random_state=0,
                                          max_features="sqrt")
    ## Fit the model to the training set
    brfc.fit(train_set, train_rows.pred_class)

    ## Predict out-of-sample on the test set and compute AUC
    preds = brfc.predict_proba(test_set)
    test_labels = test_rows.pred_class
    fpr, tpr, thresholds = metrics.roc_curve(test_labels, preds[:, 1], pos_label=1)
    roc_auc = metrics.auc(fpr, tpr)  # computed once, reused for the list and the printout
    auc_scores_balanced.append(roc_auc)

    # PR AUC
    lr_precision, lr_recall, _ = precision_recall_curve(test_labels, preds[:, 1])
    lr_auc = auc(lr_recall, lr_precision)
    pr_auc_balanced.append(lr_auc)

    print("AUC: " + str(roc_auc))
    accuracy_balanced = metrics.accuracy_score(test_labels, brfc.predict(test_set))
    accuracy_scores_balanced.append(accuracy_balanced)
    print("Accuracy: " + str(accuracy_balanced))
    print("PR_AUC: " + str(lr_auc))

print("Mean AUC: " + str(np.mean(auc_scores_balanced)))
print("Mean Accuracy: " + str(np.mean(accuracy_scores_balanced)))

  interactivity=interactivity, compiler=compiler, result=result)


AUC: 0.7802347216258767
Accuracy: 0.7595505617977528
PR_AUC: 0.31552716971681033
AUC: 0.7395162444539859
Accuracy: 0.7213483146067415
PR_AUC: 0.33619873789132804
AUC: 0.8267496779733792
Accuracy: 0.7348314606741573
PR_AUC: 0.33991834431150103
AUC: 0.7483898668956633
Accuracy: 0.7146067415730337
PR_AUC: 0.21691886402572802
AUC: 0.8233862888220982
Accuracy: 0.6898876404494382
PR_AUC: 0.3461727773346006
AUC: 0.8092539454806312
Accuracy: 0.6959459459459459
PR_AUC: 0.36415952182015415
AUC: 0.7544725919564907
Accuracy: 0.7168539325842697
PR_AUC: 0.25761118516842935
AUC: 0.7760841562902534
Accuracy: 0.7101123595505618
PR_AUC: 0.291597257541525
AUC: 0.8029197080291971
Accuracy: 0.7393258426966293
PR_AUC: 0.2262410586027561
AUC: 0.7983397738657507
Accuracy: 0.7280898876404495
PR_AUC: 0.32649325066788304
AUC: 0.7233433519393159
Accuracy: 0.7528089887640449
PR_AUC: 0.32157362054134886
AUC: 0.8491484184914843
Accuracy: 0.755056179775281
PR_AUC: 0.40110430385218004
AUC: 0.7917561185057965
Accuracy:

In [9]:
# Save the per-split scores of the balanced random forest, one value per CSV row.
# The csv module expects its file object to be opened with newline=""; without
# it, platforms that translate "\n" to "\r\n" write "\r\r\n" row endings, which
# show up as blank rows between records.
with open("high_pride_pr_auc_balanced.csv", "w", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows([score] for score in pr_auc_balanced)

with open("high_pride_auc_balanced.csv", "w", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows([score] for score in auc_scores_balanced)

In [7]:
# Report the average of each score list collected by the balanced random forest loop.
for label, scores in (
    ("Mean ROC-AUC: ", auc_scores_balanced),
    ("Mean Accuracy: ", accuracy_scores_balanced),
    ("Mean PR_AUC: ", pr_auc_balanced),
):
    print(label + str(np.mean(scores)))

Mean ROC-AUC: 0.7959871259167722
Mean Accuracy: 0.7252011337179876
Mean PR_AUC: 0.3207349621414406
