In [1]:
### --- Import Necessary Packages --- ###
from sklearn.feature_extraction import text
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import text
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, precision_recall_curve,auc, roc_auc_score, accuracy_score, precision_recall_curve, auc, f1_score
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#from gensim import models
from tqdm import tqdm
import time
import os

In [2]:
### --- Define Useful Functions --- ###
def modelEval(y_true, y_preds, predict_probs):
    auroc = (roc_auc_score(y_true, predict_probs))
    accuracy = (accuracy_score(y_true, y_preds))
    f1 = f1_score(y_true, y_preds)
    precision, recall, threshold = precision_recall_curve(y_true, predict_probs)
    auprc = auc(recall, precision)
    return {"auroc": auroc, "accuracy": accuracy, "auprc": auprc, "f1_score": f1}


def getPredicts(clf, X):
    predict_probs = clf.predict_proba(X)[:, 1] #Predicted probability for the positive label
    predicts = clf.predict(X)
    return predicts, predict_probs


def retrieve_top_words(clf, vectorizer, top_k=5, top_positve_words=True):
    clf_name = clf.__class__.__name__
    if clf_name == 'LogisticRegression':
        coef_arr = np.array(clf.coef_).squeeze()
    elif clf_name == 'RandomForestClassifier':
        coef_arr = np.array(clf.feature_importances_).squeeze()
    else:
        raise (Exception('Classifier is not LR nor RF, cannot retrieve importance coef.'))

    Name_list = vectorizer.get_feature_names()
    if top_positve_words:
        # print('Retrieving Top '+str(top_k)+' words for positive samples')
        top_k_idx = coef_arr.argsort()[::-1][0:top_k]
    else:
        # print('Retrieving Top '+str(top_k)+' words for negative samples')
        top_k_idx = coef_arr.argsort()[0:top_k]
    top_k_words = []
    for idx in top_k_idx:
        top_k_words.append(Name_list[idx])
    # print(top_k_words)
    return (top_k_words)

def plot_ROC(y_true, y_pred, legend, lw):
    '''
    This function plots the ROC based on y_true and y_pred
    :param y_true: The ground truth of the samples
    :param y_pred: The predicted probablity of the samples
    :param legend: Legend of the curve
    :return: None
    '''
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    auroc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw = lw, label = legend + ' AUC=%0.3f' % auroc)

def plot_PRC(y_true, y_pred, legend, lw):
    '''
    This function plots the precision-recall curves based on y_true and y_pred
    :param y_true: The ground truth of the samples
    :param y_pred: The predicted probablity of the samples
    :param legend: Legend of the curve
    :return: None
    '''
    pre, rec, _ = precision_recall_curve(y_true, y_pred)
    auroc = auc(rec, pre)
    plt.plot(rec, pre, lw = lw, label = legend + ' AUC=%0.3f' % auroc)

class w2v_vectorizer():

    def __init__(self, wv):
        '''
        This functin initialize the word2vec model
        :param wv: a gensim word2vec model
        '''
        self.model = wv
        pass

    def transform(self,corpus):
        '''
        This function vectorize each document in the corpus
        :param corpus: pd.series of documents
        :return: feature: vectorized documents
        '''
        feature = np.empty((corpus.shape[0],200))
        sent_lst = list(corpus)
        print('W2V vectorization: get word vectors')
        vec_lst = [[self.model[word] for word in sent if word in self.model.vocab] for sent in tqdm(sent_lst)]
        print('W2V vectorization: get sentence vectors')
        for i in tqdm(list(range(feature.shape[0]))):
            feature[i,:] = np.mean(vec_lst[i],axis=0,keepdims=False) # continuous bag-of-word with average pooling
        return feature


def ErrorAnalysis(preds,GT_label,GT_text):
    '''
    Return DataFrame of FN and FP texts and indexes

    args:
    preds: predicted labels
    GT_label: ground truth labels
    val_text: original text before vectorization

    output:
    Error_DF: DataFrame that contains FN text and index, FP text and index
    '''
    False_Neg = []
    False_Pos = []
    False_Neg_Idx = []
    False_Pos_Idx = []
    for idx in range(preds.shape[0]):
        if preds[idx] == 1 and GT_label[idx] == 0:
            False_Pos.append(GT_text[idx])
            False_Pos_Idx.append(idx)
        elif preds[idx] == 0 and GT_label[idx] == 1:
            False_Neg.append(GT_text[idx])
            False_Neg_Idx.append(idx)
    Error_dict = {'False_Positive':False_Pos,'False_Negative':False_Neg,'False_Pos_Idx':False_Pos_Idx,'False_Neg_Idx':False_Neg_Idx}
    Error_DF = pd.DataFrame.from_dict(Error_dict,orient='index').transpose()
    return Error_DF

In [3]:
### --- Bootstrapping on the 80% trainng data for non-ensemble learning models --- ###

#Change here to choose corpus from STEM_TEXT or CLEAN_TEXT
TEXT = 'STEM_TEXT'

#Load Dataset
Train_Data, Val_Data = pd.read_csv("Data/train.csv"),pd.read_csv("Data/val.csv")
train_corpus, val_corpus= Train_Data[TEXT], Val_Data[TEXT]

#Preprocessing labels form boolean to int
Y_train, Y_val = Train_Data['label'].astype(int), Val_Data['label'].astype(int)

#Instancing classifier (SVM and MLP take a lot time to run!)
clf_dict = {
    'LR':LogisticRegression(max_iter=200)
    #,'SVM':SVC(probability=True)
    #,'RF':RandomForestClassifier()
    #,'MLP': MLPClassifier()
}

### --- BOW --- ###

#Instancing vectorizer
vectorizer_dict = {
    'BOW':text.CountVectorizer()
}

In [5]:
#select each of the vectorizer
for vect_NAME in vectorizer_dict:
    print(vect_NAME)
    vectorizer = vectorizer_dict[vect_NAME]

    #Fit clf
    if vect_NAME == 'BOW':
        clf = LogisticRegression(C = 0.1,max_iter=1000,penalty='l2')
    else:
        clf = LogisticRegression(C = 10,max_iter=1000,penalty='l2')
    
    #bootstrapping experiments: initialize the performance recorders
    vectorization_time = []
    modeling_time = []
    auroc = []
    auprc = []
    f1_score_recorder = []
    accuracy = []
    predicted_prob = []
    train_prob = []
    train_label = []
    for i in range(100):
        np.random.seed(i)
        idx = np.random.choice(train_corpus.shape[0],train_corpus.shape[0],replace=True)
        X_train_boot = train_corpus[idx]
        Y_train_boot = Y_train[idx]
        train_label.append(Y_train_boot)
        
        
        #Fit vectorizer using train corpus
        tik = time.time()
        X_train = vectorizer.fit_transform(X_train_boot)
        X_val = vectorizer.transform(val_corpus)
        tok = time.time()
        vectorization_time.append(tok-tik)
        
        #Fit classifier using logistic regression
        tik = time.time()
        clf.fit(X_train,Y_train_boot)
        tok = time.time()
        modeling_time.append(tok-tik)
        
        
        #Make prediction and evaluate the prediction
        preds, pred_probs = getPredicts(clf, X_val)
        predicted_prob.append(pred_probs)
        result = modelEval(Y_val, preds, pred_probs)
        
        #Get training probability
        train_preds, train_pred_probs = getPredicts(clf, X_train)
        train_prob.append(train_pred_probs)
        
        auroc.append(result['auroc'])
        auprc.append(result['auprc'])
        accuracy.append(result['accuracy'])
        f1_score_recorder.append(result['f1_score'])
        
        
        

BOW


In [6]:
result_df = pd.DataFrame({'AUROC': auroc,'AUPRC': auprc,'Accuracy': accuracy,'F1_score': f1_score_recorder,'Vectorization_Time': vectorization_time,'Modeling_Time': modeling_time})
result_df.to_csv('.//Result//Bootstrapping_'+vect_NAME+'.csv', index = False)
np.save('.//Result//Bootstrapping_predicted_probability_'+vect_NAME+'.npy',np.array(predicted_prob))
np.save('.//Result//Bootstrapping_predicted_probability_training_'+vect_NAME+'.npy',np.array(train_prob))
np.save('.//Result//Bootstrapping_label_training_'+vect_NAME+'.npy',np.array(train_label))
print(result_df.mean())
print(result_df.quantile(0.025))
print(result_df.quantile(0.975))

AUROC                 0.983010
AUPRC                 0.982830
Accuracy              0.944020
F1_score              0.945174
Vectorization_Time    2.630324
Modeling_Time         2.246226
dtype: float64
AUROC                 0.981517
AUPRC                 0.980647
Accuracy              0.939810
F1_score              0.941134
Vectorization_Time    2.489900
Modeling_Time         1.614048
Name: 0.025, dtype: float64
AUROC                 0.984492
AUPRC                 0.984944
Accuracy              0.948258
F1_score              0.949381
Vectorization_Time    2.823604
Modeling_Time         2.775940
Name: 0.975, dtype: float64


In [7]:
### --- TF-IDF --- ###
#Instancing vectorizer
vectorizer_dict = {
    'TFIDF':text.TfidfVectorizer()
}

#select each of the vectorizer
for vect_NAME in vectorizer_dict:
    print(vect_NAME)
    vectorizer = vectorizer_dict[vect_NAME]

    #Fit clf
    if vect_NAME == 'BOW':
        clf = LogisticRegression(C = 0.1,max_iter=1000,penalty='l2')
    else:
        clf = LogisticRegression(C = 10,max_iter=1000,penalty='l2')
    
    #bootstrapping experiments: initialize the performance recorders
    vectorization_time = []
    modeling_time = []
    auroc = []
    auprc = []
    f1_score_recorder = []
    accuracy = []
    predicted_prob = []
    train_prob = []
    train_label = []
    for i in range(100):
        np.random.seed(i)
        idx = np.random.choice(train_corpus.shape[0],train_corpus.shape[0],replace=True)
        X_train_boot = train_corpus[idx]
        Y_train_boot = Y_train[idx]
        train_label.append(Y_train_boot)
        
        
        #Fit vectorizer using train corpus
        tik = time.time()
        X_train = vectorizer.fit_transform(X_train_boot)
        X_val = vectorizer.transform(val_corpus)
        tok = time.time()
        vectorization_time.append(tok-tik)
        
        #Fit classifier using logistic regression
        tik = time.time()
        clf.fit(X_train,Y_train_boot)
        tok = time.time()
        modeling_time.append(tok-tik)
        
        
        #Make prediction and evaluate the prediction
        preds, pred_probs = getPredicts(clf, X_val)
        predicted_prob.append(pred_probs)
        result = modelEval(Y_val, preds, pred_probs)
        
        #Get training probability
        train_preds, train_pred_probs = getPredicts(clf, X_train)
        train_prob.append(train_pred_probs)
        
        auroc.append(result['auroc'])
        auprc.append(result['auprc'])
        accuracy.append(result['accuracy'])
        f1_score_recorder.append(result['f1_score'])
    result_df = pd.DataFrame({'AUROC': auroc,'AUPRC': auprc,'Accuracy': accuracy,'F1_score': f1_score_recorder,'Vectorization_Time': vectorization_time,'Modeling_Time': modeling_time})
    result_df.to_csv('.//Result//Bootstrapping_'+vect_NAME+'.csv', index = False)
    np.save('.//Result//Bootstrapping_predicted_probability_'+vect_NAME+'.npy',np.array(predicted_prob))
    np.save('.//Result//Bootstrapping_predicted_probability_training_'+vect_NAME+'.npy',np.array(train_prob))
    np.save('.//Result//Bootstrapping_label_training_'+vect_NAME+'.npy',np.array(train_label))
    print(result_df.mean())
    print(result_df.quantile(0.025))
    print(result_df.quantile(0.975))

TFIDF
AUROC                 0.987890
AUPRC                 0.988493
Accuracy              0.952974
F1_score              0.953637
Vectorization_Time    2.214791
Modeling_Time         1.817068
dtype: float64
AUROC                 0.986991
AUPRC                 0.987513
Accuracy              0.948944
F1_score              0.949411
Vectorization_Time    1.215391
Modeling_Time         0.536576
Name: 0.025, dtype: float64
AUROC                 0.988805
AUPRC                 0.989441
Accuracy              0.956741
F1_score              0.957341
Vectorization_Time    2.816845
Modeling_Time         3.499377
Name: 0.975, dtype: float64


In [11]:


#Change here to choose corpus from STEM_TEXT or CLEAN_TEXT
TEXT = 'CLEAN_TEXT'

#Load Dataset
Train_Data, Val_Data = pd.read_csv("Data/train.csv"),pd.read_csv("Data/val.csv")
train_corpus, val_corpus= Train_Data[TEXT], Val_Data[TEXT]

#Preprocessing labels form boolean to int
Y_train, Y_val = Train_Data['label'].astype(int), Val_Data['label'].astype(int)

#Instancing classifier (SVM and MLP take a lot time to run!)
clf_dict = {
    'LR':LogisticRegression(max_iter=200)
    #,'SVM':SVC(probability=True)
    #,'RF':RandomForestClassifier()
    #,'MLP': MLPClassifier()
}

#Instancing vectorizer
vectorizer_dict = ['W2V2']

#select each of the vectorizer
for vect_NAME in vectorizer_dict:
    print(vect_NAME)

    #Fit clf
    if 'W2V' in vect_NAME:
        clf = LogisticRegression(C = 10,max_iter=1000,penalty='l2')
    else:
        clf = LogisticRegression(C = 1,max_iter=1000,penalty='l2')
    
    #bootstrapping experiments: initialize the performance recorders
    vectorization_time = []
    modeling_time = []
    auroc = []
    auprc = []
    f1_score_recorder = []
    accuracy = []
    predicted_prob = []
    
    if vect_NAME == 'W2V1':
        train_corpus = np.load('./Embeddings/W2V1_CLEAN_train.npy')
        val_corpus = np.load('./Embeddings/W2V1_CLEAN_val.npy')
    elif vect_NAME == 'S2V':
        train_corpus = np.load('./Embeddings/S2V_CLEAN_train.npy')
        val_corpus = np.load('./Embeddings/S2V_CLEAN_val.npy')
    else:
        train_corpus = np.load('./Embeddings/W2V2_CLEAN_train.npy')
        val_corpus = np.load('./Embeddings/W2V2_CLEAN_val.npy')
    
    for i in range(100):
        np.random.seed(i)
        idx = np.random.choice(train_corpus.shape[0],train_corpus.shape[0],replace=True)
        X_train_boot = train_corpus[idx]
        Y_train_boot = Y_train[idx]
        
        
        #Fit vectorizer using train corpus
        tik = time.time()
        X_train = X_train_boot
        X_val = val_corpus
        tok = time.time()
        vectorization_time.append(tok-tik)
        
        #Fit classifier using logistic regression
        tik = time.time()
        clf.fit(X_train,Y_train_boot)
        tok = time.time()
        modeling_time.append(tok-tik)
        
        
        #Make prediction and evaluate the prediction
        preds, pred_probs = getPredicts(clf, X_val)
        predicted_prob.append(pred_probs)
        result = modelEval(Y_val, preds, pred_probs)
        
        auroc.append(result['auroc'])
        auprc.append(result['auprc'])
        accuracy.append(result['accuracy'])
        f1_score_recorder.append(result['f1_score'])
        
    result_df = pd.DataFrame({'AUROC': auroc,'AUPRC': auprc,'Accuracy': accuracy,'F1_score': f1_score_recorder,'Vectorization_Time': vectorization_time,'Modeling_Time': modeling_time})
    result_df.to_csv('./Result/Bootstrapping_'+vect_NAME+'.csv', index = False)
    np.save('./Result/Bootstrapping_predicted_probability_'+vect_NAME+'.npy',np.array(predicted_prob))
    print(result_df.mean())
    print(result_df.quantile(0.025))
    print(result_df.quantile(0.975))

W2V2
AUROC                 0.980034
AUPRC                 0.981032
Accuracy              0.939271
F1_score              0.940312
Vectorization_Time    0.000004
Modeling_Time         0.983368
dtype: float64
AUROC                 0.978985
AUPRC                 0.980009
Accuracy              0.936290
F1_score              0.937239
Vectorization_Time    0.000002
Modeling_Time         0.650753
Name: 0.025, dtype: float64
AUROC                 0.981202
AUPRC                 0.982183
Accuracy              0.943198
F1_score              0.944228
Vectorization_Time    0.000005
Modeling_Time         1.232835
Name: 0.975, dtype: float64


In [11]:
#Change here to choose corpus from STEM_TEXT or CLEAN_TEXT
TEXT = 'CLEAN_TEXT'

#Load Dataset
Train_Data, Val_Data = pd.read_csv("Data/train.csv"),pd.read_csv("Data/val.csv")
train_corpus, val_corpus= Train_Data[TEXT], Val_Data[TEXT]

#Preprocessing labels form boolean to int
Y_train, Y_val = Train_Data['label'].astype(int), Val_Data['label'].astype(int)

#Instancing classifier (SVM and MLP take a lot time to run!)
clf_dict = {
    'LR':LogisticRegression(max_iter=200)
    #,'SVM':SVC(probability=True)
    #,'RF':RandomForestClassifier()
    #,'MLP': MLPClassifier()
}

#Instancing vectorizer
vectorizer_dict = ['S2V']

#select each of the vectorizer
for vect_NAME in vectorizer_dict:
    print(vect_NAME)

    #Fit clf
    if 'W2V' in vect_NAME:
        clf = LogisticRegression(C = 10,max_iter=1000,penalty='l2')
    else:
        clf = LogisticRegression(C = 1,max_iter=1000,penalty='l2')
    
    #bootstrapping experiments: initialize the performance recorders
    vectorization_time = []
    modeling_time = []
    auroc = []
    auprc = []
    f1_score_recorder = []
    accuracy = []
    predicted_prob = []
    
    if vect_NAME == 'W2V1':
        train_corpus = np.load('./Embeddings/W2V1_CLEAN_train.npy')
        val_corpus = np.load('./Embeddings/W2V1_CLEAN_val.npy')
    elif vect_NAME == 'S2V':
        train_corpus = np.load('./Embeddings/S2V_CLEAN_train.npy')
        val_corpus = np.load('./Embeddings/S2V_CLEAN_val.npy')
    else:
        train_corpus = np.load('./Embeddings/W2V2_CLEAN_train.npy')
        val_corpus = np.load('./Embeddings/W2V2_CLEAN_val.npy')
    
    for i in range(100):
        np.random.seed(i)
        idx = np.random.choice(train_corpus.shape[0],train_corpus.shape[0],replace=True)
        X_train_boot = train_corpus[idx]
        Y_train_boot = Y_train[idx]
        
        
        #Fit vectorizer using train corpus
        tik = time.time()
        X_train = X_train_boot
        X_val = val_corpus
        tok = time.time()
        vectorization_time.append(tok-tik)
        
        #Fit classifier using logistic regression
        tik = time.time()
        clf.fit(X_train,Y_train_boot)
        tok = time.time()
        modeling_time.append(tok-tik)
        
        
        #Make prediction and evaluate the prediction
        preds, pred_probs = getPredicts(clf, X_val)
        predicted_prob.append(pred_probs)
        result = modelEval(Y_val, preds, pred_probs)
        
        auroc.append(result['auroc'])
        auprc.append(result['auprc'])
        accuracy.append(result['accuracy'])
        f1_score_recorder.append(result['f1_score'])
        
    result_df = pd.DataFrame({'AUROC': auroc,'AUPRC': auprc,'Accuracy': accuracy,'F1_score': f1_score_recorder,'Vectorization_Time': vectorization_time,'Modeling_Time': modeling_time})
    result_df.to_csv('./Result/Bootstrapping_'+vect_NAME+'.csv', index = False)
    np.save('./Result/Bootstrapping_predicted_probability_'+vect_NAME+'.npy',np.array(predicted_prob))
    print(result_df.mean())
    print(result_df.quantile(0.025))
    print(result_df.quantile(0.975))

S2V
AUROC                 0.985377
AUPRC                 0.986466
Accuracy              0.949440
F1_score              0.950300
Vectorization_Time    0.000008
Modeling_Time         0.455067
dtype: float64
AUROC                 0.984218
AUPRC                 0.985527
Accuracy              0.946498
F1_score              0.947347
Vectorization_Time    0.000003
Modeling_Time         0.360581
Name: 0.025, dtype: float64
AUROC                 0.986376
AUPRC                 0.987385
Accuracy              0.953203
F1_score              0.954104
Vectorization_Time    0.000005
Modeling_Time         0.540543
Name: 0.975, dtype: float64


In [8]:
#Change here to choose corpus from STEM_TEXT or CLEAN_TEXT
TEXT = 'CLEAN_TEXT'

#Load Dataset
Train_Data, Val_Data = pd.read_csv("Data/train.csv"),pd.read_csv("Data/val.csv")
train_corpus, val_corpus= Train_Data[TEXT], Val_Data[TEXT]

#Preprocessing labels form boolean to int
Y_train, Y_val = Train_Data['label'].astype(int), Val_Data['label'].astype(int)

#Instancing classifier (SVM and MLP take a lot time to run!)
clf_dict = {
    'LR':LogisticRegression(max_iter=200)
    #,'SVM':SVC(probability=True)
    #,'RF':RandomForestClassifier()
    #,'MLP': MLPClassifier()
}

#Instancing vectorizer
vectorizer_dict = ['W2V1','W2V2','S2V']

#select each of the vectorizer
for vect_NAME in vectorizer_dict:
    print(vect_NAME)

    #Fit clf
    if 'W2V' in vect_NAME:
        clf = LogisticRegression(C = 10,max_iter=1000,penalty='l2')
    else:
        clf = LogisticRegression(C = 1,max_iter=1000,penalty='l2')
    
    #bootstrapping experiments: initialize the performance recorders
    vectorization_time = []
    modeling_time = []
    auroc = []
    auprc = []
    f1_score_recorder = []
    accuracy = []
    predicted_prob = []
    train_prob = []
    train_label = []
    
    if vect_NAME == 'W2V1':
        train_corpus = np.load('./Embeddings/W2V1_CLEAN_train.npy')
        val_corpus = np.load('./Embeddings/W2V1_CLEAN_val.npy')
    elif vect_NAME == 'S2V':
        train_corpus = np.load('./Embeddings/S2V_CLEAN_train.npy')
        val_corpus = np.load('./Embeddings/S2V_CLEAN_val.npy')
    else:
        train_corpus = np.load('./Embeddings/W2V2_CLEAN_train.npy')
        val_corpus = np.load('./Embeddings/W2V2_CLEAN_val.npy')
    
    for i in range(100):
        np.random.seed(i)
        idx = np.random.choice(train_corpus.shape[0],train_corpus.shape[0],replace=True)
        X_train_boot = train_corpus[idx]
        Y_train_boot = Y_train[idx]
        train_label.append(Y_train_boot)
        
        
        #Fit vectorizer using train corpus
        tik = time.time()
        X_train = X_train_boot
        X_val = val_corpus
        tok = time.time()
        vectorization_time.append(tok-tik)
        
        #Fit classifier using logistic regression
        tik = time.time()
        clf.fit(X_train,Y_train_boot)
        tok = time.time()
        modeling_time.append(tok-tik)
        
        
        #Make prediction and evaluate the prediction
        preds, pred_probs = getPredicts(clf, X_val)
        predicted_prob.append(pred_probs)
        result = modelEval(Y_val, preds, pred_probs)
        
        #Get training probability
        train_preds, train_pred_probs = getPredicts(clf, X_train)
        train_prob.append(train_pred_probs)
        
        auroc.append(result['auroc'])
        auprc.append(result['auprc'])
        accuracy.append(result['accuracy'])
        f1_score_recorder.append(result['f1_score'])
        
    result_df = pd.DataFrame({'AUROC': auroc,'AUPRC': auprc,'Accuracy': accuracy,'F1_score': f1_score_recorder,'Vectorization_Time': vectorization_time,'Modeling_Time': modeling_time})
    result_df.to_csv('./Result/Bootstrapping_'+vect_NAME+'.csv', index = False)
    np.save('./Result/Bootstrapping_predicted_probability_'+vect_NAME+'.npy',np.array(predicted_prob))
    np.save('./Result/Bootstrapping_predicted_probability_training_'+vect_NAME+'.npy',np.array(train_prob))
    print(result_df.mean())
    print(result_df.quantile(0.025))
    print(result_df.quantile(0.975))

W2V1
AUROC                 0.978091
AUPRC                 0.977870
Accuracy              0.936269
F1_score              0.937476
Vectorization_Time    0.000003
Modeling_Time         0.374294
dtype: float64
AUROC                 0.977181
AUPRC                 0.977143
Accuracy              0.932585
F1_score              0.933811
Vectorization_Time    0.000002
Modeling_Time         0.274849
Name: 0.025, dtype: float64
AUROC                 0.979008
AUPRC                 0.978689
Accuracy              0.939476
F1_score              0.940741
Vectorization_Time    0.000004
Modeling_Time         0.461916
Name: 0.975, dtype: float64
W2V2
AUROC                 0.980034
AUPRC                 0.981032
Accuracy              0.939271
F1_score              0.940312
Vectorization_Time    0.000003
Modeling_Time         0.981364
dtype: float64
AUROC                 0.978985
AUPRC                 0.980009
Accuracy              0.936290
F1_score              0.937239
Vectorization_Time    0.000002
Model

Collecting the predicted probability of  TF-IDF   1:1 !
Collecting the predicted probability of  TF-IDF   2:1 !
Collecting the predicted probability of  TF-IDF   5:1 !
Collecting the predicted probability of  W2V1   1:1 !
Collecting the predicted probability of  W2V1   2:1 !
Collecting the predicted probability of  W2V1   5:1 !
Collecting the predicted probability of  W2V2   1:1 !
Collecting the predicted probability of  W2V2   2:1 !
Collecting the predicted probability of  W2V2   5:1 !
Collecting the predicted probability of  S2V   1:1 !
Collecting the predicted probability of  S2V   2:1 !
Collecting the predicted probability of  S2V   5:1 !


In [7]:
X_train.shape

(2841, 12)

In [8]:
X_test.shape

(2841, 12)

In [14]:
clf.fit(X_train,Y_train)
preds, pred_probs = getPredicts(clf, X_test)
predicted_prob.append(pred_probs)
result = modelEval(Y_test, preds, pred_probs)
print(result)

{'auroc': 0.9893583588039407, 'accuracy': 0.9570573741640267, 'auprc': 0.9900937497350172, 'f1_score': 0.9580756013745704}


In [15]:
#Fit clf
lr_clf = LogisticRegression(C = 0.1,max_iter=1000,penalty='l2')

#bootstrapping experiments: initialize the performance recorders
vectorization_time = []
modeling_time = []
auroc = []
auprc = []
f1_score_recorder = []
accuracy = []
predicted_prob = []

for i in range(100):
    np.random.seed(i)
    idx = np.random.choice(X_train.shape[0],X_train.shape[0],replace=True)
    X_train_boot = X_train[idx]
    Y_train_boot = Y_train[idx]


    #Fit vectorizer using train corpus
    tik = time.time()
    tok = time.time()
    vectorization_time.append(tok-tik)

    #Fit classifier using logistic regression
    tik = time.time()
    clf.fit(X_train_boot,Y_train_boot)
    tok = time.time()
    modeling_time.append(tok-tik)


    #Make prediction and evaluate the prediction
    preds, pred_probs = getPredicts(clf, X_test)
    predicted_prob.append(pred_probs)
    result = modelEval(Y_test, preds, pred_probs)

    auroc.append(result['auroc'])
    auprc.append(result['auprc'])
    accuracy.append(result['accuracy'])
    f1_score_recorder.append(result['f1_score'])

#     result_df = pd.DataFrame({'AUROC': auroc,'AUPRC': auprc,'Accuracy': accuracy,'F1_score': f1_score_recorder,'Vectorization_Time': vectorization_time,'Modeling_Time': modeling_time})
#     result_df.to_csv('./Result/Bootstrapping_'+vect_NAME+'.csv', index = False)
#     np.save('./Result/Bootstrapping_predicted_probability_'+vect_NAME+'.npy',np.array(predicted_prob))
print(result_df.mean())
print(result_df.quantile(0.025))
print(result_df.quantile(0.975))

Ensemble
AUROC                 0.535404
AUPRC                 0.583458
Accuracy              0.532506
F1_score              0.460525
Vectorization_Time    0.000002
Modeling_Time         0.017045
dtype: float64
AUROC                 0.031261
AUPRC                 0.315369
Accuracy              0.057471
F1_score              0.017354
Vectorization_Time    0.000001
Modeling_Time         0.011206
Name: 0.025, dtype: float64
AUROC                 0.965964
AUPRC                 0.935857
Accuracy              0.938974
F1_score              0.941261
Vectorization_Time    0.000003
Modeling_Time         0.021765
Name: 0.975, dtype: float64


In [9]:
#This is for the ensemble learning
#Change here to choose corpus from STEM_TEXT or CLEAN_TEXT
TEXT = 'STEM_TEXT'

#Load Dataset
Train_Data, Fusion_Data, Val_Data = pd.read_csv("./Data/sep_train.csv"), pd.read_csv("./Data/fusion_train.csv"), pd.read_csv("./Data/val.csv")
train_corpus, fusion_corpus, val_corpus= Train_Data[TEXT], Fusion_Data[TEXT], Val_Data[TEXT]

#Preprocessing labels form boolean to int
Y_train, Y_fusion, Y_val = Train_Data['label'].astype(int), Fusion_Data['label'].astype(int), Val_Data['label'].astype(int)

#Instancing vectorizer
vectorizer_dict = {
    'TFIDF':text.TfidfVectorizer(),
    'W2V1': None,
    'W2V2': None,
    'S2V': None
}

#bootstrapping experiments: initialize the performance recorders
vectorization_time = []
modeling_time = []
auroc = []
auprc = []
f1_score_recorder = []
accuracy = []
predicted_prob = []
train_prob = []
train_label = []
fusion_label = []

for i in range(100):
    np.random.seed(i)
    idx = np.random.choice(train_corpus.shape[0],train_corpus.shape[0],replace=True)
    train_corpus_boot = train_corpus.iloc[idx] #New set of training data (separate learners)
    Y_train_boot = Y_train[idx] #New set of training data (separate learners)
    train_label.append(Y_train_boot)
    fusion_label.append(Y_fusion)
    
    features_fusion = []
    features_val = []
    features_train = []
    #select each of the vectorizer
    for vect_NAME in vectorizer_dict:

        #Load the embedding for the three sets 
        if vect_NAME == 'W2V1':
            train_emb = np.load('./Embeddings/W2V1_CLEAN_train_sep.npy')[idx]
            fusion_emb = np.load('./Embeddings/W2V1_CLEAN_train_fusion.npy')
            val_emb = np.load('./Embeddings/W2V1_CLEAN_val.npy')
        elif vect_NAME == 'S2V':
            train_emb = np.load('./Embeddings/S2V_sep_CLEAN_train.npy')[idx]
            fusion_emb = np.load('./Embeddings/S2V_fusion_CLEAN_train.npy')
            val_emb = np.load('./Embeddings/S2V_CLEAN_val.npy')
        elif vect_NAME == 'W2V2':
            train_emb = np.load('./Embeddings/W2V2_CLEAN_train_sep.npy')[idx]
            fusion_emb = np.load('./Embeddings/W2V2_CLEAN_train_fusion.npy')
            val_emb = np.load('./Embeddings/W2V2_CLEAN_val.npy')
        else:
            train_emb = vectorizer_dict[vect_NAME].fit_transform(train_corpus_boot)
            fusion_emb = vectorizer_dict[vect_NAME].transform(fusion_corpus)
            val_emb = vectorizer_dict[vect_NAME].transform(val_corpus)
        
        if vect_NAME == 'S2V':
            clf1 = LogisticRegression(C = 1,max_iter=1000,penalty='l2',class_weight={1:1,0:1})
            clf2 = LogisticRegression(C = 1,max_iter=1000,penalty='l2',class_weight={1:2,0:1})
            clf3 = LogisticRegression(C = 1,max_iter=1000,penalty='l2',class_weight={1:5,0:1})
        else:
            clf1 = LogisticRegression(C = 10,max_iter=1000,penalty='l2',class_weight={1:1,0:1})
            clf2 = LogisticRegression(C = 10,max_iter=1000,penalty='l2',class_weight={1:2,0:1})
            clf3 = LogisticRegression(C = 10,max_iter=1000,penalty='l2',class_weight={1:5,0:1})
        
        clf1.fit(train_emb,Y_train_boot)
        clf2.fit(train_emb,Y_train_boot)
        clf3.fit(train_emb,Y_train_boot)
        
        features_fusion.append(clf1.predict_proba(fusion_emb)[:, 1])
        features_fusion.append(clf2.predict_proba(fusion_emb)[:, 1])
        features_fusion.append(clf3.predict_proba(fusion_emb)[:, 1])
        
        features_val.append(clf1.predict_proba(val_emb)[:, 1])
        features_val.append(clf2.predict_proba(val_emb)[:, 1])
        features_val.append(clf3.predict_proba(val_emb)[:, 1])
        
        features_train.append(clf1.predict_proba(train_emb)[:, 1])
        features_train.append(clf2.predict_proba(train_emb)[:, 1])
        features_train.append(clf3.predict_proba(train_emb)[:, 1])
    
    X_fusion = np.array(features_fusion).T
    X_val = np.array(features_val).T
    X_train = np.array(features_train).T
    
    vectorization_time.append(0)
    tik = time.time()
    clf = LogisticRegression(C = 0.1,max_iter=1000,penalty='l2',class_weight={1:1,0:1})
    clf.fit(X_fusion,Y_fusion)
    tok = time.time()
    modeling_time.append(tok-tik)
    
    preds, pred_probs = getPredicts(clf, X_val)
    predicted_prob.append(pred_probs)
    result = modelEval(Y_val, preds, pred_probs)
    
    #Get training probability
    train_preds, train_pred_probs = getPredicts(clf, np.row_stack([X_train,X_fusion]))
    train_prob.append(train_pred_probs)
    
    print(result)

    auroc.append(result['auroc'])
    auprc.append(result['auprc'])
    accuracy.append(result['accuracy'])
    f1_score_recorder.append(result['f1_score'])


result_df = pd.DataFrame({'AUROC': auroc,'AUPRC': auprc,'Accuracy': accuracy,'F1_score': f1_score_recorder,'Vectorization_Time': vectorization_time,'Modeling_Time': modeling_time})
result_df.to_csv('./Result/Bootstrapping_'+'ensemble'+'.csv', index = False)
np.save('./Result/Bootstrapping_predicted_probability_'+'ensemble'+'.npy',np.array(predicted_prob))
np.save('./Result/Bootstrapping_predicted_probability_train_'+'ensemble'+'.npy',np.array(train_prob))
np.save('.//Result//Bootstrapping_label_training_'+vect_NAME+'.npy',np.row_stack([np.array(train_label),np.array(fusion_label)]))
print(result_df.mean())
print(result_df.quantile(0.025))
print(result_df.quantile(0.975))

{'auroc': 0.9889518865764034, 'accuracy': 0.9538894755367828, 'auprc': 0.9894425527028552, 'f1_score': 0.95499828237719}
{'auroc': 0.9886366227511428, 'accuracy': 0.9545934530095037, 'auprc': 0.9893856488602633, 'f1_score': 0.9557764826876929}
{'auroc': 0.9886762785782197, 'accuracy': 0.9545934530095037, 'auprc': 0.9894400813980319, 'f1_score': 0.9556548642145067}
{'auroc': 0.9883228460193977, 'accuracy': 0.9531854980640619, 'auprc': 0.9889926446886722, 'f1_score': 0.9543739279588337}
{'auroc': 0.988188511905175, 'accuracy': 0.9507215769095388, 'auprc': 0.9883953507953801, 'f1_score': 0.952054794520548}
{'auroc': 0.9884234726806049, 'accuracy': 0.9521295318549806, 'auprc': 0.9890979958494277, 'f1_score': 0.9533287577213453}
{'auroc': 0.9892041967761795, 'accuracy': 0.9549454417458642, 'auprc': 0.9898249773661048, 'f1_score': 0.9561343385880741}
{'auroc': 0.9879396715902681, 'accuracy': 0.9535374868004224, 'auprc': 0.9885773373205765, 'f1_score': 0.9547325102880658}
{'auroc': 0.98816917

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 8521 and the array at index 1 has size 2841