In [1]:
from sklearn.feature_extraction import text
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import text
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, precision_recall_curve,auc, roc_auc_score, accuracy_score, precision_recall_curve, auc, f1_score
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#from gensim import models
from tqdm import tqdm
import time
import os

In [2]:
def modelEval(y_true, y_preds, predict_probs):
    auroc = (roc_auc_score(y_true, predict_probs))
    accuracy = (accuracy_score(y_true, y_preds))
    f1 = f1_score(y_true, y_preds)
    precision, recall, threshold = precision_recall_curve(y_true, predict_probs)
    auprc = auc(recall, precision)
    return {"auroc": auroc, "accuracy": accuracy, "auprc": auprc, "f1_score": f1}


def getPredicts(clf, X):
    predict_probs = clf.predict_proba(X)[:, 1] #Predicted probability for the positive label
    predicts = clf.predict(X)
    return predicts, predict_probs


def retrieve_top_words(clf, vectorizer, top_k=5, top_positve_words=True):
    clf_name = clf.__class__.__name__
    if clf_name == 'LogisticRegression':
        coef_arr = np.array(clf.coef_).squeeze()
    elif clf_name == 'RandomForestClassifier':
        coef_arr = np.array(clf.feature_importances_).squeeze()
    else:
        raise (Exception('Classifier is not LR nor RF, cannot retrieve importance coef.'))

    Name_list = vectorizer.get_feature_names()
    if top_positve_words:
        # print('Retrieving Top '+str(top_k)+' words for positive samples')
        top_k_idx = coef_arr.argsort()[::-1][0:top_k]
    else:
        # print('Retrieving Top '+str(top_k)+' words for negative samples')
        top_k_idx = coef_arr.argsort()[0:top_k]
    top_k_words = []
    for idx in top_k_idx:
        top_k_words.append(Name_list[idx])
    # print(top_k_words)
    return (top_k_words)

def plot_ROC(y_true, y_pred, legend, lw):
    '''
    This function plots the ROC based on y_true and y_pred
    :param y_true: The ground truth of the samples
    :param y_pred: The predicted probablity of the samples
    :param legend: Legend of the curve
    :return: None
    '''
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    auroc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw = lw, label = legend + ' AUC=%0.3f' % auroc)

def plot_PRC(y_true, y_pred, legend, lw):
    '''
    This function plots the precision-recall curves based on y_true and y_pred
    :param y_true: The ground truth of the samples
    :param y_pred: The predicted probablity of the samples
    :param legend: Legend of the curve
    :return: None
    '''
    pre, rec, _ = precision_recall_curve(y_true, y_pred)
    auroc = auc(rec, pre)
    plt.plot(rec, pre, lw = lw, label = legend + ' AUC=%0.3f' % auroc)

class w2v_vectorizer():

    def __init__(self, wv):
        '''
        This functin initialize the word2vec model
        :param wv: a gensim word2vec model
        '''
        self.model = wv
        pass

    def transform(self,corpus):
        '''
        This function vectorize each document in the corpus
        :param corpus: pd.series of documents
        :return: feature: vectorized documents
        '''
        feature = np.empty((corpus.shape[0],200))
        sent_lst = list(corpus)
        print('W2V vectorization: get word vectors')
        vec_lst = [[self.model[word] for word in sent if word in self.model.vocab] for sent in tqdm(sent_lst)]
        print('W2V vectorization: get sentence vectors')
        for i in tqdm(list(range(feature.shape[0]))):
            feature[i,:] = np.mean(vec_lst[i],axis=0,keepdims=False) # continuous bag-of-word with average pooling
        return feature


def ErrorAnalysis(preds,GT_label,GT_text):
    '''
    Return DataFrame of FN and FP texts and indexes

    args:
    preds: predicted labels
    GT_label: ground truth labels
    val_text: original text before vectorization

    output:
    Error_DF: DataFrame that contains FN text and index, FP text and index
    '''
    False_Neg = []
    False_Pos = []
    False_Neg_Idx = []
    False_Pos_Idx = []
    for idx in range(preds.shape[0]):
        if preds[idx] == 1 and GT_label[idx] == 0:
            False_Pos.append(GT_text[idx])
            False_Pos_Idx.append(idx)
        elif preds[idx] == 0 and GT_label[idx] == 1:
            False_Neg.append(GT_text[idx])
            False_Neg_Idx.append(idx)
    Error_dict = {'False_Positive':False_Pos,'False_Negative':False_Neg,'False_Pos_Idx':False_Pos_Idx,'False_Neg_Idx':False_Neg_Idx}
    Error_DF = pd.DataFrame.from_dict(Error_dict,orient='index').transpose()
    return Error_DF

In [3]:
#Change here to choose corpus from STEM_TEXT or CLEAN_TEXT
TEXT = 'CLEAN_TEXT'

#Load Dataset
Train_Data, Val_Data = pd.read_csv("./Data/train.csv"),pd.read_csv("./Data/val.csv")
train_corpus, val_corpus= Train_Data[TEXT], Val_Data[TEXT]

#Preprocessing labels form boolean to int
Y_train, Y_val = Train_Data['label'].astype(int), Val_Data['label'].astype(int)

In [9]:
import transformers
from transformers import BertTokenizer, BertModel

In [62]:
import time
tik = time.time()
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
model = BertModel.from_pretrained('dmis-lab/biobert-base-cased-v1.2',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )
tok = time.time()
print('Model loading time: ', tok-tik)

Model loading time:  4.049298048019409


In [13]:
import torch

In [43]:
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

batch_sentences = list(train_corpus)
encoded_inputs = tokenizer(batch_sentences, padding=True, truncation=True, max_length=512, add_special_tokens = True)
#encoded_inputs = tokenizer.encode(batch_sentences, padding=True, truncation=True, max_length=512, add_special_tokens = True)

train_emb=[]#list to store all embeddings
for i,text in enumerate(encoded_inputs['input_ids']):
    
    tokens_tensor = torch.tensor([encoded_inputs['input_ids'][i]])
    segments_tensors = torch.tensor([encoded_inputs['attention_mask'][i]])

    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers. 
    with torch.no_grad():

        outputs = model(tokens_tensor, segments_tensors)

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[2]


    # `hidden_states` has shape [13 x 1 x 22 x 768]

    # `token_vecs` is a tensor with shape [22 x 768]
    token_vecs = hidden_states[-2][0]

    # Calculate the average of all 22 token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)
    train_emb.append(sentence_embedding)

In [44]:
train_emb = np.array(train_emb)
np.save('./Embeddings/BioBERT_CLEAN_train.npy',train_emb)

  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.


In [47]:
train_emb[0].shape

torch.Size([768])

In [45]:
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

import time
tik = time.time()
batch_sentences = list(val_corpus)
encoded_inputs = tokenizer(batch_sentences, padding=True, truncation=True, max_length=512, add_special_tokens = True)
#encoded_inputs = tokenizer.encode(batch_sentences, padding=True, truncation=True, max_length=512, add_special_tokens = True)

val_emb=[]#list to store all embeddings
for i,text in enumerate(encoded_inputs['input_ids']):
    
    tokens_tensor = torch.tensor([encoded_inputs['input_ids'][i]])
    segments_tensors = torch.tensor([encoded_inputs['attention_mask'][i]])

    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers. 
    with torch.no_grad():

        outputs = model(tokens_tensor, segments_tensors)

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[2]


    # `hidden_states` has shape [13 x 1 x 22 x 768]

    # `token_vecs` is a tensor with shape [22 x 768]
    token_vecs = hidden_states[-2][0]

    # Calculate the average of all 22 token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)
    val_emb.append(sentence_embedding)
tok = time.time()

print(tok-tik)

val_emb = np.array(val_emb)
np.save('./Embeddings/BioBERT_CLEAN_val.npy',val_emb)

892.0793886184692




In [51]:
#This is for the ensemble learning
#Change here to choose corpus from STEM_TEXT or CLEAN_TEXT
TEXT = 'CLEAN_TEXT'

#Load Dataset
Train_Data, Fusion_Data, Val_Data = pd.read_csv("./Data/sep_train.csv"), pd.read_csv("./Data/fusion_train.csv"), pd.read_csv("./Data/val.csv")
train_corpus, fusion_corpus, val_corpus= Train_Data[TEXT], Fusion_Data[TEXT], Val_Data[TEXT]

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

import time
tik = time.time()
batch_sentences = list(train_corpus)
encoded_inputs = tokenizer(batch_sentences, padding=True, truncation=True, max_length=512, add_special_tokens = True)
#encoded_inputs = tokenizer.encode(batch_sentences, padding=True, truncation=True, max_length=512, add_special_tokens = True)

train_emb=[]#list to store all embeddings
for i,text in enumerate(encoded_inputs['input_ids']):
    
    tokens_tensor = torch.tensor([encoded_inputs['input_ids'][i]])
    segments_tensors = torch.tensor([encoded_inputs['attention_mask'][i]])

    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers. 
    with torch.no_grad():

        outputs = model(tokens_tensor, segments_tensors)

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[2]


    # `hidden_states` has shape [13 x 1 x 22 x 768]

    # `token_vecs` is a tensor with shape [22 x 768]
    token_vecs = hidden_states[-2][0]

    # Calculate the average of all 22 token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)
    train_emb.append(sentence_embedding.numpy())
tok = time.time()

print(tok-tik)    
train_emb = np.array(train_emb)
np.save('./Embeddings/BioBERT_CLEAN_train_sep.npy',train_emb)

3315.191195011139


In [4]:
#Instancing vectorizer
vectorizer_dict = {
    'BioBERT':None
}

train_emb = np.load('./Embeddings/BioBERT_CLEAN_train.npy',allow_pickle=True)
val_emb = np.load('./Embeddings/BioBERT_CLEAN_val.npy',allow_pickle=True)

X_train_list = []
X_val_list = []

for i in range(train_emb.shape[0]):
    X_train_list.append(train_emb[i].numpy())

X_train = np.row_stack(X_train_list)

for i in range(val_emb.shape[0]):
    X_val_list.append(val_emb[i].numpy())

X_val = np.row_stack(X_val_list)

In [5]:
X_train

array([[ 0.05710662, -0.49839568, -0.3904479 , ..., -0.03714407,
         0.02263509,  0.28453067],
       [ 0.31401798, -0.5441138 , -0.2812715 , ...,  0.03073541,
        -0.31362376,  0.57005215],
       [ 0.10284638, -0.74936634, -0.44196323, ...,  0.1328728 ,
        -0.1418696 ,  0.4265636 ],
       ...,
       [-0.31160975, -0.29974765, -0.1496599 , ..., -0.05437682,
        -0.08385789,  0.6817429 ],
       [-0.12717518, -0.3699243 , -0.3549909 , ...,  0.05982064,
         0.08986731,  0.6321629 ],
       [-0.02630073, -0.3212516 , -0.28552315, ..., -0.0172112 ,
        -0.21310963,  0.51563746]], dtype=float32)

In [5]:
#select each of the vectorizer
for vect_NAME in vectorizer_dict:
    print(vect_NAME)
    vectorizer = vectorizer_dict[vect_NAME]

    clf = LogisticRegression(C = 10,max_iter=1000,penalty='l2')
    
    #bootstrapping experiments: initialize the performance recorders
    vectorization_time = []
    modeling_time = []
    auroc = []
    auprc = []
    f1_score_recorder = []
    accuracy = []
    predicted_prob = []
    train_prob = []
    train_label = []
    for i in range(100):
        np.random.seed(i)
        idx = np.random.choice(train_corpus.shape[0],train_corpus.shape[0],replace=True)
        X_train_boot = X_train[idx]
        Y_train_boot = Y_train[idx]
        train_label.append(Y_train_boot)
        
        
        #Fit vectorizer using train corpus
        tik = time.time()
        tok = time.time()
        vectorization_time.append(tok-tik)
        
        #Fit classifier using logistic regression
        tik = time.time()
        clf.fit(X_train_boot,Y_train_boot)
        tok = time.time()
        modeling_time.append(tok-tik)
        
        
        #Make prediction and evaluate the prediction
        preds, pred_probs = getPredicts(clf, X_val)
        predicted_prob.append(pred_probs)
        result = modelEval(Y_val, preds, pred_probs)
        
        #Get training probability
        train_preds, train_pred_probs = getPredicts(clf, X_train_boot)
        train_prob.append(train_pred_probs)
        
        #print(result)
        auroc.append(result['auroc'])
        auprc.append(result['auprc'])
        accuracy.append(result['accuracy'])
        f1_score_recorder.append(result['f1_score'])
    result_df = pd.DataFrame({'AUROC': auroc,'AUPRC': auprc,'Accuracy': accuracy,'F1_score': f1_score_recorder,'Vectorization_Time': vectorization_time,'Modeling_Time': modeling_time})
    result_df.to_csv('.//Result//Bootstrapping_'+vect_NAME+'.csv', index = False)
    np.save('.//Result//Bootstrapping_predicted_probability_'+vect_NAME+'.npy',np.array(predicted_prob))
    np.save('.//Result//Bootstrapping_predicted_probability_training_'+vect_NAME+'.npy',np.array(train_prob))
    np.save('.//Result//Bootstrapping_label_training_'+vect_NAME+'.npy',np.array(train_label))
    print(result_df.mean())
    print(result_df.quantile(0.025))
    print(result_df.quantile(0.975))

BioBERT


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

AUROC                 0.978923
AUPRC                 0.979658
Accuracy              0.931401
F1_score              0.932507
Vectorization_Time    0.000001
Modeling_Time         8.570311
dtype: float64
AUROC                 9.767317e-01
AUPRC                 9.773895e-01
Accuracy              9.256776e-01
F1_score              9.267743e-01
Vectorization_Time    9.536743e-07
Modeling_Time         5.792262e+00
Name: 0.025, dtype: float64
AUROC                 0.981124
AUPRC                 0.981648
Accuracy              0.936123
F1_score              0.936972
Vectorization_Time    0.000002
Modeling_Time         9.816015
Name: 0.975, dtype: float64


In [61]:
clf.fit(X_train,Y_train)
#Make prediction and evaluate the prediction
preds, pred_probs = getPredicts(clf, X_val)
result = modelEval(Y_val, preds, pred_probs)
print(result)

{'auroc': 0.9833197677358209, 'accuracy': 0.9422738472368885, 'auprc': 0.9837576671105525, 'f1_score': 0.943213296398892}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
