In [5]:
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score, classification_report, roc_auc_score,matthews_corrcoef, precision_recall_curve
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier
import numpy as np


Using TensorFlow backend.


# Training loop with settings 

In [8]:
def extract_ngram(text_representation):
    """Translate a text-representation label into an sklearn ``ngram_range``.

    Parameters
    ----------
    text_representation : str
        One of ``'UNIGRAM'``, ``'BIGRAM'`` or ``'TRIGRAM'`` (case-sensitive).

    Returns
    -------
    tuple of int
        ``(1, 1)``, ``(1, 2)`` or ``(1, 3)`` respectively, i.e. the lower
        bound is always 1 so higher-order settings include the lower orders.

    Raises
    ------
    ValueError
        If ``text_representation`` is not one of the supported labels.
    """
    ngram_ranges = {
        'UNIGRAM': (1, 1),
        'BIGRAM': (1, 2),
        'TRIGRAM': (1, 3),
    }
    try:
        return ngram_ranges[text_representation]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs; both cases mean "unsupported label".
        raise ValueError(
            "Unknown text_representation %r; expected one of %s"
            % (text_representation, sorted(ngram_ranges))
        )

def perform_tf_idf(X_train,X_test,ngram_range,analyzer,use_idf):
    """Vectorize train/test text with a TfidfVectorizer fitted on the train split.

    The vectorizer is configured with ``min_df=10``, ``max_df=1.0``,
    ``max_features=300``, L2 normalisation, sublinear term frequency,
    no stop-word removal and no lower-casing.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Raw documents; the vocabulary is learned from ``X_train`` only.
    ngram_range : tuple of int
        Passed straight to ``TfidfVectorizer``.
    analyzer : str
        Passed straight to ``TfidfVectorizer``.
    use_idf : bool
        ``True`` for TF-IDF weighting, ``False`` for plain (sublinear) TF.

    Returns
    -------
    tuple
        ``(features_train, features_test)`` as dense arrays.
    """
    vectorizer_settings = dict(
        encoding='utf-8',
        ngram_range=ngram_range,
        analyzer=analyzer,
        stop_words=None,
        lowercase=False,
        max_df=1.,
        min_df=10,
        max_features=300,
        norm='l2',
        sublinear_tf=True,
        use_idf=use_idf,
    )
    vectorizer = TfidfVectorizer(**vectorizer_settings)

    # Fit on the training documents only, then reuse the vocabulary for the test set.
    train_matrix = vectorizer.fit_transform(X_train)
    dense_train = train_matrix.toarray()
    test_matrix = vectorizer.transform(X_test)
    dense_test = test_matrix.toarray()
    return dense_train, dense_test

def perform_countvectorizer(X_train,X_test,ngram_range,analyzer,binary=True):
    """Vectorize train/test text into binary term-presence features.

    This function backs the ``'BINARY'`` feature weighting. The original
    implementation built the ``CountVectorizer`` without ``binary=True`` and
    therefore returned raw term counts; the vectorizer is now binary by
    default.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Raw documents; the vocabulary is learned from ``X_train`` only.
    ngram_range : tuple of int
        Passed straight to ``CountVectorizer``.
    analyzer : str
        Passed straight to ``CountVectorizer``.
    binary : bool, default True
        ``True`` yields 0/1 presence indicators; pass ``False`` to get raw
        term counts (the previous behaviour).

    Returns
    -------
    tuple
        ``(features_train, features_test)`` as dense arrays.
    """
    # Same vocabulary limits as the other vectorizers in this notebook.
    min_df, max_df, max_features = 10, 1., 300

    countvect = CountVectorizer(encoding='utf-8',
                                ngram_range=ngram_range,
                                analyzer=analyzer,
                                stop_words=None,
                                lowercase=False,
                                max_df=max_df,
                                min_df=min_df,
                                max_features=max_features,
                                binary=binary)

    # Fit on the training documents only, then reuse the vocabulary for the test set.
    features_train = countvect.fit_transform(X_train).toarray()
    features_test = countvect.transform(X_test).toarray()
    return features_train, features_test

def perform_info_gain_df(X_train,X_test,labels,ngram_range,analyzer):
    """Vectorize text into binary features weighted by per-feature information gain.

    Each document/term cell is 0 when the term is absent and equal to the
    term's information gain (computed on the training split against
    ``labels``) when the term is present.

    Parameters
    ----------
    X_train, X_test : array-like of str (must support ``.astype``)
        Raw documents; vocabulary and information gain come from ``X_train`` only.
    labels : array-like
        Training labels, in the same order as ``X_train``.
    ngram_range : tuple of int
        Passed straight to ``CountVectorizer``.
    analyzer : str
        Passed straight to ``CountVectorizer``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_features, test_features)``, one column per vocabulary term,
        both with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``labels`` and ``X_train`` differ in length.
    """
    # Local import under an alias so the module is never shadowed by a local value.
    from info_gain import info_gain as info_gain_module

    # initialize values and countvectorizer
    X_train = X_train.astype(str)
    X_test = X_test.astype(str)
    min_df, max_df, max_features = 10, 1., 300
    countvect = CountVectorizer(encoding='utf-8', ngram_range=ngram_range, analyzer=analyzer,
                                stop_words=None, lowercase=False, max_df=max_df,
                                min_df=min_df, max_features=max_features, binary=True)

    # transform text data to binary representation
    train_matrix = countvect.fit_transform(X_train).toarray()
    test_matrix = countvect.transform(X_test).toarray()

    # get_feature_names() is gone from recent scikit-learn; fall back for old versions.
    if hasattr(countvect, 'get_feature_names_out'):
        feature_names = list(countvect.get_feature_names_out())
    else:
        feature_names = list(countvect.get_feature_names())

    dftrain = pd.DataFrame(train_matrix, columns=feature_names)

    # Use the labels positionally. Assigning the label Series directly into the
    # frame would align on its (shuffled) original index and scramble the labels.
    label_values = pd.Series(np.asarray(labels))
    if len(label_values) != len(dftrain):
        raise ValueError(
            "labels has %d entries but X_train has %d documents"
            % (len(label_values), len(dftrain))
        )

    # information gain of every vocabulary term with respect to the labels
    ig_per_feature = np.array(
        [info_gain_module.info_gain(label_values, dftrain[name]) for name in feature_names],
        dtype=float,
    )

    # Replace every non-zero indicator by the information gain of its column.
    weighted_train = np.where(train_matrix == 0, 0.0, ig_per_feature)
    weighted_test = np.where(test_matrix == 0, 0.0, ig_per_feature)
    return (pd.DataFrame(weighted_train, columns=feature_names),
            pd.DataFrame(weighted_test, columns=feature_names))

    
def _best_f2_threshold(y_true, positive_scores):
    """Return the probability threshold that maximises F2 for the given scores."""
    precision, recall, thresholds = precision_recall_curve(y_true, positive_scores)
    # precision/recall carry one extra trailing point (precision=1, recall=0)
    # that has no threshold; drop it so indices line up with `thresholds`.
    precision, recall = precision[:-1], recall[:-1]
    denominator = 4 * precision + recall
    f2_values = np.zeros_like(denominator, dtype=float)
    defined = denominator > 0  # F2 is taken as 0 where precision = recall = 0
    f2_values[defined] = 5 * precision[defined] * recall[defined] / denominator[defined]
    return thresholds[int(np.argmax(f2_values))]


def train_classifier(data,model,iterations,cv,smoteennyes,underyes,classweightyes,sampleweightyes,text_representation,wordchar,feature_weighting):
    """Run the full preprocessing pipeline and a randomized hyper-parameter search.

    Steps: stratified 85/15 train/test split, text vectorization, optional
    resampling, randomized search (scored with F2) and evaluation on the
    held-out split, both at the default decision rule and at the
    F2-optimal probability threshold.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'TOKENS'`` text column and a ``'RESPONSIVE'`` label
        column with values 0/1.
    model : str
        ``'svm'``, ``'rf'``, ``'cnb'`` or ``'xgb'``.
    iterations : int
        ``n_iter`` of the randomized search.
    cv : int or CV splitter
        Passed to ``RandomizedSearchCV``.
    smoteennyes, underyes : bool
        Apply SMOTEENN and/or 1:1 random undersampling to the training features.
    classweightyes : bool
        For ``'svm'``: search with ``class_weight='balanced'`` (True) or
        ``None`` (False). ``rf_grid`` pins its own ``class_weight``.
    sampleweightyes : bool
        Fit with balanced per-sample weights.
    text_representation : str
        Label understood by ``extract_ngram``.
    wordchar : str
        Lower-cased and used as the vectorizer ``analyzer``.
    feature_weighting : str
        ``'TF-IDF'``, ``'TF'``, ``'BINARY'`` or ``'INFO-GAIN'``.

    Returns
    -------
    tuple
        ``(f1, f2_after_thresholding, f2_before_thresholding, best_params)``.

    Raises
    ------
    ValueError
        For an unknown ``model`` or ``feature_weighting``.

    Notes
    -----
    The threshold is chosen on the same held-out split it is scored on, so
    the thresholded F2 returned here is optimistic.
    """
    # split train/test; stratified so the minority class is present in both parts
    X_train, X_test, y_train, y_test = train_test_split(data['TOKENS'],
                                                        data['RESPONSIVE'],
                                                        test_size=0.15,
                                                        stratify=data['RESPONSIVE'])
    X_train = X_train.astype(str)
    X_test = X_test.astype(str)

    # extract n-grams from the text
    ngram_range = extract_ngram(text_representation)
    analyzer = wordchar.lower()

    # perform a text representation method on the data
    if feature_weighting == 'TF-IDF':
        features_train, features_test = perform_tf_idf(X_train, X_test, ngram_range, analyzer, True)
    elif feature_weighting == 'TF':
        features_train, features_test = perform_tf_idf(X_train, X_test, ngram_range, analyzer, False)
    elif feature_weighting == 'BINARY':
        features_train, features_test = perform_countvectorizer(X_train, X_test, ngram_range, analyzer)
    elif feature_weighting == 'INFO-GAIN':
        features_train, features_test = perform_info_gain_df(X_train, X_test, y_train, ngram_range, analyzer)
    else:
        raise ValueError("Unknown feature_weighting %r" % (feature_weighting,))

    labels_train = y_train
    labels_test = y_test

    # determine usage of SMOTE
    if smoteennyes:
        features_train, labels_train = SMOTEENN().fit_resample(features_train, labels_train)

    # determine usage of undersampling
    if underyes:
        features_train, labels_train = RandomUnderSampler(sampling_strategy=1).fit_resample(features_train, labels_train)

    # determine usage of class weights
    class_weight = 'balanced' if classweightyes else None

    # initialize estimator and its search space; grids are copied before being
    # adjusted so the module-level dictionaries stay untouched
    if model == 'svm':
        estimator = SVC(random_state=8)
        # honour the classweightyes flag instead of the value baked into svm_grid
        random_grid = dict(svm_grid, class_weight=[class_weight])
    elif model == 'rf':
        estimator = RandomForestClassifier(random_state=8)
        random_grid = rf_grid
    elif model == 'cnb':
        estimator = ComplementNB()
        random_grid = cnb_grid
    elif model == 'xgb':
        estimator = XGBClassifier(random_state=8)
        # negative/positive ratio of the (possibly resampled) training labels;
        # previously computed but never passed on, so scale_pos_weight stayed 1
        label_values = np.asarray(labels_train)
        n_positive = int(np.sum(label_values == 1))
        n_negative = int(np.sum(label_values == 0))
        if n_positive == 0:
            raise ValueError("Training labels contain no positive (1) samples")
        ratio_w = n_negative / float(n_positive)
        random_grid = dict(xgb_grid, scale_pos_weight=[ratio_w])
    else:
        raise ValueError("Unknown model %r; expected 'svm', 'rf', 'cnb' or 'xgb'" % (model,))

    # init the random search algorithm
    random_search = RandomizedSearchCV(estimator=estimator,
                                       param_distributions=random_grid,
                                       n_iter=iterations,
                                       scoring=ftwo_scorer,
                                       cv=cv,
                                       verbose=1,
                                       refit=True)

    # check for usage of sample weights and fit random search algorithm
    if sampleweightyes:
        sample_weight = compute_sample_weight(class_weight='balanced', y=labels_train)
        random_search.fit(features_train, labels_train, sample_weight=sample_weight)
    else:
        random_search.fit(features_train, labels_train)

    y_pred = random_search.predict(features_test)
    positive_scores = random_search.predict_proba(features_test)[:, 1]

    f2_score_before_thresholding = fbeta_score(labels_test, y_pred, beta=2)

    # find the F2-optimal threshold and re-label the test predictions with it
    t = _best_f2_threshold(labels_test, positive_scores)
    new_labels = adjusted_classes(positive_scores, t)

    # calculate f2 score after thresholding
    f2_score = fbeta_score(labels_test, new_labels, beta=2)

    return f1_score(labels_test, y_pred), f2_score, f2_score_before_thresholding, random_search.best_params_


# Module-level defaults and hyper-parameter search spaces for the estimators.
# NOTE: `class_weight` and `ratio_w` are evaluated once, right here, when the
# grids are built; re-assigning these names later does not change the grids.
class_weight = 'balanced'
ratio_w = 1

# SVC search space; probability=True is required for predict_proba.
svm_grid = {
    'C': [.01, .1, .4, .6, 1, 1.5],
    'kernel': ['rbf', 'poly', 'linear'],
    'gamma': [.01, .1, 1],
    'degree': [1, 2, 3, 4, 5],
    'probability': [True],
    'class_weight': [class_weight],
}

# XGBClassifier search space.
xgb_grid = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5],
    'scale_pos_weight': [ratio_w],
}

# RandomForestClassifier search space.
rf_grid = {
    # 200, 400, ..., 2000
    'n_estimators': list(range(200, 2001, 200)),
    # 'auto' was only an alias of 'sqrt' for classifiers and is rejected by
    # recent scikit-learn releases, so only 'sqrt' is kept.
    'max_features': ['sqrt'],
    # 10, 20, ..., 110
    'max_depth': list(range(10, 111, 10)),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'class_weight': ['balanced'],
}

# ComplementNB search space.
# NOTE(review): alpha=0 means no smoothing - confirm this value is intended.
cnb_grid = {
    'alpha': [0, 0.3, 0.6, 0.8, 1],
    'fit_prior': [False, True],
    'norm': [False, True],
}


def adjusted_classes(y_scores, t):
    """
    Turn scores into hard 0/1 labels using the decision threshold ``t``.

    A score greater than or equal to ``t`` becomes 1, anything below becomes 0.
    The result is a plain list with one label per input score.
    """
    hard_labels = []
    for score in y_scores:
        if score >= t:
            hard_labels.append(1)
        else:
            hard_labels.append(0)
    return hard_labels


# F-beta scorer with beta=2; used as the `scoring` argument of the
# RandomizedSearchCV built in train_classifier.
ftwo_scorer = make_scorer(fbeta_score, beta=2)

## Testing function

In [9]:
def do_testing(data,model,best_params,smoteennyes,underyes,classweightyes,sampleweightyes,text_representation,wordchar,feature_weighting):
    """Evaluate already-tuned hyper-parameters on a fresh stratified split.

    The data is split 85/15 into train/test and the train part again 85/15
    into train/validation. A model fitted on the inner train part picks the
    F2-optimal probability threshold on the validation part; a second model
    fitted on the full train part is then scored on the test part, both with
    its default decision rule and with that threshold.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'TOKENS'`` text column and a ``'RESPONSIVE'`` label
        column with values 0/1.
    model : str
        ``'svm'``, ``'rf'``, ``'cnb'`` or ``'xgb'``.
    best_params : dict
        Hyper-parameters for the chosen model, keyed by parameter name.
    smoteennyes, underyes : bool
        Apply SMOTEENN and/or 1:1 random undersampling to the training features.
    classweightyes : bool
        For ``'svm'``: use ``class_weight='balanced'`` (True) or ``None``
        (False). The random forest always uses ``'balanced'``.
    sampleweightyes : bool
        Fit with balanced per-sample weights.
    text_representation : str
        Label understood by ``extract_ngram``.
    wordchar : str
        Lower-cased and used as the vectorizer ``analyzer``.
    feature_weighting : str
        ``'TF-IDF'``, ``'TF'``, ``'BINARY'`` or ``'INFO-GAIN'``.

    Returns
    -------
    tuple
        ``(f2_score, f2_score_without_thresholding, roc_score, precision,
        fpr, fnr, recall)``; every metric except the second and third is
        computed on the thresholded test predictions.

    Raises
    ------
    ValueError
        For an unknown ``model`` or ``feature_weighting``, or when the
        training labels contain no positive sample for ``'xgb'``.
    """
    ngram_range = extract_ngram(text_representation)
    analyzer = wordchar.lower()
    class_weight = 'balanced' if classweightyes else None

    def vectorize(texts_fit, texts_eval, y_fit):
        # Fit the chosen vectorizer on texts_fit and apply it to both splits.
        texts_fit, texts_eval = texts_fit.astype(str), texts_eval.astype(str)
        if feature_weighting == 'TF-IDF':
            return perform_tf_idf(texts_fit, texts_eval, ngram_range, analyzer, True)
        if feature_weighting == 'TF':
            return perform_tf_idf(texts_fit, texts_eval, ngram_range, analyzer, False)
        if feature_weighting == 'BINARY':
            return perform_countvectorizer(texts_fit, texts_eval, ngram_range, analyzer)
        if feature_weighting == 'INFO-GAIN':
            return perform_info_gain_df(texts_fit, texts_eval, y_fit, ngram_range, analyzer)
        raise ValueError("Unknown feature_weighting %r" % (feature_weighting,))

    def build_estimator(y_fit):
        # Instantiate the requested estimator with the tuned hyper-parameters.
        if model == 'svm':
            return SVC(C=best_params['C'], kernel=best_params['kernel'],
                       gamma=best_params['gamma'], degree=best_params['degree'],
                       probability=best_params['probability'], class_weight=class_weight)
        if model == 'rf':
            return RandomForestClassifier(n_estimators=best_params['n_estimators'],
                                          max_depth=best_params['max_depth'],
                                          min_samples_split=best_params['min_samples_split'],
                                          min_samples_leaf=best_params['min_samples_leaf'],
                                          bootstrap=best_params['bootstrap'],
                                          class_weight='balanced')
        if model == 'cnb':
            return ComplementNB(alpha=best_params['alpha'], fit_prior=best_params['fit_prior'],
                                norm=best_params['norm'])
        if model == 'xgb':
            # Count with numpy: after resampling the labels may be an ndarray,
            # which has no value_counts().
            label_values = np.asarray(y_fit)
            n_positive = int(np.sum(label_values == 1))
            n_negative = int(np.sum(label_values == 0))
            if n_positive == 0:
                raise ValueError("Training labels contain no positive (1) samples")
            ratio_w = n_negative / float(n_positive)
            return XGBClassifier(min_child_weight=best_params['min_child_weight'],
                                 gamma=best_params['gamma'],
                                 subsample=best_params['subsample'],
                                 colsample_bytree=best_params['colsample_bytree'],
                                 max_depth=best_params['max_depth'],
                                 scale_pos_weight=ratio_w)
        raise ValueError("Unknown model %r; expected 'svm', 'rf', 'cnb' or 'xgb'" % (model,))

    def fit_and_predict(texts_fit, y_fit, texts_eval):
        # Vectorize, optionally resample, fit, and return (hard labels,
        # positive-class probabilities) for texts_eval.
        features_fit, features_eval = vectorize(texts_fit, texts_eval, y_fit)

        # check whether SMOTE has to be performed
        if smoteennyes:
            features_fit, y_fit = SMOTEENN().fit_resample(features_fit, y_fit)

        # check whether undersampling has to be performed
        if underyes:
            features_fit, y_fit = RandomUnderSampler(sampling_strategy=1).fit_resample(features_fit, y_fit)

        estimator = build_estimator(y_fit)

        # check whether sample weights have to be applied and fit estimator
        if sampleweightyes:
            sample_weight = compute_sample_weight(class_weight='balanced', y=y_fit)
            estimator.fit(features_fit, y_fit, sample_weight=sample_weight)
        else:
            estimator.fit(features_fit, y_fit)

        return estimator.predict(features_eval), estimator.predict_proba(features_eval)[:, 1]

    def best_f2_threshold(y_true, positive_scores):
        # Probability threshold with the highest F2 on the given scores.
        precision, recall, thresholds = precision_recall_curve(y_true, positive_scores)
        # precision/recall carry one extra trailing point without a threshold
        precision, recall = precision[:-1], recall[:-1]
        denominator = 4 * precision + recall
        f2_values = np.zeros_like(denominator, dtype=float)
        defined = denominator > 0  # F2 is taken as 0 where precision = recall = 0
        f2_values[defined] = 5 * precision[defined] * recall[defined] / denominator[defined]
        return thresholds[int(np.argmax(f2_values))]

    # split train/test
    X_train, X_test, y_train, y_test = train_test_split(data['TOKENS'],
                                                        data['RESPONSIVE'],
                                                        test_size=0.15, stratify=data['RESPONSIVE'])

    # split train/val
    X_train_1, X_val, y_train_1, y_val = train_test_split(X_train,
                                                          y_train,
                                                          test_size=0.15, stratify=y_train)

    # validation phase: pick the threshold without touching the test split
    _, val_scores = fit_and_predict(X_train_1, y_train_1, X_val)
    t = best_f2_threshold(y_val, val_scores)

    # test phase: refit on the full training part and score on the test split
    y_pred, test_scores = fit_and_predict(X_train, y_train, X_test)

    # calculate f2 score before thresholding
    f2_score_without_thresholding = fbeta_score(y_test, y_pred, beta=2)

    # calculate new labels and f2 score at the validated threshold
    new_labels = adjusted_classes(test_scores, t)
    f2_score = fbeta_score(y_test, new_labels, beta=2)

    # calculate roc-auc score, precision, fpr, fnr and recall
    roc_score = roc_auc_score(y_test, test_scores)
    # labels=[0, 1] keeps the matrix 2x2 even if one class is never seen/predicted
    tn, fp, fn, tp = confusion_matrix(y_test, new_labels, labels=[0, 1]).ravel()
    precision = tp/(tp+fp)
    fpr = fp/(fp+tn)
    fnr = fn/(fn+tp)
    recall = tp/(tp+fn)

    return f2_score, f2_score_without_thresholding, roc_score, precision, fpr, fnr, recall



def testing_statistics(data,model,best_params,iterations,smoteennyes,underyes,classweightyes,sampleweightyes,text_representation,wordchar,feature_weighting):
    """Repeat ``do_testing`` ``iterations`` times and collect every score.

    All arguments except ``iterations`` are forwarded unchanged to
    ``do_testing``.

    The original author's note on this function reads "odd number!!!" -
    presumably ``iterations`` should be odd; confirm why before relying on it.

    Returns
    -------
    tuple of list
        Seven lists, each with one entry per run, in this order: thresholded
        F2, un-thresholded F2, ROC-AUC, precision, false-positive rate,
        false-negative rate, recall.
    """
    # one accumulator per metric, in the order do_testing returns them
    score_lists = tuple([] for _ in range(7))

    for _ in range(iterations):
        f2_thresh, f2_no_thresh, roc_score, precision, fpr, fnr, recall = do_testing(
            data, model, best_params, smoteennyes, underyes, classweightyes,
            sampleweightyes, text_representation, wordchar, feature_weighting)
        run_scores = (f2_thresh, f2_no_thresh, roc_score, precision, fpr, fnr, recall)
        for bucket, value in zip(score_lists, run_scores):
            bucket.append(value)

    return score_lists