In [280]:
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score, classification_report, roc_auc_score,matthews_corrcoef, precision_recall_curve
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import fbeta_score, make_scorer

## Scenario 2: M1+M2+..+T1+T2+..+T300

In [1]:
def scenario_2(datasets,datasets_meta,datasets_names,best_text_settings):
    """Run scenario 2 (meta-features + text features) on each of the three datasets.

    For every dataset and every undersampling setting, the data is split,
    trained and tested 100 times. One row per (dataset, algorithm,
    undersampling) combination is appended to the returned score frame.

    Parameters
    ----------
    datasets, datasets_meta, datasets_names : sequences indexed 0..2
        Text datasets, the matching meta-feature frames and their names.
        Each meta frame must contain a 'RESPONSIVE' column and each text
        dataset a 'TOKENS' column.
    best_text_settings : object passed to ``find_best_settings``
        Source of the best text representation per dataset.

    Returns
    -------
    pandas.DataFrame
        Mean F2 scores (with/without thresholding) and counts of how often a
        meta-feature appeared in the top 20 / top 10 features.
    """
    #create score dataframe
    scores_df=pd.DataFrame(columns=['Dataset','Algorithm','Smoten','Undersampling','Classweights','Sampleweights','F2-score (with thresholding','F2-score (without) thresholding','# times meta-feature in top20','# times meta-feature in top10','highest importance'])

    # find best text representation of current email dataset
    best_settings=find_best_settings(best_text_settings)

    #perform scenario 2 for all 3 dataset (iterate 3 times)
    for i in range(0,3):
        #return data objects
        dataset,metadataset,dataset_name,row=return_parts(datasets,datasets_meta,datasets_names,best_settings,i)
        # NOTE: this adds the TOKENS column to the caller's meta frame in place
        metadataset['TOKENS']=dataset['TOKENS']

        #linear svm for transparency
        for model in ['svm']:
            for undersampling in [True,False]:

                # score lists are reset per configuration so that the mean
                # reported for one undersampling setting does not include the
                # scores of the other
                f2_scores=[]
                f2_scores_no_thresh=[]

                #init variables that count how many times metafeatures occur in top20 and top 10features of model
                times_meta_features_top20=0
                times_meta_features_top10=0
                highest_importance=0

                #validate 100 times
                for validations in range(0,100):

                    #split train test
                    X_train,X_test,y_train,y_test=train_test_split(metadataset.drop('RESPONSIVE',axis=1),metadataset['RESPONSIVE'], test_size=0.15)

                    #train classifier
                    # NOTE(review): this call passes 10 positional arguments and
                    # unpacks (f2, f2_no_thresh, best_params, random_search). The
                    # train_classifier visible in this notebook takes 11
                    # parameters (data, model, ...) and returns
                    # (f1, f2, f2_no_thresh, best_params) -- confirm which
                    # version of train_classifier this cell is meant to use.
                    f2_score_train,f2_no_thresh_train,best_params,random_search=train_classifier(X_train,y_train,model,2,2,True,False,row['Text representation'],row['Word/char'],row['Weighting'])

                    #test and validate classifier
                    f2_score,f2_no_thresh,estimator,columns=test_classifier(model,best_params,X_train,y_train,X_test,y_test,random_search,undersampling,row['Text representation'],row['Word/char'],row['Weighting'])

                    #check for importances in top20 and top10
                    # check_importances returns (in_top10, in_top20, importance);
                    # call it once and unpack in that order
                    in_top10,in_top20,importance=check_importances(columns,estimator,model,best_params)
                    times_meta_features_top20+=in_top20
                    times_meta_features_top10+=in_top10

                    #check if this is the highest importance
                    if importance>highest_importance:
                        highest_importance=importance

                    #update score lists
                    f2_scores.append(f2_score)
                    f2_scores_no_thresh.append(f2_no_thresh)

                #update scores dataframe
                scores_df.loc[len(scores_df)]=[dataset_name,model,True,undersampling,True,True,np.mean(f2_scores),np.mean(f2_scores_no_thresh),times_meta_features_top20,times_meta_features_top10,highest_importance]
    return scores_df

In [287]:
def test_classifier(model,best_params,X_train,y_train,X_test,y_test,random_search,undersampling,text_representation,wordchar,weighting):
    """Refit a classifier with the given best parameters and score it on the test split.

    The 'TOKENS' column of both splits is replaced by its vectorised text
    representation, the training split is resampled with SMOTEENN (and
    optionally undersampled to a 1:1 ratio), and the estimator built by
    ``fill_in_algo_params`` is fitted and evaluated.

    Parameters
    ----------
    model : str
        Algorithm key understood by ``fill_in_algo_params``.
    best_params : dict
        Hyper-parameters for that algorithm.
    X_train, X_test : pandas.DataFrame
        Feature frames that include a 'TOKENS' column.
    y_train, y_test : array-like
        Binary labels.
    random_search : any
        Unused; kept for call compatibility.
    undersampling : bool
        Whether to apply ``RandomUnderSampler(sampling_strategy=1)`` after SMOTEENN.
    text_representation, wordchar, weighting : str
        Forwarded to ``obtain_text_representation``.

    Returns
    -------
    tuple
        (F2 after thresholding, F2 at the default threshold, fitted estimator,
        training column index after resampling).

    Notes
    -----
    The decision threshold is chosen on the same test labels it is then
    scored on, so the thresholded F2 is an optimistic estimate.
    """
    text_train,text_test=obtain_text_representation(X_train['TOKENS'].astype(str),X_test['TOKENS'].astype(str),text_representation,wordchar,weighting)
    X_train=pd.concat([X_train.drop('TOKENS',axis=1), text_train], axis = 1)
    X_test=pd.concat([X_test.drop('TOKENS',axis=1), text_test], axis = 1)

    # resample the training split only; the test split is left untouched
    X_train, y_train = SMOTEENN().fit_resample(X_train,y_train)
    if undersampling:
        X_train, y_train = RandomUnderSampler(sampling_strategy=1).fit_resample(X_train,y_train)

    # ratio_w is the module-level default used for XGBoost's scale_pos_weight
    estimator=fill_in_algo_params(model,best_params,ratio_w)
    estimator.fit(X_train,y_train)
    y_pred=estimator.predict(X_test)
    positive_probas=estimator.predict_proba(X_test)[:,1]

    f2_score_before_thresholding=fbeta_score(y_test,y_pred,beta=2)
    print('f2 score before thresholding {}'.format(f2_score_before_thresholding))
    precision, recall, thresholds = precision_recall_curve(y_test, positive_probas)

    # precision/recall carry one more entry than thresholds (a final point
    # with no threshold); drop it so every F2 value maps to a real threshold
    precision=np.asarray(precision[:-1],dtype=float)
    recall=np.asarray(recall[:-1],dtype=float)

    # F2 = 5PR / (4P + R); defined as 0 where both P and R are 0 to avoid NaN
    denominator=4*precision+recall
    f2_per_threshold=np.divide(5*precision*recall,denominator,out=np.zeros_like(denominator),where=denominator>0)

    t=thresholds[int(np.argmax(f2_per_threshold))]
    new_labels=adjusted_classes(positive_probas,t)
    f2_score=fbeta_score(y_test,new_labels,beta=2)

    return f2_score,f2_score_before_thresholding,estimator,X_train.columns

In [2]:
def train_classifier(data,model,iterations,cv,smoteennyes,underyes,classweightyes,sampleweightyes,text_representation,wordchar,feature_weighting):
    """Preprocess the text, tune a classifier with a random search and score it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'TOKENS' (text) and a 'RESPONSIVE' (label) column.
    model : {'svm', 'rf', 'cnb', 'xgb'}
        Algorithm to tune.
    iterations : int
        ``n_iter`` of the randomized search.
    cv : int or CV splitter
        Cross-validation setting of the randomized search.
    smoteennyes : bool
        Resample the training features with SMOTEENN.
    underyes : bool
        Undersample the training features to a 1:1 class ratio.
    classweightyes : bool
        Use ``class_weight='balanced'`` for svm/rf; ``None`` otherwise.
    sampleweightyes : bool
        Fit the search with balanced sample weights.
    text_representation : str
        Forwarded to ``extract_ngram``.
    wordchar : str
        Analyzer name; lower-cased before use.
    feature_weighting : {'TF-IDF', 'TF', 'BINARY', 'INFO-GAIN'}
        Vectorisation scheme.

    Returns
    -------
    tuple
        (F1 at default threshold, F2 after thresholding, F2 at default
        threshold, best parameters found by the search).

    Raises
    ------
    ValueError
        If ``model`` or ``feature_weighting`` is not one of the supported values.
    """
    # fail fast on unsupported options, before any expensive work is done
    if model not in ('svm','rf','cnb','xgb'):
        raise ValueError("Unknown model {!r}; expected one of 'svm', 'rf', 'cnb', 'xgb'".format(model))
    if feature_weighting not in ('TF-IDF','TF','BINARY','INFO-GAIN'):
        raise ValueError("Unknown feature_weighting {!r}; expected one of 'TF-IDF', 'TF', 'BINARY', 'INFO-GAIN'".format(feature_weighting))

    #split train/test
    X_train, X_test, y_train, y_test = train_test_split(data['TOKENS'],   # ONLY FOR ENRON
                                                    data['RESPONSIVE'],
                                                    test_size=0.15)
    X_train=X_train.astype(str)
    X_test=X_test.astype(str)

    #extract n-grams from the text
    ngram_range = extract_ngram(text_representation)
    analyzer = wordchar.lower()

    # perform a text representation method on the data
    if feature_weighting =='TF-IDF':
        features_train,features_test=perform_tf_idf(X_train,X_test,ngram_range,analyzer,True)
    elif feature_weighting =='TF':
        features_train,features_test=perform_tf_idf(X_train,X_test,ngram_range,analyzer,False)
    elif feature_weighting=='BINARY':
        features_train,features_test=perform_countvectorizer(X_train,X_test,ngram_range,analyzer)
    else:  # 'INFO-GAIN'
        features_train,features_test=perform_info_gain_df(X_train,X_test,y_train,ngram_range,analyzer)

    labels_train = y_train
    labels_test = y_test

    #determine usage of SMOTE
    if smoteennyes:
        features_train, labels_train = SMOTEENN().fit_resample(features_train, labels_train)

    #determine usage of undersampling
    if underyes:
        features_train, labels_train =RandomUnderSampler(sampling_strategy=1).fit_resample(features_train,labels_train)

    #determine usage of class weights
    class_weight='balanced' if classweightyes else None

    #initialize estimator
    # The svm/rf grids are copied with the class_weight chosen above so that
    # classweightyes actually takes effect (the module-level grids always say
    # 'balanced').
    if model=='svm':
        estimator= SVC(random_state=8)
        random_grid=dict(svm_grid,class_weight=[class_weight])
    elif model=='rf':
        estimator= RandomForestClassifier(random_state=8)
        random_grid=dict(rf_grid,class_weight=[class_weight])
    elif model=='cnb':
        estimator= ComplementNB()
        random_grid=cnb_grid
    else:  # 'xgb'
        estimator=XGBClassifier(random_state=8)
        # NOTE(review): the original computed a negative/positive class ratio
        # here but never used it; scale_pos_weight still comes from xgb_grid.
        # Confirm whether the ratio was meant to be searched over.
        random_grid=xgb_grid

    # init the random search algorithm
    random_search = RandomizedSearchCV(estimator=estimator,
                                   param_distributions=random_grid,
                                   n_iter=iterations,
                                   scoring=ftwo_scorer,
                                   cv=cv,
                                   verbose=1,
                                   refit=True)

    #check for usage of sample weights and fit random search algorithm
    if sampleweightyes:
        sample_weight = compute_sample_weight(class_weight='balanced', y=labels_train)
        random_search.fit(features_train, labels_train,sample_weight=sample_weight)
    else:
        random_search.fit(features_train, labels_train)

    y_pred=random_search.predict(features_test)
    positive_probas=random_search.predict_proba(features_test)[:,1]

    f2_score_before_thresholding=fbeta_score(labels_test,y_pred,beta=2)
    precision, recall, thresholds = precision_recall_curve(labels_test, positive_probas)

    #calculate f2 scores for each threshold
    # precision/recall carry one more entry than thresholds (a final point
    # with no threshold); drop it so every F2 value maps to a real threshold
    precision=np.asarray(precision[:-1],dtype=float)
    recall=np.asarray(recall[:-1],dtype=float)
    # F2 = 5PR / (4P + R); defined as 0 where both P and R are 0 to avoid NaN
    denominator=4*precision+recall
    f2_per_threshold=np.divide(5*precision*recall,denominator,out=np.zeros_like(denominator),where=denominator>0)

    #find optimal threshold
    t=thresholds[int(np.argmax(f2_per_threshold))]

    #calculate new labels
    new_labels=adjusted_classes(positive_probas,t)

    #calculate f2 score after thresholding
    f2_score=fbeta_score(labels_test,new_labels,beta=2)

    return f1_score(labels_test,y_pred),f2_score,f2_score_before_thresholding,random_search.best_params_

# Module-level defaults shared by the search grids and by fill_in_algo_params.
class_weight='balanced'
# Default scale_pos_weight for XGBoost (1 means no re-weighting of positives).
ratio_w=1

# Search space for sklearn.svm.SVC. probability=True is required because the
# evaluation code calls predict_proba.
svm_grid = {'C': [.01,.1,.4,.6,1,1.5],
              'kernel': [ 'rbf', 'poly','linear'],
              'gamma':  [.01, .1, 1],
              'degree': [1, 2, 3, 4, 5],
              'probability': [True],
              'class_weight':[class_weight]
             }

# Search space for xgboost.XGBClassifier.
xgb_grid = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'scale_pos_weight':[ratio_w]
}

# Search space for sklearn.ensemble.RandomForestClassifier.
# 'auto' was dropped from max_features: for classifiers it was an alias of
# 'sqrt' (so the grid held the same setting twice) and newer scikit-learn
# releases reject it.
rf_grid={'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
               'max_features': ['sqrt'],
               'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4],
               'bootstrap': [True, False],
                'class_weight':['balanced']}

# Search space for sklearn.naive_bayes.ComplementNB.
cnb_grid={'alpha':[0,0.3,0.6,0.8,1],
          'fit_prior':[False,True],
          'norm':[False,True]}


def adjusted_classes(y_scores, t):
    """
    Turn scores into 0/1 labels: a score at or above the threshold `t`
    becomes 1, anything below it becomes 0. Returns a plain list.
    """
    labels = []
    for score in y_scores:
        labels.append(int(score >= t))
    return labels


# Scorer wrapping fbeta_score with beta=2 (F2); passed as the `scoring`
# argument of the randomized hyper-parameter search.
ftwo_scorer = make_scorer(fbeta_score, beta=2)

In [3]:
def check_importances(columns, estimator,model,best_params):
    """Report whether any meta-feature ranks among the most important features.

    Importances are read from ``estimator.feature_importances_`` when the
    estimator exposes it. For an SVM with a linear kernel, which has no such
    attribute, the absolute values of ``coef_`` are used instead. When neither
    is available (e.g. a non-linear SVM kernel) no ranking can be made and
    zeros are returned.

    Parameters
    ----------
    columns : sequence of str
        Feature names, in the column order the estimator was fitted on.
    estimator : fitted estimator
    model : str
        Algorithm key; 'svm' enables the ``coef_`` fallback.
    best_params : any
        Unused; kept for call compatibility.

    Returns
    -------
    tuple
        ``(in_top10, in_top20, highest_importance)`` where the first two are
        0/1 flags and the last is the importance of the best-ranked
        meta-feature (0 when none is in the top 20).

    Notes
    -----
    Relies on the module-level ``meta_feature_names`` collection.
    """
    #obtain importances
    raw_importances=getattr(estimator,'feature_importances_',None)
    if raw_importances is None and model=='svm' and getattr(estimator,'kernel',None)=='linear':
        coefficients=estimator.coef_
        if hasattr(coefficients,'toarray'):
            # coef_ is sparse when the estimator was fitted on sparse input
            coefficients=coefficients.toarray()
        # one row for a binary problem; average |weights| if there are more
        raw_importances=np.abs(np.atleast_2d(np.asarray(coefficients))).mean(axis=0)
    if raw_importances is None:
        return 0,0,0

    #rank all features, most important first
    importances=pd.DataFrame({'Feature names':list(columns),
                              'Importance':np.asarray(raw_importances,dtype=float)})
    ranked=importances.sort_values('Importance',ascending=False)
    is_meta=ranked['Feature names'].isin(meta_feature_names)

    #check if in top20 / top10 (slicing is safe with fewer than 20 features)
    return_20=int(is_meta.iloc[:20].any())
    return_10=int(is_meta.iloc[:10].any())

    #check what the importance is
    highest_importance=0
    if return_20:
        # first meta row of the descending ranking = best-ranked meta-feature
        highest_importance=float(ranked.loc[is_meta,'Importance'].iloc[0])
    return return_10,return_20,highest_importance

        
def fill_in_algo_params(model,best_params,ratio_w):
    """Build an unfitted estimator for `model` from its tuned hyper-parameters.

    Parameters
    ----------
    model : {'svm', 'rf', 'cnb', 'xgb'}
        Algorithm key.
    best_params : dict
        Tuned hyper-parameters; the keys read depend on `model`.
    ratio_w : number
        ``scale_pos_weight`` for XGBoost; ignored by the other algorithms.

    Returns
    -------
    estimator
        A new, unfitted classifier.

    Raises
    ------
    ValueError
        If `model` is not one of the supported keys.
    """
    #init algorithms and their best parameters according to the text classifiers
    if model=='svm':
        # class_weight comes from the module-level default, not from best_params
        estimator= SVC(C=best_params['C'],kernel=best_params['kernel'],gamma=best_params['gamma'],degree=best_params['degree'],probability=best_params['probability'],class_weight=class_weight)
    elif model=='rf':
        # forward the tuned max_features when the search produced one;
        # otherwise leave the library default in place
        optional_params={}
        if 'max_features' in best_params:
            optional_params['max_features']=best_params['max_features']
        estimator= RandomForestClassifier(n_estimators=best_params['n_estimators'],max_depth=best_params['max_depth'],min_samples_split=best_params['min_samples_split'],min_samples_leaf=best_params['min_samples_leaf'],bootstrap=best_params['bootstrap'],class_weight='balanced',**optional_params)
    elif model=='cnb':
        estimator= ComplementNB(alpha=best_params['alpha'],fit_prior=best_params['fit_prior'],norm=best_params['norm'])
    elif model=='xgb':
        estimator=XGBClassifier(min_child_weight=best_params['min_child_weight'],gamma=best_params['gamma'],subsample=best_params['subsample'],colsample_bytree=best_params['colsample_bytree'],max_depth=best_params['max_depth'], scale_pos_weight=ratio_w)
    else:
        raise ValueError("Unknown model {!r}; expected one of 'svm', 'rf', 'cnb', 'xgb'".format(model))
    return estimator

def return_parts(datasets,datasets_meta,datasets_names,best_settings,i):
    """Pick the i-th entry of each collection.

    Returns a tuple ``(dataset, metadataset, dataset_name, row)`` taken from
    `datasets`, `datasets_meta`, `datasets_names` and `best_settings`
    respectively.
    """
    sources=(datasets,datasets_meta,datasets_names,best_settings)
    return tuple(source[i] for source in sources)

def find_best_settings(best_settings):
    """Select, per dataset, the settings row with the highest F2-score.

    Parameters
    ----------
    best_settings : mapping of str -> pandas.DataFrame
        Must provide the keys '2020', 'whiskey' and 'enron'; each frame needs
        an 'F2-score' column.

    Returns
    -------
    list of pandas.Series
        The best row for '2020', 'whiskey' and 'enron', in that order.
    """
    #find best parameter configuration
    # read from the argument, not from a notebook-level global of a similar name
    best_rows=[]
    for dataset_key in ('2020','whiskey','enron'):
        settings=best_settings[dataset_key]
        best_rows.append(settings.loc[settings['F2-score'].idxmax()])
    return best_rows


def obtain_text_representation(X_train,X_test,representation,wordchar,weighting):
    """Vectorise the train/test text with TF or TF-IDF weighting.

    Parameters
    ----------
    X_train, X_test : pandas.Series of str
        Raw text per document.
    representation : str
        N-gram setting, forwarded to ``extract_ngram``.
    wordchar : str
        Analyzer name; lower-cased before use.
    weighting : {'TF', 'TF-IDF'}
        Term weighting scheme.

    Returns
    -------
    tuple
        ``(features_train, features_test)`` as returned by ``perform_tf_idf``.

    Raises
    ------
    ValueError
        If `weighting` is not 'TF' or 'TF-IDF'. Previously such values fell
        through and the raw text was returned unchanged.
    """
    #transform text data to the correct representation
    if weighting not in ('TF','TF-IDF'):
        raise ValueError("Unsupported weighting {!r}; expected 'TF' or 'TF-IDF'".format(weighting))
    ngram_range=extract_ngram(representation)
    use_idf=(weighting=='TF-IDF')
    return perform_tf_idf(X_train,X_test,ngram_range,wordchar.lower(),use_idf)

def extract_ngram(text_representation):
    """Translate a representation name into a vectorizer ``ngram_range`` tuple.

    'UNIGRAM' -> (1, 1), 'BIGRAM' -> (1, 2), 'TRIGRAM' -> (1, 3).

    Raises
    ------
    ValueError
        If `text_representation` is not one of the three supported names.
    """
    #obtain correct format
    ngram_ranges={'UNIGRAM':(1,1),
                  'BIGRAM':(1,2),
                  'TRIGRAM':(1,3)}
    if text_representation not in ngram_ranges:
        raise ValueError("Unknown text representation {!r}; expected 'UNIGRAM', 'BIGRAM' or 'TRIGRAM'".format(text_representation))
    return ngram_ranges[text_representation]

def perform_tf_idf(X_train,X_test,ngram_range,analyzer,use_idf,min_df=10,max_df=1.,max_features=300):
    """Fit a TfidfVectorizer on the training text and transform both splits.

    Parameters
    ----------
    X_train, X_test : pandas.Series of str
        Raw text; their indexes are reused for the returned frames.
    ngram_range : tuple
        Forwarded to the vectorizer.
    analyzer : str
        Forwarded to the vectorizer.
    use_idf : bool
        True for TF-IDF weighting, False for (sublinear) TF only.
    min_df, max_df, max_features : optional
        Vocabulary pruning settings forwarded to the vectorizer; the defaults
        are the values that were previously hard-coded.

    Returns
    -------
    tuple of pandas.DataFrame
        Dense feature frames for train and test, one column per term.
    """
    #perform tf-idf vectorization
    tfidf = TfidfVectorizer(encoding='utf-8',
                            ngram_range=ngram_range,
                            analyzer=analyzer,
                            stop_words=None,
                            lowercase=False,
                            max_df=max_df,
                            min_df=min_df,
                            max_features=max_features,
                            norm='l2',
                            sublinear_tf=True,
                            use_idf=use_idf)
    # the vocabulary is learned from the training split only
    features_train = tfidf.fit_transform(X_train).toarray()
    features_test = tfidf.transform(X_test).toarray()

    # get_feature_names() is gone from newer scikit-learn releases; prefer its
    # replacement and fall back for older installations
    if hasattr(tfidf,'get_feature_names_out'):
        feature_names=tfidf.get_feature_names_out()
    else:
        feature_names=tfidf.get_feature_names()

    features_train=pd.DataFrame(features_train, columns=feature_names,index=X_train.index)
    features_test=pd.DataFrame(features_test, columns=feature_names,index=X_test.index)
    return features_train,features_test

def which_algorithm(model,y_train_1):
    """Return the base estimator, search grid and class ratio for `model`.

    Parameters
    ----------
    model : {'svm', 'rf', 'cnb', 'xgb'}
        Algorithm key.
    y_train_1 : pandas.Series
        Training labels; only read for 'xgb', where the count of label 0 is
        divided by the count of label 1.

    Returns
    -------
    tuple
        ``(estimator, random_grid, ratio_w)``; ``ratio_w`` is None for every
        algorithm except 'xgb'.

    Raises
    ------
    ValueError
        If `model` is not one of the supported keys.
    """
    #init estimator for train_classifier function
    ratio_w=None
    if model=='svm':
        estimator= SVC(random_state=8)
        random_grid=svm_grid
    elif model=='rf':
        estimator= RandomForestClassifier(random_state=8)
        random_grid=rf_grid
    elif model=='cnb':
        estimator= ComplementNB()
        random_grid=cnb_grid
    elif model=='xgb':
        estimator=XGBClassifier(random_state=8)
        label_counts=y_train_1.value_counts()
        ratio_w=label_counts[0]/label_counts[1]
        random_grid=xgb_grid
    else:
        raise ValueError("Unknown model {!r}; expected one of 'svm', 'rf', 'cnb', 'xgb'".format(model))
    return estimator,random_grid, ratio_w