In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score, classification_report, roc_curve,roc_auc_score,matthews_corrcoef, precision_recall_curve
from sklearn.metrics import fbeta_score, make_scorer,accuracy_score
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from matplotlib import pyplot
from sklearn.metrics import average_precision_score
import os
from sklearn.metrics import roc_auc_score

Using TensorFlow backend.


In [6]:
# Optimal settings per text classifier per dataset (fill in by hand).
# One row per dataset; the on/off switches are stored as the strings 'TRUE'/'FALSE'.
top_models = pd.DataFrame(columns=[
    'Dataset', 'Text representation', 'Word/char', 'Weighting', 'Algorithm',
    'Smoteenn', 'Undersampling', 'Classweights', 'Sampleweights',
])
for _row_idx, _row_values in enumerate([
    ['WHISKEY', 'BIGRAM', 'WORD', 'TF', 'rf', 'TRUE', 'TRUE', 'FALSE', 'TRUE'],
    ['2020', 'TRIGRAM', 'WORD', 'TF-IDF', 'svm', 'TRUE', 'TRUE', 'TRUE', 'TRUE'],
    ['ENRON', 'UNIGRAM', 'WORD', 'TF', 'rf', 'FALSE', 'TRUE', 'TRUE', 'FALSE'],
]):
    top_models.loc[_row_idx] = _row_values


In [3]:
def cross_domain_report(boolean,transfername,t,random_search,y_pred,X_target,y_target,labels_test,visualisatie_controle,balanced_learning):
    """Report the performance metrics of a fitted model on a target domain.

    Metrics are computed twice: once for the model's default predictions
    (``y_pred`` against ``labels_test``) and once after re-labelling the
    target samples with the decision threshold ``t``.

    Parameters
    ----------
    boolean, transfername, visualisatie_controle, balanced_learning
        Accepted for call compatibility; not used in this function.
    t : float
        Probability threshold applied to the positive-class scores.
    random_search
        Fitted estimator/search object exposing ``predict_proba``.
    y_pred
        Default (un-thresholded) predictions for the target samples.
    X_target
        Feature matrix of the target samples.
    y_target
        True labels belonging to ``X_target``.
    labels_test
        True labels belonging to ``y_pred``.

    Returns
    -------
    tuple
        ``(roc_auc, f2_before_thresholding, fpr, fnr, f2_after_thresholding,
        fpr_after_thresholding, fnr_after_thresholding, accuracy)``.
        A rate whose denominator is zero is reported as NaN.
    """

    def _error_rates(y_true, y_hat):
        # labels=[0, 1] keeps the matrix 2x2 even when one class is absent,
        # so unpacking into four cells cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
        false_positive_rate = fp / (fp + tn) if (fp + tn) else float('nan')
        false_negative_rate = fn / (fn + tp) if (fn + tp) else float('nan')
        return false_positive_rate, false_negative_rate

    # metrics of the default predictions
    f2_score_before_thresholding=fbeta_score(labels_test,y_pred,beta=2)
    fpr,fnr=_error_rates(labels_test,y_pred)

    # predict probabilities and obtain new labels with the supplied threshold
    probas=random_search.predict_proba(X_target)[:,1]
    new_labels=adjusted_classes(probas,t)

    # every thresholded metric is scored against y_target, because new_labels
    # are derived from X_target
    f2_score=fbeta_score(y_target,new_labels,beta=2)
    acc=accuracy_score(y_target,new_labels)
    roc_auc=roc_auc_score(y_target,probas)
    fpr_after_thresh,fnr_after_thresh=_error_rates(y_target,new_labels)

    return roc_auc,f2_score_before_thresholding,fpr,fnr,f2_score,fpr_after_thresh,fnr_after_thresh,acc

def do_cross_domains(transfername,dfsource,dftarget,algorithm,Smoteenn,Undersampling,Classweights,Sampleweights,Text_representation,Wordchar,Weighting,visualisatie_controle,balanced_learning):
    """Train on a source domain, predict on a target domain and score the result.

    Parameters
    ----------
    transfername
        Label of the transfer direction; forwarded to ``cross_domain_report``.
    dfsource, dftarget
        Source and target data. ``dftarget`` must contain a ``RESPONSIVE``
        column holding the true labels.
    algorithm, Smoteenn, Undersampling, Classweights, Sampleweights,
    Text_representation, Wordchar, Weighting
        Model settings forwarded to ``train_classifier``.
    visualisatie_controle, balanced_learning
        Forwarded to ``cross_domain_report``.

    Returns
    -------
    tuple
        ``(roc_auc, f2, fpr, fnr, f2_after_thresh, fpr_after_thresh,
        fnr_after_thresh, acc)`` as produced by ``cross_domain_report``.
    """

    # train classifier to predict from domain A to domain B
    # NOTE(review): this call expects a cross-domain train_classifier taking
    # (source, target, ...) and returning (search, threshold, X_target,
    # best_params). The train_classifier defined in this notebook takes a single
    # dataset and returns four scores - confirm which definition is loaded.
    random_search,t,X_target,best_params=train_classifier(dfsource,dftarget,algorithm,10,2,Smoteenn,Undersampling,Classweights,Sampleweights,Text_representation,Wordchar,Weighting)

    # obtain performance metrics of the predictions
    y_target=dftarget['RESPONSIVE']
    # The first argument of cross_domain_report is unused there; the original
    # passed an undefined name `boolean`, which raised NameError.
    roc_auc,f2,fpr,fnr,f2_after_thresh,fpr_after_thresh,fnr_after_thresh,acc=cross_domain_report(None,transfername,t,random_search,random_search.predict(X_target),X_target,y_target,y_target,visualisatie_controle,balanced_learning)

    return roc_auc,f2,fpr,fnr,f2_after_thresh,fpr_after_thresh,fnr_after_thresh,acc

def apply_best_settings_cross_domain(top_models,dfwhiskey,df2020,visualisatie_controle,balanced_learning,n_runs=100,output_path="Plain ACCURACY baseline 1;1 gesampled.xlsx"):
    """Run all cross-domain predictions using the optimal source-domain settings.

    Both transfer directions (whiskey -> 2020 and 2020 -> whiskey) are
    evaluated ``n_runs`` times each. After every run the accumulated score
    table is written to ``output_path`` so partial results survive an
    interrupted session.

    Parameters
    ----------
    top_models : pandas.DataFrame
        Settings table; row 0 is used for the whiskey source domain and row 1
        for the 2020 source domain.
    dfwhiskey, df2020 : pandas.DataFrame
        The two datasets; each must contain a ``RESPONSIVE`` column.
    visualisatie_controle
        Forwarded unchanged to ``do_cross_domains``.
    balanced_learning
        When truthy, both datasets are first passed through
        ``create_balanced_sets`` (defined outside this cell).
    n_runs : int, optional
        Number of repetitions per direction (default 100).
    output_path : str, optional
        Excel file the score table is written to after each run.

    Returns
    -------
    pandas.DataFrame
        One row per run, indexed 0..n_runs-1 for whiskey -> 2020 and
        n_runs..2*n_runs-1 for 2020 -> whiskey.
    """

    # init score dataframe
    plain_baseline=pd.DataFrame(columns=['Source','Target','F2-score','ROC-AUC','FPR','FNR','tussenkolom','F2 on 10% target thresholding','FPR after thresholding','FNR after thresholding','Accuracy'])

    # obtain opt parameters
    row_w=top_models.loc[0]
    row_2020=top_models.loc[1]

    # apply 1:1 sampling or not
    if balanced_learning:
        dfwhiskey,df2020=create_balanced_sets(dfwhiskey,dfwhiskey['RESPONSIVE'],df2020,df2020['RESPONSIVE'])

    # (transfer name, source label, target label, source data, target data, settings row)
    directions=[
        ('whiskey2020','whiskey','2020',dfwhiskey,df2020,row_w),
        ('2020whiskey','2020','whiskey',df2020,dfwhiskey,row_2020),
    ]

    for block,(transfername,source_name,target_name,dfsource,dftarget,row) in enumerate(directions):
        for count in range(block*n_runs,(block+1)*n_runs):
            # train and predict across domains; do_cross_domains takes 13
            # arguments and returns 8 values (the original call passed an extra
            # unbound `boolean` and unpacked a ninth value)
            roc_auc,f2,fpr,fnr,f2_after_thresh,fpr_after_thresh,fnr_after_thresh,acc=do_cross_domains(transfername,dfsource,dftarget,row['Algorithm'],row['Smoteenn'],row['Undersampling'],row['Classweights'],row['Sampleweights'],row['Text representation'],row['Word/char'],row['Weighting'],visualisatie_controle,balanced_learning)

            # update score dataframe
            plain_baseline.loc[count]=[source_name,target_name,f2,roc_auc,fpr,fnr,'-',f2_after_thresh,fpr_after_thresh,fnr_after_thresh,acc]

            # write scores to directory every iteration
            plain_baseline.to_excel(output_path)

    return plain_baseline


In [4]:
def extract_ngram(text_representation):
    """Translate a text-representation name into an sklearn ``ngram_range``.

    Parameters
    ----------
    text_representation : str
        One of ``'UNIGRAM'``, ``'BIGRAM'`` or ``'TRIGRAM'``.

    Returns
    -------
    tuple of int
        ``(1, 1)``, ``(1, 2)`` or ``(1, 3)`` respectively.

    Raises
    ------
    ValueError
        If ``text_representation`` is not one of the supported names.
    """
    ngram_ranges = {
        'UNIGRAM': (1, 1),
        'BIGRAM': (1, 2),
        'TRIGRAM': (1, 3),
    }
    try:
        return ngram_ranges[text_representation]
    except (KeyError, TypeError):
        # fail with a clear message instead of an UnboundLocalError on return
        raise ValueError(
            "unknown text representation %r; expected one of %s"
            % (text_representation, sorted(ngram_ranges))
        )

def perform_tf_idf(X_train,X_test,ngram_range,analyzer,use_idf):
    """Vectorize train and test text with a ``TfidfVectorizer``.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``. The vocabulary is limited to 300 features that occur in at
    least 10 training documents; term frequencies are sub-linearly scaled and
    rows are L2-normalised. ``use_idf`` is passed straight to the vectorizer.

    Returns
    -------
    tuple
        ``(features_train, features_test)`` as dense arrays.
    """
    vectorizer_settings = dict(
        encoding='utf-8',
        ngram_range=ngram_range,
        analyzer=analyzer,
        stop_words=None,
        lowercase=False,
        max_df=1.,
        min_df=10,
        max_features=300,
        norm='l2',
        sublinear_tf=True,
        use_idf=use_idf,
    )
    vectorizer = TfidfVectorizer(**vectorizer_settings)

    # fit on the training text only, then reuse the learned vocabulary
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)
    return train_matrix.toarray(), test_matrix.toarray()

def perform_countvectorizer(X_train,X_test,ngram_range,analyzer):
    """Vectorize train and test text into binary presence/absence features.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``. The vocabulary is limited to 300 features that occur in at
    least 10 training documents.

    Returns
    -------
    tuple
        ``(features_train, features_test)`` as dense 0/1 arrays.
    """
    #initialize values for countvectorizer
    min_df,max_df,max_features = 10, 1., 300

    countvect = CountVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,analyzer=analyzer,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        # binary=True was missing, so the 'BINARY' weighting
                        # produced raw counts instead of 0/1 indicators
                        binary=True)

    #transform text data to binary features
    features_train = countvect.fit_transform(X_train).toarray()
    features_test = countvect.transform(X_test).toarray()
    return features_train,features_test

def perform_info_gain_df(X_train,X_test,labels,ngram_range,analyzer):
    """Vectorize text into binary features weighted by information gain.

    A binary ``CountVectorizer`` is fitted on ``X_train``. For every feature
    the information gain with respect to ``labels`` is computed on the
    training data; each non-zero cell in the train and test matrices is then
    replaced by that feature's information gain.

    Parameters
    ----------
    X_train, X_test
        Text collections supporting ``.astype(str)``.
    labels
        Training labels, matched to ``X_train`` by position.
    ngram_range, analyzer
        Passed to ``CountVectorizer``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(features_train, features_test)`` with one column per feature.
    """
    # imported under an alias so the per-feature gain values below cannot
    # shadow the module
    from info_gain import info_gain as info_gain_module

    #initialize values and countvectorizer
    X_train=X_train.astype(str)
    X_test=X_test.astype(str)
    min_df,max_df,max_features = 10, 1., 300
    countvect = CountVectorizer(encoding='utf-8',ngram_range=ngram_range,analyzer=analyzer,stop_words=None,
                                lowercase=False,max_df=max_df,min_df=min_df,max_features=max_features,binary=True)

    #transform text data to binary representation
    train_matrix = countvect.fit_transform(X_train)
    test_matrix = countvect.transform(X_test)

    # get_feature_names() was removed from recent scikit-learn releases;
    # prefer the replacement when it exists
    if hasattr(countvect, 'get_feature_names_out'):
        feature_names = list(countvect.get_feature_names_out())
    else:
        feature_names = countvect.get_feature_names()

    dftrain = pd.DataFrame(train_matrix.toarray(), columns=feature_names)
    dftest = pd.DataFrame(test_matrix.toarray(), columns=feature_names)

    # Attach labels by position. Assigning the label Series directly would
    # align on its (shuffled) index against the fresh 0..n-1 index of dftrain
    # and silently mismatch rows. Keeping the labels out of the feature frame
    # also avoids a clash with a token literally named 'RESPONSIVE'.
    responsive = pd.Series(np.asarray(labels), index=dftrain.index, name='RESPONSIVE')

    #initialize dictionary with info gain per feature
    ig_dict = {}
    for column in feature_names:
        ig_dict[column] = info_gain_module.info_gain(responsive, dftrain[column])

    # replace every present (non-zero) feature by its information gain
    for column, gain in ig_dict.items():
        dftrain[column] = np.where(dftrain[column] == 0, 0, gain)
        dftest[column] = np.where(dftest[column] == 0, 0, gain)
    return dftrain, dftest

    
def _as_bool(flag):
    """Interpret a switch given either as a bool or as a 'TRUE'/'FALSE' string."""
    if isinstance(flag, str):
        return flag.strip().upper() in ('TRUE', 'YES', '1')
    return bool(flag)


def _best_f2_threshold(y_true, scores):
    """Return the probability threshold that maximises F2 on (y_true, scores)."""
    precision, recall, thresholds = precision_recall_curve(y_true, scores)
    # precision/recall carry one extra trailing point (precision=1, recall=0)
    # that has no threshold; drop it so indices line up with `thresholds`
    precision, recall = precision[:-1], recall[:-1]
    denominator = 4 * precision + recall
    # F2 = 5PR / (4P + R); define it as 0 where precision and recall are both 0
    f2_values = np.divide(5 * precision * recall, denominator,
                          out=np.zeros_like(denominator, dtype=float),
                          where=denominator > 0)
    return thresholds[int(np.argmax(f2_values))]


def train_classifier(data,model,iterations,cv,smoteennyes,underyes,classweightyes,sampleweightyes,text_representation,wordchar,feature_weighting):
    """Run all preprocessing steps and train a classifier in a standardized way.

    The data is split 85/15 into train and test, vectorized, optionally
    resampled, and a randomized hyper-parameter search (scored on F2) is
    fitted. The fitted model is evaluated on the held-out split, both with its
    default predictions and after moving the decision threshold to the value
    that maximises F2.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``TOKENS`` (text) and ``RESPONSIVE`` (label).
    model : str
        ``'svm'``, ``'rf'``, ``'cnb'`` or ``'xgb'``.
    iterations : int
        Number of parameter settings sampled by the randomized search.
    cv
        Cross-validation setting passed to ``RandomizedSearchCV``.
    smoteennyes, underyes, classweightyes, sampleweightyes
        Switches for SMOTEENN resampling, 1:1 random undersampling, balanced
        class weights and balanced sample weights. Accepts booleans or the
        strings ``'TRUE'``/``'FALSE'``.
    text_representation : str
        ``'UNIGRAM'``, ``'BIGRAM'`` or ``'TRIGRAM'``.
    wordchar : str
        Analyzer name; lower-cased before being passed to the vectorizer.
    feature_weighting : str
        ``'TF-IDF'``, ``'TF'``, ``'BINARY'`` or ``'INFO-GAIN'``.

    Returns
    -------
    tuple
        ``(f1, f2_after_thresholding, f2_before_thresholding, best_params)``.

    Raises
    ------
    ValueError
        If ``model`` or ``feature_weighting`` is not a supported value.
    """
    # the settings table stores switches as 'TRUE'/'FALSE' strings, which are
    # both truthy; normalise them to real booleans first
    smoteennyes = _as_bool(smoteennyes)
    underyes = _as_bool(underyes)
    classweightyes = _as_bool(classweightyes)
    sampleweightyes = _as_bool(sampleweightyes)

    #split train/test
    # NOTE (original author): the TOKENS column is marked "only for ENRON"
    X_train, X_test, y_train, y_test = train_test_split(data['TOKENS'],
                                                    data['RESPONSIVE'],
                                                    test_size=0.15)
    X_train=X_train.astype(str)
    X_test=X_test.astype(str)    # original author: not sure whether this cast is needed

    #extract n-grams from the text
    ngram_range = extract_ngram(text_representation)
    analyzer = wordchar.lower()

    # perform a text representation method on the data
    if feature_weighting =='TF-IDF':
        features_train,features_test=perform_tf_idf(X_train,X_test,ngram_range,analyzer,True)
    elif feature_weighting =='TF':
        features_train,features_test=perform_tf_idf(X_train,X_test,ngram_range,analyzer,False)
    elif feature_weighting=='BINARY':
        features_train,features_test=perform_countvectorizer(X_train,X_test,ngram_range,analyzer)
    elif feature_weighting=='INFO-GAIN':
        features_train,features_test=perform_info_gain_df(X_train,X_test,y_train,ngram_range,analyzer)
    else:
        raise ValueError("unknown feature weighting %r" % (feature_weighting,))

    labels_train = y_train
    labels_test = y_test

    #determine usage of SMOTE
    if smoteennyes:
        features_train, labels_train = SMOTEENN().fit_resample(features_train, labels_train)

    #determine usage of undersampling
    if underyes:
        features_train, labels_train =RandomUnderSampler(sampling_strategy=1).fit_resample(features_train,labels_train)

    #determine usage of class weights
    class_weight = 'balanced' if classweightyes else None

    #initialize estimator
    if model=='svm':
        estimator= SVC(random_state=8)
        # copy the shared grid and apply the requested class weighting; the
        # module-level grid always says 'balanced'
        random_grid=dict(svm_grid, class_weight=[class_weight])
    elif model=='rf':
        estimator= RandomForestClassifier(random_state=8)
        random_grid=dict(rf_grid, class_weight=[class_weight])
    elif model=='cnb':
        estimator= ComplementNB()
        random_grid=cnb_grid
    elif model=='xgb':
        estimator=XGBClassifier(random_state=8)
        # NOTE(review): the original computed a negative/positive class ratio
        # here but never passed it to the estimator or the grid; xgb_grid is
        # defined outside this cell - confirm how class imbalance is handled.
        random_grid=xgb_grid
    else:
        raise ValueError("unknown model %r" % (model,))

    # init the random search algorithm
    random_search = RandomizedSearchCV(estimator=estimator,
                                   param_distributions=random_grid,
                                   n_iter=iterations,
                                   scoring=ftwo_scorer,
                                   cv=cv,
                                   verbose=1,
                                   refit=True)

    #check for usage of sample weights and fit random search algorithm
    if sampleweightyes:
        sample_weight = compute_sample_weight(class_weight='balanced', y=labels_train)
        random_search.fit(features_train, labels_train,sample_weight=sample_weight)
    else:
        random_search.fit(features_train, labels_train)

    y_pred=random_search.predict(features_test)
    positive_scores=random_search.predict_proba(features_test)[:,1]

    f2_score_before_thresholding=fbeta_score(labels_test,y_pred,beta=2)

    #find optimal threshold
    # NOTE(review): the threshold is tuned and evaluated on the same held-out
    # split, so the thresholded F2 is an optimistic estimate.
    t=_best_f2_threshold(y_test, positive_scores)

    #calculate new labels
    new_labels=adjusted_classes(positive_scores,t)

    #calculate f2 score after thresholding
    f2_score=fbeta_score(y_test,new_labels,beta=2)

    return f1_score(labels_test,y_pred),f2_score,f2_score_before_thresholding,random_search.best_params_


# Module-level defaults and hyper-parameter grids for the randomized search.
# NOTE(review): train_classifier also references cnb_grid and xgb_grid, which
# are not defined in this cell - confirm they are created elsewhere.
class_weight='balanced'
ratio_w=1

# Some entries only matter for certain kernels ('degree' for 'poly'; 'gamma' is
# not used by 'linear'). probability=True is required so the fitted SVC offers
# predict_proba.
svm_grid = {'C': [.01,.1,.4,.6,1,1.5],
              'kernel': [ 'rbf', 'poly','linear'],
              'gamma':  [.01, .1, 1],
              'degree': [1, 2, 3, 4, 5],
              'probability': [True],
              'class_weight':[class_weight]
             }

rf_grid={'n_estimators': list(range(200, 2001, 200)),   # 200, 400, ..., 2000
               # 'auto' was only an alias of 'sqrt' for classifiers and is
               # rejected by recent scikit-learn, so only 'sqrt' is listed
               'max_features': ['sqrt'],
               'max_depth': list(range(10, 111, 10)),   # 10, 20, ..., 110
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4],
               'bootstrap': [True, False],
                'class_weight':['balanced']}

def adjusted_classes(y_scores, t):
    """
    Turn scores into hard 0/1 labels using the decision threshold ``t``:
    a score greater than or equal to ``t`` becomes 1, anything else 0.
    Returns a list in the same order as ``y_scores``.
    """
    labels = []
    for score in y_scores:
        if score >= t:
            labels.append(1)
        else:
            labels.append(0)
    return labels


# F2 scorer (fbeta_score with beta=2) used as the `scoring` argument of the
# randomized search in train_classifier.
ftwo_scorer = make_scorer(fbeta_score, beta=2)