In [7]:
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score, classification_report, roc_auc_score,matthews_corrcoef, precision_recall_curve
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import jaccard_score
from sklearn.linear_model import LogisticRegression

Using TensorFlow backend.


#  Combine optimal classifier with meta-features (stacking) Scenario 1 : M1+M2+..+T

## Create integrated metafeatures+text prediction dataframe

In [8]:
# create predictions from text classifier per row and save it as feature
def stack_and_validate(datasets,datasets_meta, datasets_names, n_iterations=20):
    """Evaluate the stacked (meta-features + text prediction) combiner per meta dataset.

    For every entry of ``datasets_meta`` the text classifier is stacked on the
    meta-features ``n_iterations`` times; each time a logistic-regression
    combiner is tuned with a randomized search (F2 scoring, 4-fold CV) and
    scored on the held-out test rows. The scores are averaged over the
    iterations and written as one row of the returned frame.

    Parameters
    ----------
    datasets : sequence
        Text datasets, indexed in parallel with ``datasets_meta``; each item is
        forwarded to ``stack_and_obtain`` / ``create_integrated_dataframe``.
    datasets_meta : sequence
        Meta-feature frames, one per dataset.
    datasets_names : sequence
        Dataset names, indexed in parallel with ``datasets_meta``.
    n_iterations : int, default 20
        Number of repeated stack/fit/score rounds per configuration.

    Returns
    -------
    pandas.DataFrame
        One row per (meta dataset, undersampling setting) with the mean F2,
        ROC-AUC, precision, false-positive rate and false-negative rate.
    """
    score_df=pd.DataFrame(columns=['Name metadataset','SMOTEENN','Undersampling','Classweights','Sampleweights','F2-score','roc_auc score','precision','fpr','fnr'])
    row_number=0

    for dataset_position, dataset_meta in enumerate(datasets_meta):
        dataset=datasets[dataset_position]
        dataset_name=datasets_names[dataset_position]

        for undersampling in [True]:

            # per-iteration scores, averaged once all iterations are done
            f2_scores=[]
            roc_auc_scores=[]
            precision_scores=[]
            fpr_scores=[]
            fnr_scores=[]

            for _ in range(n_iterations):

                # obtain the text-classifier predictions for a fresh train/test split
                stacked=stack_and_obtain(dataset,dataset_name)

                # integrate them with the meta-features
                X_train_meta,y_train_meta,X_test_meta,y_test_meta=create_integrated_dataframe(dataset,dataset_name,dataset_meta,stacked)

                # prepare the integrated dataset for training
                X_train_meta,y_train_meta,class_weight=do_prep_steps(X_train_meta,y_train_meta,True,undersampling,True)

                # combiner function (logistic regression); the class weighting
                # returned by do_prep_steps is applied so the model matches the
                # 'Classweights' value reported in the result row
                estimator= LogisticRegression(random_state=0,class_weight=class_weight)

                # randomized hyper-parameter search, refit on the best setting
                random_search = RandomizedSearchCV(estimator=estimator, param_distributions=logreg_grid,n_iter=10,scoring=ftwo_scorer, cv=4, verbose=1, refit=True)
                random_search.fit(X_train_meta, y_train_meta)

                # predict labels and probabilities on the held-out rows
                y_pred=random_search.predict(X_test_meta)
                y_probas=random_search.predict_proba(X_test_meta)

                # F2 on the default decision threshold (no threshold tuning here)
                f2_scores.append(fbeta_score(y_test_meta,y_pred,beta=2))
                roc_auc_scores.append(roc_auc_score(y_test_meta,y_probas[:,1]))

                # rates derived from the confusion matrix; a zero denominator
                # yields nan (NumPy division), which then propagates into the mean
                tn, fp, fn, tp = confusion_matrix(y_test_meta,y_pred).ravel()
                precision_scores.append(tp/(tp+fp))
                fpr_scores.append(fp/(fp+tn))
                fnr_scores.append(fn/(fn+tp))

            score_df.loc[row_number]=[dataset_name,True,undersampling,True,False,np.mean(f2_scores),np.mean(roc_auc_scores),np.mean(precision_scores),np.mean(fpr_scores),np.mean(fnr_scores)]
            row_number+=1
    return score_df

# Search space for the logistic-regression combiner: both penalties, 20
# log-spaced values of C between 1e-4 and 1e4, and liblinear as the only solver.
logreg_grid = dict(
    penalty=['l1', 'l2'],
    C=np.logspace(-4, 4, 20),
    solver=['liblinear'],
)

In [4]:
def create_integrated_dataframe(dataset,dataset_name,dataset_meta,stacked):
    """Join the meta-features with the text-classifier predictions.

    Parameters
    ----------
    dataset, dataset_name
        Only used to build the stacked frame when ``stacked`` is ``None``.
    dataset_meta : pandas.DataFrame
        Meta-features including the target column ``'RESPONSIVE'``.
    stacked : pandas.DataFrame or None
        Result of ``stack_and_obtain``. When given it is used as is, so the
        (expensive, randomised) stacking run is not repeated; pass ``None`` to
        have it computed here.

    Returns
    -------
    tuple
        ``(X_train_meta, y_train_meta, X_test_meta, y_test_meta)``. The X frames
        carry an extra ``'Text predictions'`` column; the y frames are
        single-column frames holding ``'RESPONSIVE'``.
    """
    if stacked is None:
        stacked=stack_and_obtain(dataset,dataset_name)

    # row of the best text-classifier configuration (highest F2-score)
    best_row=stacked.loc[pd.to_numeric(stacked['F2-score']).idxmax()]
    train_ids=best_row['train_ids']
    test_ids=best_row['test_ids']

    is_target=dataset_meta.columns == 'RESPONSIVE'

    # copies, so adding the prediction column never writes into a view of dataset_meta
    X_train_meta=dataset_meta.loc[train_ids,~is_target].copy()
    y_train_meta=dataset_meta.loc[train_ids,is_target]
    X_test_meta=dataset_meta.loc[test_ids,~is_target].copy()
    y_test_meta=dataset_meta.loc[test_ids,is_target]

    X_train_meta['Text predictions']=best_row['train pred']
    X_test_meta['Text predictions']=best_row['test pred']

    return X_train_meta,y_train_meta,X_test_meta,y_test_meta

In [9]:
def stack_and_obtain(dataset,dataset_name):
    """Train the best text classifier on a fresh split and collect its predictions.

    Looks up the configuration table for ``dataset_name`` in the module-level
    ``optimal_dict``, takes the row with the highest ``'F2-score'`` and repeats
    a stratified 85/15 train/test split until the test F2 passes the loop
    threshold. The out-of-fold train predictions, the test predictions, the
    number of attempts (``'pogingen'``), the test score and the row ids of the
    accepted split are stored in that row.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns ``'TOKENS'`` and ``'RESPONSIVE'``.
    dataset_name
        Key into ``optimal_dict``.

    Returns
    -------
    pandas.DataFrame
        Object-dtype copy of the configuration table with the extra columns
        filled in for the best row.
    """
    # work on a copy so the shared optimal_dict entry is not modified by this call
    optimal_configs=optimal_dict[dataset_name].copy()
    for new_column in ('train pred','test pred','pogingen','test_score','train_ids','test_ids'):
        optimal_configs[new_column]=''
    # object dtype so single cells can hold arrays / index objects
    optimal_configs=optimal_configs.astype('object')

    optimal_index=pd.to_numeric(optimal_configs['F2-score']).idxmax()
    row=optimal_configs.loc[optimal_index]

    test_f2=0
    pogingen=0

    foutmarge=row['F2-score']-0.05

    # NOTE(review): row['F2-score']-foutmarge reduces to 0.05, so any split
    # with a test F2 of at least ~0.05 is accepted. If the intent was "within
    # 0.05 of the cross-validated F2" the condition should compare against
    # foutmarge itself — confirm before changing, as that loop may need an
    # attempt limit.
    while test_f2<row['F2-score']-foutmarge:
        pogingen+=1

        # split train/test
        X_train, X_test, y_train, y_test = train_test_split(dataset['TOKENS'], dataset['RESPONSIVE'], test_size=0.15,stratify=dataset['RESPONSIVE'])

        # remember which rows ended up in which part
        optimal_configs.loc[optimal_index,'train_ids']=X_train.index
        optimal_configs.loc[optimal_index,'test_ids']=X_test.index

        # n-gram range for the vectorizer
        ngram_range = extract_ngram(row['Text representation'])

        # apply feature weighting (vectorisation)
        X_train,X_test=perform_feature_weighting(row['Weighting'],X_train,X_test,ngram_range,row['Word/char'].lower())

        # out-of-fold train predictions, test predictions and test F2
        test_pred,train_pred,test_f2=stacking(X_train,y_train,X_test,y_test,8,row['Sampleweights'],row)

    # store the outcome of the accepted split
    optimal_configs.loc[optimal_index,'train pred']=train_pred
    optimal_configs.loc[optimal_index,'test pred']=test_pred
    optimal_configs.loc[optimal_index,'pogingen']=str(pogingen)
    optimal_configs.loc[optimal_index,'test_score']=str(test_f2)

    return optimal_configs

def stacking(train,y,test,y_test,n_fold,sampleweightyes,row):
    """Produce out-of-fold train predictions and test predictions of the text model.

    Parameters
    ----------
    train, test
        Vectorised feature matrices (NumPy array or pandas object).
    y : pandas.Series
        Training labels, positionally aligned with ``train``.
    y_test
        Test labels, forwarded to ``predict_on_test``.
    n_fold : int
        Number of stratified folds.
    sampleweightyes : bool
        Forwarded to ``fit_model``.
    row
        Configuration row; the keys ``'Smoteenn'``, ``'Undersampling'``,
        ``'Classweights'``, ``'best params'`` and ``'Algorithm'`` are read.

    Returns
    -------
    tuple
        ``(test_pred, train_pred, test_f2)``. ``train_pred`` is a 1-D array in
        which entry ``i`` is the positive-class probability for training row
        ``i``, predicted by the fold model that did not see that row.
    """
    def _take_rows(data, positions):
        # positional row selection for both pandas objects and NumPy arrays
        return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]

    # No random_state: without shuffle=True it has no effect and current
    # scikit-learn rejects the combination.
    folds=StratifiedKFold(n_splits=n_fold)

    train_pred=np.empty(train.shape[0],dtype=float)
    model=None

    for train_indices,val_indices in folds.split(train,y.values):

        # slice out the rows of this fold
        X_train,X_val=_take_rows(train,train_indices),_take_rows(train,val_indices)
        y_train=y.iloc[train_indices]

        # prepare the fold's training part
        X_train,y_train,class_weight=do_prep_steps(X_train,y_train,row['Smoteenn'],row['Undersampling'],row['Classweights'])

        # build and fit the configured model
        model=select_model(y_train,row['best params'],class_weight,row['Algorithm'])
        model=fit_model(sampleweightyes,model,X_train,y_train)

        # write each validation probability at the position of its own row;
        # appending per fold would leave the values in fold order
        train_pred[val_indices]=model.predict_proba(X_val)[:,1]

    # the estimator of the last fold is refit on the full training set
    test_pred,test_f2=predict_on_test(model,train,test,y,sampleweightyes,y_test)
    return test_pred,train_pred,test_f2


def predict_on_test(model,X_train,X_test,y_train,sampleweightyes,y_test):
    """Tune a decision threshold on a hold-out, then score the model on the test set.

    Step 1 fits ``model`` on 85% of the training data and picks the threshold
    that ``find_optimal_threshold`` returns for the remaining 15%. Step 2
    refits the same estimator on all training data, predicts the test set and
    computes F2 for the labels obtained with that threshold.

    Returns
    -------
    tuple
        ``(positive-class probabilities for X_test, test F2)``.
    """
    # step 1: threshold from a stratified 15% validation part
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        X_train, y_train, test_size=0.15, stratify=y_train
    )
    model = fit_model(sampleweightyes, model, fit_X, fit_y)
    threshold = find_optimal_threshold(holdout_y, model.predict_proba(holdout_X))

    # step 2: refit on everything and evaluate on the test set with that threshold
    model = fit_model(sampleweightyes, model, X_train, y_train)
    positive_probas = model.predict_proba(X_test)[:, 1]
    thresholded_labels = adjusted_classes(positive_probas, threshold)
    test_f2 = fbeta_score(y_test, thresholded_labels, beta=2)
    return positive_probas, test_f2

# F2 scorer: fbeta_score with beta=2 wrapped by make_scorer so it can be passed
# as the `scoring` argument of the randomized search in stack_and_validate.
ftwo_scorer = make_scorer(fbeta_score, beta=2)
    
    
def fit_model(sampleweightyes,model, X_train,y_train):
    """Fit ``model`` on the training data, optionally with balanced sample weights.

    When ``sampleweightyes`` is truthy, per-sample weights are computed with
    ``compute_sample_weight(class_weight='balanced', ...)`` and passed to
    ``fit``; otherwise ``fit`` is called without weights. The fitted estimator
    (the same object that was passed in) is returned.
    """
    fit_options = {}
    if sampleweightyes:
        fit_options['sample_weight'] = compute_sample_weight(class_weight='balanced', y=y_train)
    model.fit(X_train, y_train, **fit_options)
    return model
    
def find_optimal_threshold(y_val,y_probas):
    """Return the probability threshold that maximises F2 on the validation data.

    Parameters
    ----------
    y_val
        True validation labels.
    y_probas
        Two-column probability array; column 1 is used as the positive-class score.

    Returns
    -------
    float
        The threshold from ``precision_recall_curve`` with the highest F2
        (first one on ties).
    """
    p, r, thresholds = precision_recall_curve(y_val, y_probas[:,1])

    # precision/recall hold one more entry than thresholds: the final
    # (precision=1, recall=0) point has no threshold, so it is left out.
    p, r = p[:-1], r[:-1]

    # F2 = 5*p*r / (4*p + r); where precision and recall are both 0 the score
    # is defined as 0 instead of producing nan from 0/0.
    denominator = 4*p + r
    f2 = np.divide(5*p*r, denominator, out=np.zeros_like(denominator, dtype=float), where=denominator > 0)

    return thresholds[int(np.argmax(f2))]
    
def perform_feature_weighting(feature_weighting,X_train,X_val,ngram_range,wordchar):
    """Vectorise the train/validation texts with the requested weighting scheme.

    Both inputs are first cast to ``str``. ``'TF-IDF'`` and ``'TF'`` go through
    ``perform_tf_idf`` (with and without idf), ``'BINARY'`` goes through
    ``perform_countvectorizer``. For any other value a message is printed and
    the string-cast inputs are returned without vectorisation.

    Returns
    -------
    tuple
        ``(train, validation)`` in the chosen representation.
    """
    train_text = X_train.astype(str)
    val_text = X_val.astype(str)

    if feature_weighting =='TF-IDF':
        train_out, val_out = perform_tf_idf(train_text, val_text, ngram_range, wordchar, True)
        return train_out, val_out

    if feature_weighting =='TF':
        train_out, val_out = perform_tf_idf(train_text, val_text, ngram_range, wordchar, False)
        return train_out, val_out

    if feature_weighting=='BINARY':
        train_out, val_out = perform_countvectorizer(train_text, val_text, ngram_range, wordchar)
        return train_out, val_out

    # unknown scheme: report it and hand back the texts as they are
    print('Feature weighting niet gelukt')
    return train_text, val_text

def do_prep_steps(X_train,y_train,smoteennyes,underyes,classweightyes):
    """Resample the training data and choose the class-weight setting.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    smoteennyes : bool
        Apply ``SMOTEENN`` resampling.
    underyes : bool
        Apply random undersampling to a 1:1 class ratio. When both flags are
        set, undersampling runs on the SMOTEENN output.
    classweightyes : bool
        Selects ``'balanced'`` class weights, otherwise ``None``.

    Returns
    -------
    tuple
        ``(X_train, y_train, class_weight)`` — the resampled data when a
        sampler was requested, otherwise the inputs unchanged.
    """
    # The resampled data must be bound to the returned names; binding it to
    # separate variables silently drops the resampling.
    if smoteennyes:
        X_train, y_train = SMOTEENN().fit_resample(X_train,y_train)

    if underyes:
        X_train, y_train = RandomUnderSampler(sampling_strategy=1).fit_resample(X_train,y_train)

    class_weight = 'balanced' if classweightyes else None
    return X_train,y_train,class_weight

def select_model(y_train,best_params,class_weight,algorithm):
    """Instantiate the estimator for ``algorithm`` with its tuned parameters.

    Parameters
    ----------
    y_train : pandas.Series
        Training labels; only read for ``'xgb'``, where the ratio of the
        counts of label 0 to label 1 becomes ``scale_pos_weight``.
    best_params : mapping
        Tuned hyper-parameters; the keys read depend on the algorithm.
    class_weight
        Class-weight setting, used by the SVM.
    algorithm : str
        One of ``'svm'``, ``'rf'``, ``'cnb'``, ``'xgb'``.

    Returns
    -------
    estimator
        An unfitted estimator.

    Raises
    ------
    ValueError
        If ``algorithm`` is not one of the supported names.
    """
    if algorithm=='svm':
        return SVC(C=best_params['C'],kernel=best_params['kernel'],gamma=best_params['gamma'],degree=best_params['degree'],probability=best_params['probability'],class_weight=class_weight)

    if algorithm=='rf':
        # NOTE(review): class_weight is fixed to 'balanced' here and the
        # class_weight argument is not used for the random forest — confirm
        # this is intended.
        return RandomForestClassifier(n_estimators=best_params['n_estimators'],max_depth=best_params['max_depth'],min_samples_split=best_params['min_samples_split'],min_samples_leaf=best_params['min_samples_leaf'],bootstrap=best_params['bootstrap'],class_weight='balanced')

    if algorithm=='cnb':
        return ComplementNB(alpha=best_params['alpha'],fit_prior=best_params['fit_prior'],norm=best_params['norm'])

    if algorithm=='xgb':
        label_counts=y_train.value_counts()
        ratio_w=label_counts[0]/label_counts[1]
        return XGBClassifier(min_child_weight=best_params['min_child_weight'],gamma=best_params['gamma'],subsample=best_params['subsample'],colsample_bytree=best_params['colsample_bytree'],max_depth=best_params['max_depth'], scale_pos_weight=ratio_w)

    raise ValueError("Unknown algorithm %r; expected one of 'svm', 'rf', 'cnb', 'xgb'" % (algorithm,))

def adjusted_classes(y_scores, t):
    """
    Turn scores into 0/1 labels: a score that is greater than or equal to the
    threshold ``t`` becomes 1, every other score becomes 0.
    """
    labels = []
    for score in y_scores:
        labels.append(1 if score >= t else 0)
    return labels

#  Vectorizers

In [10]:
def extract_ngram(text_representation):
    """Map a text-representation name to an ``ngram_range`` tuple.

    ``'UNIGRAM'`` -> ``(1, 1)``, ``'BIGRAM'`` -> ``(1, 2)``,
    ``'TRIGRAM'`` -> ``(1, 3)``.

    Raises
    ------
    ValueError
        If ``text_representation`` is none of the three supported names.
    """
    ngram_ranges = {
        'UNIGRAM': (1, 1),
        'BIGRAM': (1, 2),
        'TRIGRAM': (1, 3),
    }
    try:
        return ngram_ranges[text_representation]
    except KeyError:
        raise ValueError(
            "Unknown text representation %r; expected 'UNIGRAM', 'BIGRAM' or 'TRIGRAM'"
            % (text_representation,)
        ) from None

def perform_tf_idf(X_train,X_test,ngram_range,analyzer,use_idf):
    """Vectorise texts with a ``TfidfVectorizer`` fitted on the training texts.

    The vectorizer keeps at most 300 features, drops terms that occur in fewer
    than 10 training documents, uses sublinear tf scaling and l2 normalisation;
    ``use_idf`` switches the idf weighting on or off.

    Returns
    -------
    tuple
        Dense arrays ``(features_train, features_test)``.
    """
    vectorizer_options = dict(
        encoding='utf-8',
        ngram_range=ngram_range,
        analyzer=analyzer,
        stop_words=None,
        lowercase=False,
        max_df=1.,
        min_df=10,
        max_features=300,
        norm='l2',
        sublinear_tf=True,
        use_idf=use_idf,
    )
    vectorizer = TfidfVectorizer(**vectorizer_options)

    # vocabulary (and idf, when enabled) come from the training texts only
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)
    return train_matrix.toarray(), test_matrix.toarray()

def perform_countvectorizer(X_train,X_test,ngram_range,analyzer):
    """Vectorise texts with a ``CountVectorizer`` fitted on the training texts.

    At most 300 features are kept and terms that occur in fewer than 10
    training documents are dropped.

    Returns
    -------
    tuple
        Dense arrays ``(features_train, features_test)``.
    """
    # NOTE(review): this is the vectorizer used for the 'BINARY' weighting, but
    # the `binary` option is not set, so the output holds term counts rather
    # than 0/1 indicators — confirm whether binary=True was intended.
    vectorizer = CountVectorizer(
        encoding='utf-8',
        ngram_range=ngram_range,
        analyzer=analyzer,
        stop_words=None,
        lowercase=False,
        max_df=1.,
        min_df=10,
        max_features=300,
    )

    # vocabulary comes from the training texts only
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)
    return train_matrix.toarray(), test_matrix.toarray()

# Testing function

In [8]:
def do_testing(data,model,best_params,smoteennyes,underyes,classweightyes,sampleweightyes,text_representation,wordchar,feature_weighting):
    """Run one hold-out evaluation of an already tuned text classifier.

    The data is split 85/15 into train and test (stratified) and the train part
    again 85/15 into a fitting and a validation part. A first model, fitted on
    the fitting part, supplies the decision threshold chosen on the validation
    part; a second model, fitted on the whole train part, is scored on the test
    part with and without that threshold.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``'TOKENS'`` and ``'RESPONSIVE'``.
    model : str
        Algorithm name understood by ``select_model``.
    best_params : mapping
        Tuned hyper-parameters for that algorithm.
    smoteennyes, underyes, classweightyes, sampleweightyes : bool
        Switches for SMOTEENN, random undersampling, balanced class weights and
        balanced sample weights.
    text_representation : str
        Name understood by ``extract_ngram``.
    wordchar : str
        Analyzer name; lower-cased before it is passed to the vectorizer.
    feature_weighting : str
        ``'TF-IDF'``, ``'TF'``, ``'BINARY'`` or ``'INFO-GAIN'``.

    Returns
    -------
    tuple
        ``(f2 with threshold, f2 without threshold, roc_auc, precision, fpr,
        fnr, recall)``; the four rates are computed from the thresholded labels.
    """
    def _vectorize_resample_fit(X_fit, y_fit, X_eval):
        """Vectorise both text sets, resample the fitting set and fit the estimator.

        Returns the fitted estimator and the vectorised evaluation set.
        """
        X_fit, X_eval = X_fit.astype(str), X_eval.astype(str)

        ngram_range = extract_ngram(text_representation)
        analyzer = wordchar.lower()

        # vectorisation of the text data
        if feature_weighting =='TF-IDF':
            X_fit, X_eval = perform_tf_idf(X_fit, X_eval, ngram_range, analyzer, True)
        elif feature_weighting =='TF':
            X_fit, X_eval = perform_tf_idf(X_fit, X_eval, ngram_range, analyzer, False)
        elif feature_weighting=='BINARY':
            X_fit, X_eval = perform_countvectorizer(X_fit, X_eval, ngram_range, analyzer)
        elif feature_weighting=='INFO-GAIN':
            # perform_info_gain_df is expected to be defined elsewhere in the notebook
            X_fit, X_eval = perform_info_gain_df(X_fit, X_eval, y_fit, ngram_range, analyzer)

        # optional resampling of the fitting data only
        if smoteennyes:
            X_fit, y_fit = SMOTEENN().fit_resample(X_fit, y_fit)
        if underyes:
            X_fit, y_fit = RandomUnderSampler(sampling_strategy=1).fit_resample(X_fit, y_fit)

        class_weight = 'balanced' if classweightyes else None

        # estimator with the tuned parameters, fitted with or without sample weights
        estimator = select_model(y_fit, best_params, class_weight, model)
        estimator = fit_model(sampleweightyes, estimator, X_fit, y_fit)
        return estimator, X_eval

    # split train/test
    X_train, X_test, y_train, y_test = train_test_split(data['TOKENS'],
                                                    data['RESPONSIVE'],
                                                    test_size=0.15,stratify=data['RESPONSIVE'])

    # split train/val
    X_train_1, X_val, y_train_1, y_val=train_test_split(X_train,
                                                    y_train,
                                                    test_size=0.15,stratify=y_train)

    # phase 1: decision threshold from the validation part
    val_estimator, X_val_features = _vectorize_resample_fit(X_train_1, y_train_1, X_val)
    t = find_optimal_threshold(y_val, val_estimator.predict_proba(X_val_features))

    # phase 2: refit on the full train part and score on the test part
    test_estimator, X_test_features = _vectorize_resample_fit(X_train, y_train, X_test)
    y_pred = test_estimator.predict(X_test_features)
    y_probas = test_estimator.predict_proba(X_test_features)

    # F2 on the estimator's own labels and on the thresholded labels
    f2_score_without_thresholding = fbeta_score(y_test, y_pred, beta=2)
    new_labels = adjusted_classes(y_probas[:,1], t)
    f2_score = fbeta_score(y_test, new_labels, beta=2)

    # roc-auc plus rates derived from the thresholded confusion matrix
    roc_score = roc_auc_score(y_test, y_probas[:,1])
    tn, fp, fn, tp = confusion_matrix(y_test, new_labels).ravel()
    precision = tp/(tp+fp)
    fpr = fp/(fp+tn)
    fnr = fn/(fn+tp)
    recall = tp/(tp+fn)

    return f2_score, f2_score_without_thresholding,roc_score,precision,fpr,fnr,recall



def testing_statistics(data,model,best_params,iterations,smoteennyes,underyes,classweightyes,sampleweightyes,text_representation,wordchar,feature_weighting):  #oneven aantal!!!
    #this function validates the model x iterations and saves all scores
    
    f2_thresh_scores=[]
    f2_no_thresh_scores=[]
    roc_scores=[]
    precision_scores=[]
    fpr_scores=[]
    fnr_scores=[]
    recall_scores=[]
    
    for iter in range(0,iterations):
        f2_thresh,f2_no_thresh,roc_score,precision,fpr,fnr,recall=do_testing(data,model,best_params,smoteennyes,underyes,classweightyes,sampleweightyes,text_representation,wordchar,feature_weighting)
        f2_thresh_scores.append(f2_thresh)
        f2_no_thresh_scores.append(f2_no_thresh)
        roc_scores.append(roc_score)
        precision_scores.append(precision)
        fpr_scores.append(fpr)
        fnr_scores.append(fnr)
        recall_scores.append(recall)
    return f2_thresh_scores, f2_no_thresh_scores,roc_scores,precision_scores,fpr_scores,fnr_scores,recall_scores