## Generate the final table containing the mean and standard deviation of the results for each selected dataset

Run the following code to generate the final table containing, for a given dataset, the mean and standard deviation of each score for every sampling method.

Prerequisite: the file parameters.xlsx has been completed. If not, run the GenerateParams notebook first.

In [1]:
# import oversampling methods
from oversampling_methods.assembled_smote import assembled_smote
from oversampling_methods.cluster_smote import cluster_smote
from oversampling_methods.cure_smote import cure_smote
from oversampling_methods.smote import smote
from oversampling_methods.dbsmote import dbsmote
from oversampling_methods.de_oversampling import de_oversampling
from oversampling_methods.gsmote import gsmote
from oversampling_methods.kmeans_smote import kmeans_smote
from oversampling_methods.lee import lee
from oversampling_methods.polynom_fit_smote import polynom_fit_smote
from oversampling_methods.prowsyn import prowsyn
from oversampling_methods.smobd import smobd
from oversampling_methods.smote_ipf import smote_ipf
from oversampling_methods.wssmote import wssmote
## Add here the oversampling method you want to test

import itertools
import openpyxl
import numpy as np
import pandas as pd
import pickle

# import classifiers
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from smote_variants import MLPClassifierWrapper
from sklearn.ensemble import AdaBoostClassifier

from tools.scores import generate_score
import sys

In [2]:
# Root folder of the pipeline. Both workbooks below live directly in it, so
# this is the only line to edit when running on another machine.
BASE_DIR = "C:/Users/oucht/Documents/These/Pipeline_Variants"

# Open the excel file where parameters have been computed in the previous part
wb = openpyxl.load_workbook(BASE_DIR + "/parameters.xlsx")

# Given data file
name_files = ['presev_features2']

# Path where results will be saved
FilePath = BASE_DIR + '/resultats.xlsx'
# The existing results workbook is loaded before the writer is created and is
# then attached to the writer, so new sheets are added to the sheets it
# already contains.
ExcelWorkbook = openpyxl.load_workbook(FilePath)
writer = pd.ExcelWriter(FilePath, engine = 'openpyxl')
# NOTE(review): assigning to `writer.book` is presumably not supported by
# recent pandas releases (append mode is the documented alternative) --
# TODO confirm against the installed pandas version.
writer.book = ExcelWorkbook

In [3]:
# For every dataset: replay each (oversampler, classifier) pair stored in the
# parameter workbook on the five pickled cross-validation folds, then write a
# "mean +/- std" table (rows: oversamplers, columns: scores) to the results
# workbook, one sheet per dataset.
try:
    for name_f in name_files:
        # dico[score][oversampler] -> list with one score value per fold
        dico = dict()

        # Parameter sheets whose name contains the dataset name; the text before
        # the first '_' of a sheet name is used as the score name.
        match = [s for s in wb.sheetnames if name_f in s]
        scores_study = [m.split('_')[0] for m in match]

        # Parse each parameter sheet once: its content does not depend on the
        # fold. Every score is paired with its own sheet (previously all scores
        # read match[0], i.e. the parameters of the first sheet only).
        tables = []
        for sheet_name, s in zip(match, scores_study):
            data = pd.DataFrame(wb[sheet_name].values)
            # Drop the first sheet column (presumably row labels -- TODO confirm).
            del data[0]
            data.index = [
                'preprocessing',
                'params_preprocessing',
                'machine_learning',
                'machine_learning_params',
            ]
            tables.append((s, data))
            dico.setdefault(s, dict())

        for cv in range(5):
            # change the path to the folder with your data
            fold_path = "C:/Users/oucht/Documents/These/Pipeline_Variants/data/save_pickle/{0}_cv{1}.pkl".format(name_f, cv)
            # NOTE(review): pickle.load can execute arbitrary code; only load
            # fold files produced by this pipeline.
            with open(fold_path, "rb") as a_file:
                output = pickle.load(a_file)
            train, test = output['train'], output['test']
            train_labels, test_labels = output['train_labels'], output['test_labels']

            for s, data in tables:
                # One column per (oversampler, classifier) configuration.
                for ind in data.columns:
                    array = data[ind]

                    # NOTE(review): eval() runs whatever text is stored in the
                    # workbook cells; the workbook must come from a trusted source.
                    param = eval(array['params_preprocessing'])
                    # Class name = third dotted component of the quoted path
                    # stored in the 'preprocessing' cell.
                    name_preproce = array['preprocessing'].split('\'')[1].split('.')[2]
                    # The oversampler class is looked up among the names
                    # imported into this module.
                    sm = getattr(sys.modules[__name__], name_preproce)(**param)
                    train_over, train_labels_over = sm.fit_sample(train, train_labels)

                    params_class = eval(array['machine_learning_params'])
                    ml_name = array['machine_learning']
                    if 'CalibratedClassifierCV' in ml_name:
                        # Rebuilt with fixed settings; the stored params are not used here.
                        classifier = CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, penalty='l2', loss= 'squared_hinge', dual= False))
                    elif 'smote' in ml_name:
                        classifier = MLPClassifierWrapper(**params_class)
                    else:
                        # Any other classifier is resolved by the name preceding '('.
                        classifier = getattr(sys.modules[__name__], ml_name.split('(')[0])(**params_class)

                    classifier.fit(train_over, train_labels_over)
                    predictions = classifier.predict(test)
                    predictions_proba = classifier.predict_proba(test)
                    dico[s].setdefault(name_preproce, []).append(
                        generate_score(s, test_labels, predictions, predictions_proba)
                    )

        # Collapse the per-fold values into a "mean +/- std" string (in percent).
        for per_method in dico.values():
            for m, values in per_method.items():
                per_method[m] = str('{0} +/- {1}').format(np.round(np.mean(values)*100, 3), np.round(np.std(values)*100, 3))

        res = pd.DataFrame(dico)
        res.to_excel(writer, sheet_name = "{}".format(name_f))
finally:
    # close() saves the workbook; doing it in `finally` writes the sheets of
    # the datasets already processed and releases the file even if a later
    # dataset fails. The separate (deprecated) writer.save() call is not needed.
    writer.close()







