In [1]:
import pandas as pd
import numpy as np
import math
import xgboost as xgb
import json

import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join

from scipy import spatial
from pymfe.mfe import MFE

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
json_result = {}
json_result['datasets'] = []
json_mfe_result = {}
json_mfe_result['datasets'] = []
onlyfiles = [f for f in listdir('RealDatasets') if isfile(join('RealDatasets', f))]

In [3]:
def checkIfExistsInJson(data_name):
    file = open('result.json')
    d = json.load(file)
    if('datasets' in d):
            for p in d['datasets']:
                if(p['name'] == data_name):
                    print("Já tem!")
                    return True
    return False    

def build_json_object(dataset_name, default_params_result, best_params_result, params):
    json_result['datasets'].append({
        'name': dataset_name,
        'default_params_result': default_params_result,
        'best_params_result': best_params_result,
        'params': {
            'max_depth': params['max_depth'],
            'min_child_weight': params['min_child_weight'],
            'gamma': params['gamma'],
            'subsample': params['subsample'],
            'colsample_bytree': params['colsample_bytree']
        }
    })
    
def build_json_mfe_object(dataset_name, mfe):
    json_mfe_result['datasets'].append({
        'name': dataset_name,
        'mfe': mfe
    })

In [4]:
# mfe_groups = ['general', 'statistical', 'info-theory']
mfe_groups = ['info-theory']

def mfe_extract(dataframe):
    y = dataframe['target'].tolist()
    X = dataframe.drop('target', axis=1).values
    
    mfe = MFE(mfe_groups)
    mfe.fit(X, y)
    ft = mfe.extract()
    
    result = {}
    for i in range(0, len(ft[0])):
        if(str(ft[1][i]) != 'nan'):
            result[ft[0][i]] = np.float64(ft[1][i])
    
    return result

In [5]:
def xgboost(dataframe):
    estimator = xgb.XGBClassifier() 
    
    X = dataframe[dataframe.columns[:-1]]
    y = dataframe['target']
    
    k_folds = 5 
    for i in df['target'].value_counts():
        if(i < k_folds):
            k_folds = i
    
    if(k_folds == 1):
        return(0.0, 0.0, 0.0)
#     Parâmetros reduzidos
    parameters = {
        'max_depth': range(3, 8, 1), # Maximum depth = more overfit
        'min_child_weight': range(1, 3, 1), 
        'gamma': [0, 0.2, 0,5, 1],
        'subsample': [0.2, 0.5, 1.0],
        'colsample_bytree': [0.2, 0.5, 1.0]
    }
    
    default_parameters = {
        'max_depth': [6], 
        'min_child_weight': [1], 
        'gamma': [0],
        'subsample': [1],
        'colsample_bytree': [1]
    }
    
    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=default_parameters,
        cv=k_folds
    )
    grid_search.fit(X, y)
    
    default_value = grid_search.best_score_*100
    
    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=parameters,
        cv=k_folds
    )
    grid_search.fit(X, y)
    
    value = grid_search.best_score_*100
    
    # Default params accuracy / best params accuracy / params
    return (default_value, value, grid_search.best_params_)

In [6]:
def euclidean(v1, v2):
    return sum((p-q)**2 for p, q in zip(v1, v2)) ** .5

def calculates_metafeature_similarity(listA, listB):
    arrayAuxA = []
    arrayAuxB = []
        
    for i in listA['mfe']:
        for j in listB['mfe']:
            if(i == j):
                arrayAuxA.append(float(listA['mfe'][i]))
                arrayAuxB.append(float(listB['mfe'][j]))
                
    return euclidean(arrayAuxA, arrayAuxB)

In [7]:
# Descomentar para rodar o código Xgboost + Mfe + Colocar no json

# Dataset Dataset:  46  -  dataset-autoHorse_fixed_42223.csv com problema
# Dataset 48 tambem
for i in range(0, len(onlyfiles)):
    print('Dataset: ', i, ' - ', onlyfiles[i])
    path = ('RealDatasets/' + onlyfiles[i])
    df = pd.read_csv(path)
    df.columns = [*df.columns[:-1], 'target']
    
    result = xgboost(df)
    mfe = mfe_extract(df)

    if(checkIfExistsInJson(onlyfiles[i]) == False and result[0] != 0):
        build_json_object(onlyfiles[i], result[0], result[1], result[2])
        build_json_mfe_object(onlyfiles[i], mfe)
        
    build_json_mfe_object(onlyfiles[i], mfe)

Dataset:  0  -  acute-inflammations_1556.csv




Dataset:  1  -  allbp_40707.csv




Dataset:  2  -  analcatdata_authorship_458.csv




Dataset:  3  -  analcatdata_challenger_1013.csv




Dataset:  4  -  analcatdata_chlamydia_875.csv




Dataset:  5  -  analcatdata_creditscore_461.csv




Dataset:  6  -  analcatdata_germangss_1025.csv




Dataset:  7  -  analcatdata_happiness_40709.csv
Dataset:  8  -  analcatdata_lawsuit_450.csv




Dataset:  9  -  analcatdata_vineyard_724.csv
Dataset:  10  -  analcatdata_wildcat_748.csv




Dataset:  11  -  ar1_1059.csv




Dataset:  12  -  ar4_1061.csv




Dataset:  13  -  ar6_1064.csv




Dataset:  14  -  autoPrice_756.csv




Dataset:  15  -  autoUniv-au6-400_1551.csv




Dataset:  16  -  autoUniv-au7-500_1554.csv




Dataset:  17  -  auto_price_745.csv




Dataset:  18  -  backache_463.csv




Dataset:  19  -  badges2_1121.csv




Dataset:  20  -  blogger_1463.csv




Dataset:  21  -  blood-transfusion-service-center_1464.csv




Dataset:  22  -  bodyfat_778.csv
Dataset:  23  -  breast-cancer-dropped-missing-attributes-values_23499.csv




Dataset:  24  -  breast-tissue_1465.csv
Dataset:  25  -  breast-tissue_1559.csv
Dataset:  26  -  calendarDOW_40663.csv




Dataset:  27  -  car-evaluation_40664.csv




Dataset:  28  -  cardiotocography_1466.csv




Dataset:  29  -  cars1_40700.csv




Dataset:  30  -  CastMetal1_1447.csv




Dataset:  31  -  chatfield_4_820.csv
Dataset:  32  -  chscase_census2_909.csv
Dataset:  33  -  chscase_census3_908.csv
Dataset:  34  -  chscase_census4_907.csv
Dataset:  35  -  chscase_census5_906.csv
Dataset:  36  -  chscase_census6_900.csv
Dataset:  37  -  chscase_geyser1_895.csv
Dataset:  38  -  chscase_vine2_814.csv
Dataset:  39  -  clean1_40665.csv




Dataset:  40  -  cleveland-nominal_40711.csv




Dataset:  41  -  cleve_40710.csv




Dataset:  42  -  cloud_860.csv




Dataset:  43  -  collins_987.csv
Dataset:  44  -  corral_40669.csv




Dataset:  45  -  CostaMadre1_1446.csv




Dataset:  46  -  dataset-autoHorse_fixed_42223.csv




Dataset:  47  -  datatrieve_1075.csv




Dataset:  48  -  desc_datasets.csv
Dataset:  49  -  diggle_table_a2_818.csv
Dataset:  50  -  ecoli_1011.csv




Dataset:  51  -  ecoli_39.csv




Dataset:  52  -  ecoli_40671.csv




Dataset:  53  -  Engine1_4340.csv
Dataset:  54  -  fertility_1473.csv




Dataset:  55  -  fri_c0_100_10_808.csv
Dataset:  56  -  fri_c0_100_25_889.csv
Dataset:  57  -  fri_c0_100_50_850.csv
Dataset:  58  -  fri_c0_100_5_754.csv
Dataset:  59  -  fri_c0_250_10_763.csv
Dataset:  60  -  fri_c0_250_25_773.csv
Dataset:  61  -  fri_c0_250_50_732.csv
Dataset:  62  -  fri_c0_250_5_776.csv
Dataset:  63  -  fri_c0_500_10_943.csv
Dataset:  64  -  fri_c0_500_25_926.csv
Dataset:  65  -  fri_c0_500_50_888.csv
Dataset:  66  -  fri_c0_500_5_884.csv
Dataset:  67  -  fri_c1_100_10_789.csv
Dataset:  68  -  fri_c1_100_25_812.csv
Dataset:  69  -  fri_c1_100_50_876.csv
Dataset:  70  -  fri_c1_100_5_829.csv
Dataset:  71  -  fri_c1_250_10_935.csv
Dataset:  72  -  fri_c1_250_25_746.csv
Dataset:  73  -  fri_c1_250_50_769.csv
Dataset:  74  -  fri_c1_250_5_730.csv
Dataset:  75  -  fri_c1_500_10_824.csv
Dataset:  76  -  fri_c1_500_25_779.csv
Dataset:  77  -  fri_c1_500_50_766.csv
Dataset:  78  -  fri_c1_500_5_870.csv
Dataset:  79  -  fri_c2_100_10_762.csv
Dataset:  80  -  fri_c2_100_25_



Dataset:  116  -  glass_1005.csv




Dataset:  117  -  glass_41.csv




Dataset:  118  -  haberman_43.csv




Dataset:  119  -  hayes-roth_329.csv




Dataset:  120  -  hayes-roth_974.csv




Dataset:  121  -  heart-h_1565.csv




Dataset:  122  -  heart-long-beach_1512.csv




Dataset:  123  -  heart-statlog_53.csv




Dataset:  124  -  heart-switzerland_1513.csv




Dataset:  125  -  ionosphere_59.csv




Dataset:  126  -  iris_61.csv
Dataset:  127  -  jEdit_4.0_4.2_1073.csv




Dataset:  128  -  jEdit_4.2_4.3_1048.csv




Dataset:  129  -  kc1-binary_1066.csv




Dataset:  130  -  kc1-top5_1045.csv




Dataset:  131  -  kc3_1065.csv




Dataset:  132  -  KnuggetChase3_1448.csv




Dataset:  133  -  KungChi3_1441.csv




Dataset:  134  -  leaf_1482.csv




Dataset:  135  -  LED-display-domain-7digit_40496.csv




Dataset:  136  -  lowbwt_941.csv




Dataset:  137  -  machine_cpu_733.csv




Dataset:  138  -  mc2_1054.csv




Dataset:  139  -  MeanWhile1_1449.csv




Dataset:  140  -  mfeat-fourier_14.csv
Dataset:  141  -  mfeat-karhunen_16.csv
Dataset:  142  -  mfeat-pixel_40979.csv




Dataset:  143  -  MindCave2_1450.csv




Dataset:  144  -  monks-problems-2_334.csv




Dataset:  145  -  mu284_880.csv




Dataset:  146  -  mux6_40681.csv




Dataset:  147  -  mw1_1071.csv




Dataset:  148  -  no2_886.csv
Dataset:  149  -  parkinsons_1488.csv
Dataset:  150  -  pc1_req_1167.csv




Dataset:  151  -  planning-relax_1490.csv
Dataset:  152  -  plasma_retinol_915.csv




Dataset:  153  -  pm10_750.csv
Dataset:  154  -  prnn_fglass_952.csv




Dataset:  155  -  prnn_fglass_996.csv




Dataset:  156  -  prnn_synth_464.csv
Dataset:  157  -  pwLinear_721.csv




Dataset:  158  -  qsar-biodeg_1494.csv




Dataset:  159  -  qualitative-bankruptcy_1495.csv




Dataset:  160  -  rabe_266_782.csv
Dataset:  161  -  rmftsa_sleepdata_679.csv




Dataset:  162  -  robot-failures-lp4_1519.csv
Dataset:  163  -  robot-failures-lp5_1520.csv
Dataset:  164  -  sa-heart_1498.csv




Dataset:  165  -  seeds_1499.csv
Dataset:  166  -  seismic-bumps_1500.csv
Dataset:  167  -  servo_747.csv




Dataset:  168  -  sleuth_case2002_902.csv




Dataset:  169  -  Smartphone-Based_Recognition_of_Human_Activities_4153.csv
Dataset:  170  -  solar-flare_40686.csv




Dataset:  171  -  sonar_40.csv
Dataset:  172  -  SPECTF_1600.csv
Dataset:  173  -  SPECTF_337.csv
Dataset:  174  -  SPECT_336.csv




Dataset:  175  -  steel-plates-fault_1504.csv




Dataset:  176  -  synthetic_control_377.csv
Dataset:  177  -  tae_48.csv




Dataset:  178  -  tae_955.csv




Dataset:  179  -  teachingAssistant_1115.csv




Dataset:  180  -  tecator_851.csv
Dataset:  181  -  thoracic-surgery_1506.csv




Dataset:  182  -  thyroid-allhypo_40476.csv




Dataset:  183  -  thyroid-new_40682.csv
Dataset:  184  -  transplant_885.csv
Dataset:  185  -  triazines_788.csv




Dataset:  186  -  TuningSVMs_41976.csv




Dataset:  187  -  TuningSVMs_41977.csv




Dataset:  188  -  user-knowledge_1508.csv
Dataset:  189  -  vertebra-column_1524.csv
Dataset:  190  -  veteran_719.csv




Dataset:  191  -  vinnie_860.csv




Dataset:  192  -  visualizing_environmental_736.csv
Dataset:  193  -  visualizing_galaxy_925.csv
Dataset:  194  -  wholesale-customers_1511.csv




Dataset:  195  -  wine-quality-red_40691.csv
Dataset:  196  -  wine_187.csv
Dataset:  197  -  wine_973.csv
Dataset:  198  -  wisconsin_753.csv




Dataset:  199  -  yeast_181.csv




Dataset:  200  -  zoo_965.csv




In [8]:
with open('result.json', 'w') as outfile:
    json.dump(json_result, outfile)
with open('mfe_result.json', 'w') as outfile:
    json.dump(json_mfe_result, outfile)

In [9]:
# Segunda parte - Testando os hiperparâmetros entre os datasets
import copy
file = open('result.json')
file_mfe = open('mfe_result.json')
data = json.load(file)
data_mfe = json.load(file_mfe)
estimator = xgb.XGBClassifier()
optimized_datasets = 0

In [10]:
def testHyperparams(dataframe, data):
    params = data['params']
    
#     print('Params: ', params)
    
    X = dataframe[dataframe.columns[:-1]]
    y = dataframe['target']
    
    for i in params:
        if(type(params[i]) != type([])):
            params[i] = [(params[i])]
    
    grid_search = GridSearchCV(
        estimator = estimator,
        param_grid = params
    )
    grid_search.fit(X, y)
    answer = grid_search.best_score_*100
    
    return answer

def getDatasetInfo(name):
    d = ''
    for p in data['datasets']:
        if(p['name'] == name):
            d = p
    
    return d

def getDatasetMfeInfo(name):
    d = ''
    for p in data_mfe['datasets']:
        if(p['name'] == name):
            d = p
    
    return d

def getHyperparamsMean(hyperparams):
#     print('Hyper len: ', len(hyperparams))
#     print('K: ', len(hyperparams))
#     print('Original: ', hyperparams)
#     params_mean = hyperparams[0]['params']
    params_mean = copy.deepcopy(hyperparams[0]['params'])
# #     print(params_mean)
    for i in range(1, len(hyperparams)):
        params = hyperparams[i]['params']
        for j in params:
            params_mean[j][0] += params[j][0]
    for i in params_mean:
        params_mean[i][0] /= len(hyperparams)
        if(i == 'max_depth'):
            params_mean[i][0] = int(round(params_mean[i][0]))
    
#     print('media:', params_mean)
# #     for i in hyperparams:
# #         print(i['params'])
# #         for j in i['params']:
# #             print(j)
    return {"params": params_mean}

def getMode(a):
    return max(set(a), key = a.count)

def getHyperparamsMode(hyperparams):
    params_mode = {
        "max_depth": [],
        "min_child_weight": [],
        "gamma": [],
        "subsample": [],
        "colsample_bytree": []
    }
    
    for i in hyperparams:
        for j in i['params']:
            params_mode[j].append(i['params'][j][0])
            
#     for i in teste:
#         teste[i] = getMode([teste[i]])
    
    for i in params_mode:
        params_mode[i] = [getMode(params_mode[i])]
    
    return {"params": params_mode}
        

def testMeanAndModeAllHyperparams(df, hyperparams):
#     print(len(hyperparams))
    result = []
    hyperparams_mean = []
    hyperparams_mode = []
    
    for i in range(0, len(hyperparams)):
        atual = testHyperparams(df, getHyperparamsMean(hyperparams[0:i+1].copy()))
        hyperparams_mean.append(atual)
        atual = testHyperparams(df, getHyperparamsMode(hyperparams[0:i+1].copy()))
        hyperparams_mode.append(atual)
        
#     print('Resultados para a média: ')
#     for i in range(0, len(hyperparams_mean)):
#         print(i, ' - ', hyperparams_mean[i])
    
#     print('Resultados para a moda: ')
#     for i in range(0, len(hyperparams_mode)):
#         print(i, ' - ', hyperparams_mode[i])
    
    return(hyperparams_mean, hyperparams_mode)
        
def testMeanAndModeGoodHyperparams(df, good_hyperparams):
    result = []
    hyperparams_mean = []
    hyperparams_mode = []
    
    for i in range(0, len(good_hyperparams)):
        atual = testHyperparams(df, getHyperparamsMean(good_hyperparams[0:i+1].copy()))
        hyperparams_mean.append(atual)
        atual = testHyperparams(df, getHyperparamsMode(good_hyperparams[0:i+1].copy()))
        hyperparams_mode.append(atual)
        
    return(hyperparams_mean, hyperparams_mode)
#     for i in good_hyperparams:
#         print(i['params']['max_depth'])

def getBestNHyperparams(name, n):
    answer = []
    ordem = []
    d1 = getDatasetInfo(name)
    d1_mfe = getDatasetMfeInfo(name)
    
    results = []
    good_datasets = []
    
    for i in range(0, len(onlyfiles)):
        d2 = getDatasetInfo(onlyfiles[i])
        d2_mfe = getDatasetMfeInfo(onlyfiles[i])
        
        if(onlyfiles[i] == name):
            continue
            
        if(d2):
            ordem.append((calculates_metafeature_similarity(d1_mfe, d2_mfe), onlyfiles[i]))
    
    ordem.sort()
    
    qt = 0
    
    hyperparams = []
    good_hyperparams = []
    
    path = ('RealDatasets/' + name)
    df = pd.read_csv(path)
    df.columns = [*df.columns[:-1], 'target']
    
    j = 0
    for i in ordem[:n]:
        d2 = getDatasetInfo(i[1])
        atual = testHyperparams(df, d2)
        hyperparams.append(d2)
#         print(i[0], ' - ', i[1])
        
        maior = False
        if(float(atual) > float(d1['default_params_result'])):
            maior = True
            qt += 1
        
        if(maior == True):
            good_hyperparams.append(d2)
            
#         print(atual, maior)
        d = {"name": i[1],
            "distance": i[0],
            "result_with_hiperparameter": atual,
            "greater_than_default": maior
            }
        
#         d.update(sss)
        
        results.append(d)
        j += 1
        
    
#     print('\nQuantidade de maiores que o default: ', qt)

#     print(len(hyperparams))
#     print(len(good_hyperparams))
    
    result_all_hyperparams = testMeanAndModeAllHyperparams(df, hyperparams)
    result_good_hyperparams = testMeanAndModeGoodHyperparams(df, good_hyperparams)
    
#     for i in result_good_hyperparams:
#         print(i)

    results_according_k = []
        
    for i in range(0, len(result_all_hyperparams[0])):
        results_according_k.append({
            "k": i,
            "mean": result_all_hyperparams[0][i],
            "mode": result_all_hyperparams[1][i],
            "mean_good_datasets": result_good_hyperparams[0][i] if i < len(result_good_hyperparams[0]) else -1,
            "mode_good_datasets": result_good_hyperparams[1][i] if i < len(result_good_hyperparams[1]) else -1,
        })

    if(qt > 0):
        global optimized_datasets
        optimized_datasets += 1
        
        
    best_mean_result = -1
    best_mean_k_number = -1
    best_mode_result = -1
    best_mode_k_number = -1
    best_mean_good_datasets = -1
    best_mean_good_datasets_k_number = -1
    best_mode_good_datasets = -1
    best_mode_good_datasets_k_number = -1
    best_result_using_hyperparameter_directly = -1
    
    for i in results:
        best_result_using_hyperparameter_directly = max(best_result_using_hyperparameter_directly, i['result_with_hiperparameter'])
    
    for i in results_according_k:
        if(i['mean'] > best_mean_result):
            best_mean_result = i['mean']
            best_mean_k_number = i['k']
            
        if(i['mode'] > best_mode_result):
            best_mode_result = i['mode']
            best_mode_k_number = i['k']
            
        if(i['mean_good_datasets'] > best_mean_good_datasets):
            best_mean_good_datasets = i['mean_good_datasets']
            best_mean_good_datasets_k_number = i['k']
            
        if(i['mean_good_datasets'] > best_mode_good_datasets):
            best_mode_good_datasets = i['mode_good_datasets']
            best_mode_good_datasets_k_number = i['k']
        
    
        
    return {
        "results_each_hyperparameter": results,
        "results_according_k": results_according_k,
        "best_mean": best_mean_result,
        "best_mean_k_number": best_mean_k_number,
        "best_mode": best_mode_result,
        "best_mode_k_number": best_mode_k_number,
        "best_mean_good_datasets": best_mean_good_datasets,
        "best_mean_good_datasets_k_number": best_mean_good_datasets_k_number,
        "best_mode_good_datasets": best_mode_good_datasets,
        "best_mode_good_datasets_k_number": best_mode_good_datasets_k_number,
        "best_result_using_hyperparameter_directly": best_result_using_hyperparameter_directly
        
    }
#     print(results)

#     result = {
#         "results_each_hyperparameter": results,
#         "results_according_k"
#     }
    
    
#     return results
final_result = []

In [11]:

# 46-47-48 com problemas
for i in range(0, len(onlyfiles)):
    d1 = getDatasetInfo(onlyfiles[i])
    d1_mfe = getDatasetMfeInfo(onlyfiles[i])
    
    if(d1 == ''):
        continue
        
    print(i, ' - ', d1['name'])
                        
    if(d1):
        d = {
            "dataset": onlyfiles[i],
            "default_result": d1['default_params_result'],
#             'results_each_hyperparameter': getBestNHyperparams(onlyfiles[i], 10)
        }
        
#         print('Dataset: ', onlyfiles[i])
#         print('Resultado default: ', d1['default_params_result'])
#         print('\n')
        # O ultimo parametro é o N (N datasets mais semelhantes de acordo com as mfes)
        answer = getBestNHyperparams(onlyfiles[i], 10)
        d.update(answer)
        final_result.append(d)
#         print('---------------------')
        

# print(final_result)

# print('Quantidade de datasets otimizados pelo metalearning: ', optimized_datasets)
with open('final_result.json', 'w') as outfile:
    json.dump(final_result, outfile)

0  -  acute-inflammations_1556.csv
1  -  allbp_40707.csv
2  -  analcatdata_authorship_458.csv
3  -  analcatdata_challenger_1013.csv
4  -  analcatdata_chlamydia_875.csv
5  -  analcatdata_creditscore_461.csv
6  -  analcatdata_germangss_1025.csv
7  -  analcatdata_happiness_40709.csv
8  -  analcatdata_lawsuit_450.csv
9  -  analcatdata_vineyard_724.csv
10  -  analcatdata_wildcat_748.csv
11  -  ar1_1059.csv
12  -  ar4_1061.csv
13  -  ar6_1064.csv
14  -  autoPrice_756.csv
15  -  autoUniv-au6-400_1551.csv
16  -  autoUniv-au7-500_1554.csv
17  -  auto_price_745.csv
18  -  backache_463.csv
19  -  badges2_1121.csv
20  -  blogger_1463.csv
21  -  blood-transfusion-service-center_1464.csv
22  -  bodyfat_778.csv
23  -  breast-cancer-dropped-missing-attributes-values_23499.csv
24  -  breast-tissue_1465.csv
25  -  breast-tissue_1559.csv
26  -  calendarDOW_40663.csv
27  -  car-evaluation_40664.csv
28  -  cardiotocography_1466.csv
29  -  cars1_40700.csv
30  -  CastMetal1_1447.csv
31  -  chatfield_4_820.cs



52  -  ecoli_40671.csv
54  -  fertility_1473.csv
55  -  fri_c0_100_10_808.csv
56  -  fri_c0_100_25_889.csv
57  -  fri_c0_100_50_850.csv
58  -  fri_c0_100_5_754.csv
59  -  fri_c0_250_10_763.csv
60  -  fri_c0_250_25_773.csv
61  -  fri_c0_250_50_732.csv
62  -  fri_c0_250_5_776.csv
63  -  fri_c0_500_10_943.csv
64  -  fri_c0_500_25_926.csv
65  -  fri_c0_500_50_888.csv
66  -  fri_c0_500_5_884.csv
67  -  fri_c1_100_10_789.csv
68  -  fri_c1_100_25_812.csv
69  -  fri_c1_100_50_876.csv
70  -  fri_c1_100_5_829.csv
71  -  fri_c1_250_10_935.csv
72  -  fri_c1_250_25_746.csv
73  -  fri_c1_250_50_769.csv
74  -  fri_c1_250_5_730.csv
75  -  fri_c1_500_10_824.csv
76  -  fri_c1_500_25_779.csv
77  -  fri_c1_500_50_766.csv
78  -  fri_c1_500_5_870.csv
79  -  fri_c2_100_10_762.csv
80  -  fri_c2_100_25_775.csv
81  -  fri_c2_100_50_922.csv
82  -  fri_c2_100_5_726.csv
83  -  fri_c2_250_10_830.csv
84  -  fri_c2_250_25_794.csv
85  -  fri_c2_250_50_877.csv
86  -  fri_c2_250_5_911.csv
87  -  fri_c2_500_10_869.csv
88

In [12]:
# ULTIMA PARTE - Analisando os resultados
file = open('final_result.json')
d = json.load(file)

In [13]:
# Quantos datasets tiveram os resultados melhorados
# Em média, quantos Ks eram necessários para melhorar os resultados

print('Number of datasets: ', len(d))

improved_datasets_mean = 0
improved_datasets_mode = 0
improved_datasets_directly = 0

improved_datasets_mean_good = 0
improved_datasets_mode_good = 0

mean_count_according_k = []
mode_count_according_k = []
mean_good_count_according_k = []
mode_good_count_according_k = []

for i in range(0, len(d)):
    if(d[i]['best_mean'] > d[i]['default_result']):
        improved_datasets_mean += 1
    if(d[i]['best_mode'] > d[i]['default_result']):
        improved_datasets_mode += 1
    if(d[i]['best_result_using_hyperparameter_directly'] > d[i]['default_result']):
        improved_datasets_directly += 1
        
    if(d[i]['best_mean_good_datasets'] > d[i]['default_result']):
        improved_datasets_mean_good += 1
    if(d[i]['best_mode_good_datasets'] > d[i]['default_result']):
        improved_datasets_mode_good += 1
    
    for j in d[i]['results_according_k']:
        if(j['mean'] > d[i]['default_result']):
            mean_count_according_k.append(j['k'])
        if(j['mode'] > d[i]['default_result']):
            mode_count_according_k.append(j['k'])
        if(j['mean_good_datasets'] > d[i]['default_result']):
            mean_good_count_according_k.append(j['k'])
        if(j['mode_good_datasets'] > d[i]['default_result']):
            mode_good_count_according_k.append(j['k'])
            
        
        

print('Improved Datasets with Mean: ', improved_datasets_mean)
print('Improved Datasets with Mode: ', improved_datasets_mode)
print('Improved Datasets with mean using good datasets: ', improved_datasets_mean_good)
print('Improved Datasets with mode using good datasets: ', improved_datasets_mode_good)
print('Improved Datasets Directly: ', improved_datasets_directly)

Number of datasets:  198
Improved Datasets with Mean:  151
Improved Datasets with Mode:  141
Improved Datasets with mean using good datasets:  160
Improved Datasets with mode using good datasets:  153
Improved Datasets Directly:  160


In [14]:
# Deadline, calcular a média dos hiperparametros dentre os 200 e checar quantos foram melhorados com isso
file = open('result.json')
d = json.load(file)

hyperparam_deadline = {'max_depth': 0, 'min_child_weight': 0, 'gamma': 0, 'subsample': 0, 'colsample_bytree': 0}


print(d['datasets'][0]['params'])
for i in d['datasets']:
#     print(i['params'])
    hyperparam_deadline['max_depth'] += i['params']['max_depth']
    hyperparam_deadline['min_child_weight'] += i['params']['min_child_weight']
    hyperparam_deadline['gamma'] += i['params']['gamma']
    hyperparam_deadline['subsample'] += i['params']['subsample']
    hyperparam_deadline['colsample_bytree'] += i['params']['colsample_bytree']

hyperparam_deadline['max_depth'] = int(round(hyperparam_deadline['max_depth']/len(d['datasets'])))
hyperparam_deadline['min_child_weight'] /= len(d['datasets'])
hyperparam_deadline['gamma'] /= len(d['datasets'])
hyperparam_deadline['subsample'] /= len(d['datasets'])
hyperparam_deadline['colsample_bytree'] /= len(d['datasets'])

hyperparam_deadline['max_depth'] = [hyperparam_deadline['max_depth']]
hyperparam_deadline['min_child_weight'] = [hyperparam_deadline['min_child_weight']]
hyperparam_deadline['gamma'] = [hyperparam_deadline['gamma']]
hyperparam_deadline['subsample'] = [hyperparam_deadline['subsample']]
hyperparam_deadline['colsample_bytree'] = [hyperparam_deadline['colsample_bytree']]

optimized_datasets = 0

    
for i in range(0, len(onlyfiles)):
    if(i == 48):
        continue
    print(i, ' - ', onlyfiles[i])
    path = ('RealDatasets/' + onlyfiles[i])
    df = pd.read_csv(path)
    df.columns = [*df.columns[:-1], 'target']
    
    X = df[df.columns[:-1]]
    y = df['target']
    
    k_folds = 5 
    for j in df['target'].value_counts():
        if(j < k_folds):
            k_folds = j
            break
    
    answer = 0
    if(k_folds != 1):
        grid_search = GridSearchCV(
            estimator = estimator,
            param_grid = hyperparam_deadline,
            cv = k_folds
        )
        grid_search.fit(X, y)
        answer = grid_search.best_score_*100
    
        answer_default = xgboost(df)
    else:
        continue
    
    if(answer_default[0] > answer):
        optimized_datasets+=1
    print(answer_default[0], ' - ', answer)
    
print('Optimized Datasets: ', optimized_datasets)


{'max_depth': 3, 'min_child_weight': 1, 'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.2}
0  -  acute-inflammations_1556.csv
95.83333333333334  -  100.0
1  -  allbp_40707.csv
97.61399687318846  -  97.45488081226833
2  -  analcatdata_authorship_458.csv
96.79487179487178  -  97.38517892364045
3  -  analcatdata_challenger_1013.csv
86.34920634920634  -  93.49206349206348
4  -  analcatdata_chlamydia_875.csv
82.0  -  84.00000000000001
5  -  analcatdata_creditscore_461.csv
99.0  -  99.0
6  -  analcatdata_germangss_1025.csv
93.5  -  92.0
7  -  analcatdata_happiness_40709.csv
48.33333333333333  -  63.33333333333333
8  -  analcatdata_lawsuit_450.csv
95.4644412191582  -  94.70246734397678
9  -  analcatdata_vineyard_724.csv
53.427133379089454  -  62.182566918325335
10  -  analcatdata_wildcat_748.csv
75.41666666666667  -  75.98484848484848
11  -  ar1_1059.csv
93.36666666666666  -  91.73333333333333
12  -  ar4_1061.csv
84.11255411255411  -  83.20346320346322
13  -  ar6_1064.csv
85.1904761904761



0.0  -  0.0
47  -  datatrieve_1075.csv
86.92307692307693  -  90.76923076923077
49  -  diggle_table_a2_818.csv
100.0  -  100.0
50  -  ecoli_1011.csv
94.04302019315189  -  95.23266022827042
51  -  ecoli_39.csv
86.60714285714286  -  86.30952380952381
52  -  ecoli_40671.csv
84.70396270396272  -  85.93473193473194
53  -  Engine1_4340.csv
54  -  fertility_1473.csv
86.0  -  86.0
55  -  fri_c0_100_10_808.csv
85.0  -  82.0
56  -  fri_c0_100_25_889.csv
70.0  -  65.0
57  -  fri_c0_100_50_850.csv
70.0  -  65.00000000000001
58  -  fri_c0_100_5_754.csv
87.00000000000001  -  84.00000000000001
59  -  fri_c0_250_10_763.csv
82.4  -  84.80000000000001
60  -  fri_c0_250_25_773.csv
85.2  -  83.6
61  -  fri_c0_250_50_732.csv
80.0  -  79.60000000000001
62  -  fri_c0_250_5_776.csv
83.60000000000001  -  83.6
63  -  fri_c0_500_10_943.csv
86.8  -  87.20000000000002
64  -  fri_c0_500_25_926.csv
83.0  -  84.59999999999998
65  -  fri_c0_500_50_888.csv
84.00000000000001  -  86.0
66  -  fri_c0_500_5_884.csv
88.6  -  

88.84259259259257  -  88.0864197530864
189  -  vertebra-column_1524.csv
80.64516129032258  -  80.64516129032258
190  -  veteran_719.csv
61.32275132275132  -  64.92063492063492
191  -  vinnie_860.csv
79.21052631578948  -  80.52631578947368
192  -  visualizing_environmental_736.csv
58.61660079051383  -  63.16205533596838
193  -  visualizing_galaxy_925.csv
95.98076923076924  -  96.27884615384616
194  -  wholesale-customers_1511.csv
100.0  -  100.0
195  -  wine-quality-red_40691.csv
54.723354231974916  -  54.34659090909091
196  -  wine_187.csv
94.98412698412699  -  95.55555555555554
197  -  wine_973.csv
93.88888888888889  -  93.87301587301587
198  -  wisconsin_753.csv
46.41025641025641  -  54.64237516869096
199  -  yeast_181.csv
55.457047957047955  -  58.760806260806255
200  -  zoo_965.csv
97.0  -  99.0
Optimized Datasets:  90


In [15]:
# Deadline, calcular a moda dos hiperparametros dentre os 200 e checar quantos foram melhorados com isso
file = open('result_1.json')
d = json.load(file)

hyperparam_deadline = {'max_depth': [], 'min_child_weight': [], 'gamma': [], 'subsample': [], 'colsample_bytree': []}


print(d['datasets'][0]['params'])
for i in d['datasets']:
#     print(i['params'])
    hyperparam_deadline['max_depth'].append(i['params']['max_depth'])
    hyperparam_deadline['min_child_weight'].append(i['params']['min_child_weight'])
    hyperparam_deadline['gamma'].append(i['params']['gamma'])
    hyperparam_deadline['subsample'].append(i['params']['subsample'])
    hyperparam_deadline['colsample_bytree'].append(i['params']['colsample_bytree'])

    
hyperparam_deadline['max_depth'] = [max(set(hyperparam_deadline['max_depth']), key=hyperparam_deadline['max_depth'].count)]
hyperparam_deadline['min_child_weight'] = [max(set(hyperparam_deadline['min_child_weight']), key=hyperparam_deadline['min_child_weight'].count)]
hyperparam_deadline['gamma'] = [max(set(hyperparam_deadline['gamma']), key=hyperparam_deadline['gamma'].count)]
hyperparam_deadline['subsample'] = [max(set(hyperparam_deadline['subsample']), key=hyperparam_deadline['subsample'].count)]
hyperparam_deadline['colsample_bytree'] = [max(set(hyperparam_deadline['colsample_bytree']), key=hyperparam_deadline['colsample_bytree'].count)]

optimized_datasets = 0
    
for i in range(0, len(onlyfiles)):
    if(i == 48):
        continue
    print(i, ' - ', onlyfiles[i])
    path = ('RealDatasets/' + onlyfiles[i])
    df = pd.read_csv(path)
    df.columns = [*df.columns[:-1], 'target']
    
    X = df[df.columns[:-1]]
    y = df['target']
    
    k_folds = 5 
    for j in df['target'].value_counts():
        if(j < k_folds):
            k_folds = j
            break
    
    answer = 0
    if(k_folds != 1):
        grid_search = GridSearchCV(
            estimator = estimator,
            param_grid = hyperparam_deadline,
            cv = k_folds
        )
        grid_search.fit(X, y)
        answer = grid_search.best_score_*100
    
        answer_default = xgboost(df)
    else:
        continue
    
    if(answer_default[0] > answer):
        optimized_datasets+=1
    print(answer_default[0], ' - ', answer)
    
print('Optimized Datasets: ', optimized_datasets)

{'max_depth': 3, 'min_child_weight': 1, 'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.2}
0  -  acute-inflammations_1556.csv
95.83333333333334  -  95.83333333333334
1  -  allbp_40707.csv
97.61399687318846  -  97.5080366082878
2  -  analcatdata_authorship_458.csv
96.79487179487178  -  97.50563539025077
3  -  analcatdata_challenger_1013.csv
86.34920634920634  -  86.34920634920634
4  -  analcatdata_chlamydia_875.csv
82.0  -  81.0
5  -  analcatdata_creditscore_461.csv
99.0  -  99.0
6  -  analcatdata_germangss_1025.csv
93.5  -  94.0
7  -  analcatdata_happiness_40709.csv
48.33333333333333  -  46.666666666666664
8  -  analcatdata_lawsuit_450.csv
95.4644412191582  -  95.4644412191582
9  -  analcatdata_vineyard_724.csv
53.427133379089454  -  57.67787691603752
10  -  analcatdata_wildcat_748.csv
75.41666666666667  -  74.81060606060606
11  -  ar1_1059.csv
93.36666666666666  -  93.36666666666666
12  -  ar4_1061.csv
84.11255411255411  -  84.11255411255411
13  -  ar6_1064.csv
85.19047619047619  

KeyboardInterrupt: 

In [None]:
def frequency(a, x):
    count = 0
    for i in a:
        if(i == x):
            count += 1
    return count

def plot(array, title):
    b = [i for i in range(1, 11)]
    a = [0 for i in range(0, 10)]
    
    for i in range(0, 10):
        a[i] = frequency(array, i)
    plt.title(title)
    plt.bar(b, a)
    

plot(mean_count_according_k, 'Para cada K de 1 a 10, a frequencia de datasets que foram melhorados através da média dos hiperparâmetros')

In [None]:
plot(mode_count_according_k, 'Para cada K de 1 a 10, a frequencia de datasets que foram melhorados através da moda dos hiperparâmetros')

In [None]:
plot(mean_good_count_according_k, 'Para cada K de 1 a 10, a frequencia de datasets que foram melhorados através da média dos hiperparâmetros apenas utilizando datasets bons')

In [None]:
plot(mode_good_count_according_k, 'Para cada K de 1 a 10, a frequencia de datasets que foram melhorados através da moda dos hiperparâmetros apenas utilizando datasets bons')