In [1]:
import pandas as pd
import numpy as np
import math
import xgboost as xgb
import json

from scipy import spatial
from pymfe.mfe import MFE
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [5]:
# Module-level accumulators shared by the helper functions of this notebook.
dataset_names = []      # dataset labels, in the order they were registered
dataframes = []         # one DataFrame per registered dataset
dataset_answers = []    # (params_test result, meta-feature dict) per dataset

# Meta-feature groups handed to pymfe's MFE(...)
meta_features_groups = ['general']

# Result containers; each one holds a 'datasets' list that is filled incrementally
json_result = {'datasets': []}
json_mfe_result = {'datasets': []}

# 0/1 switch; in the visible cells it is only referenced from commented-out code
isHyperparameterTest = 0

def mfe_extract(dataframe):
    """Extract pymfe meta-features from a dataframe that has a 'target' column.

    The 'target' column is used as the label vector and every other column as
    the feature matrix. Meta-features whose value prints as 'nan' are left out.

    Side effect: the result is also recorded through build_json_mfe_object
    under the most recently appended entry of ``dataset_names``, so the caller
    must append the dataset name before calling this function.

    Returns:
        dict: meta-feature name -> value converted to np.float64.
    """
    labels = dataframe['target'].tolist()
    features = dataframe.drop('target', axis=1).values

    extractor = MFE(meta_features_groups)
    extractor.fit(features, labels)
    extracted = extractor.extract()
    names, values = extracted[0], extracted[1]

    # Keep only the meta-features that have a usable (non-NaN) value.
    result = {
        name: np.float64(values[position])
        for position, name in enumerate(names)
        if str(values[position]) != 'nan'
    }

    build_json_mfe_object(dataset_names[-1], result)

    return result

def xgboost(dataframe):
    """Grid-search an XGBClassifier on ``dataframe`` and record the scores.

    Every column except the last one is used as a feature and the 'target'
    column as the label (this assumes 'target' is the last column — callers
    must arrange the frame accordingly).

    Two searches are run with GridSearchCV's default cross-validation:
    one over the single-candidate ``default_parameters`` grid (baseline) and
    one over the reduced ``parameters`` grid. Both scores (as percentages) and
    the best parameters are stored through build_json_object under the most
    recently appended entry of ``dataset_names``.

    Returns:
        tuple: (best estimator of the tuned search, its best score * 100).
    """
    X = dataframe[dataframe.columns[:-1]]
    y = dataframe['target']

    estimator = xgb.XGBClassifier()

    # Parameter reference: https://xgboost.readthedocs.io/en/latest/parameter.html
    # Wider grid, kept for reference (disabled):
    #     parameters = {
    #         'max_depth': range(1, 11, 1),
    #         'min_child_weight': range(1, 11, 1),
    #         'gamma': [0, 0.5, 1, 1.5, 2, 5],
    #         'subsample': [0.2, 0.4, 0.5, 0.7, 1.0],
    #         'colsample_bytree': [0.2, 0.4, 0.6, 0.8, 1.0]
    #     }

    # Reduced grid
    parameters = {
        'max_depth': range(3, 8, 1),  # Maximum depth = more overfit
        'min_child_weight': range(1, 3, 1),
        # Fixed: this used to read "0,5" (decimal comma), which expanded to the
        # two candidates 0 and 5 instead of the intended 0.5.
        'gamma': [0, 0.2, 0.5, 1],
        'subsample': [0.2, 0.5, 1.0],
        'colsample_bytree': [0.2, 0.5, 1.0]
    }

    # Single-candidate grid used to obtain the baseline score
    default_parameters = {
        'max_depth': [6],
        'min_child_weight': [1],
        'gamma': [0],
        'subsample': [1],
        'colsample_bytree': [1]
    }

    # Baseline: cross-validated score with the default parameters
    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=default_parameters,
    )
    grid_search.fit(X, y)
    default_value = grid_search.best_score_ * 100

    # Tuned: best cross-validated score over the reduced grid
    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=parameters,
    )
    grid_search.fit(X, y)
    best_value = grid_search.best_score_ * 100

    build_json_object(dataset_names[-1], default_value, best_value, grid_search.best_params_)

    return (grid_search.best_estimator_, best_value)
    
def params_test(dataframe):
    """Placeholder for the hyperparameter experiments on ``dataframe``.

    The call to xgboost(dataframe) is currently disabled, so a new empty list
    is returned on every call.

    TODO: test more hyperparameters.
    """
    outcomes = list()

    # outcomes.append(xgboost(dataframe))

    return outcomes

def checkIfDataNameExistsInJson(data_name, enabled=False, path='result.json'):
    """Tell whether ``data_name`` is already listed in the saved results file.

    The lookup is disabled by default: with ``enabled=False`` the function
    returns False without touching the disk, which is what every existing
    caller relies on. Pass ``enabled=True`` to actually consult the file.

    Args:
        data_name: dataset name to look for under the 'datasets' key.
        enabled: when False (default), skip the lookup and return False.
        path: JSON file to read when the lookup is enabled.

    Returns:
        bool: True if an entry with that name exists, False otherwise
        (including when the file does not exist yet).
    """
    if not enabled:
        return False

    try:
        # The context manager closes the handle even if json.load fails.
        with open(path) as handle:
            stored = json.load(handle)
    except FileNotFoundError:
        # No results file yet means nothing has been processed.
        return False

    for entry in stored.get('datasets', []):
        if entry['name'] == data_name:
            print("Já tem!")
            return True
    return False
    

# XGBoost does not handle categorical/object columns.
# This helper turns the object-dtype columns into numeric ones.
def toNumericalColumn(df):
    """Replace every object-dtype column of ``df`` with its category codes.

    The frame is modified in place and the same object is returned.
    """
    object_cols = df.select_dtypes(['object']).columns
    # Convert to 'category' first so that .cat.codes is available per column.
    df[object_cols] = df[object_cols].astype('category')
    codes = df[object_cols].apply(lambda column: column.cat.codes)
    df[object_cols] = codes
    return df

# Load the previously saved results. The context manager closes the handle
# once the JSON has been parsed (it used to stay open).
with open('result.json') as file:
    data = json.load(file)
    
def getDatasetInfo(name):
    """Return the entry of ``data['datasets']`` whose 'name' equals ``name``.

    When several entries match, the last one wins; when none matches, an
    empty string is returned.
    """
    found = ''
    for entry in data['datasets']:
        found = entry if entry['name'] == name else found
    return found

def testHyperparams(dataframe, name):
    """Unfinished stub: print the stored default-parameter score for ``name``.

    ``dataframe`` is not used yet. A later cell defines another
    ``testHyperparams(dataframe, data)``, which replaces this one once run.
    """
    own_entry = getDatasetInfo(name)
    print('Resultado como os hiperparâmetros default para ', name, ':')
    print(own_entry['default_params_result'])

    # Walk over the entries of the other datasets; nothing is done with them yet.
    for entry in data['datasets']:
        if entry['name'] != name:
            other_entry = entry

    # TODO: evaluate `dataframe` with the parameters stored in each other entry,
    # e.g. GridSearchCV(estimator=..., param_grid=other_entry['params']).fit(X, y)
    # and report best_score_ * 100.

def solve(data, data_name):
    """Register a bunch-style dataset and extract its meta-features.

    ``data`` is indexed with 'data', 'target' and 'feature_names' (the
    notebook passes the scikit-learn loaders' return values). Note that the
    parameter shadows the module-level ``data`` inside this function.

    Appends to ``dataset_names``, ``dataframes`` and ``dataset_answers``.
    """
    if checkIfDataNameExistsInJson(data_name):
        return

    # Feature matrix with the label appended as a final 'target' column.
    values = np.c_[data['data'], data['target']]
    labels = np.append(data['feature_names'], ['target'])
    dataframe = pd.DataFrame(values, columns=labels)

    # The name must be registered first: mfe_extract reads the latest entry.
    dataset_names.append(data_name)
    mfe_list = mfe_extract(dataframe)

    dataframes.append(dataframe)

    # The hyperparameter-transfer branch (isHyperparameterTest / testHyperparams)
    # is disabled for now; only params_test runs.
    answer = (params_test(dataframe), mfe_list)
    dataset_answers.append(answer)

In [6]:
# Kaggle datasets (CSV files stored under DatasetsMestrado/)
def _solve_csv_dataset(name, csv_file, target_column, prepare=None,
                       encode_categoricals=False, skip_if_known=False):
    """Shared pipeline behind every solve*() function of this cell.

    Args:
        name: label appended to ``dataset_names``.
        csv_file: file name inside the DatasetsMestrado/ directory.
        target_column: column that is renamed to 'target'.
        prepare: optional callable ``df -> df`` applied after the rename and
            before the meta-feature extraction.
        encode_categoricals: when True, object columns are turned into
            category codes (toNumericalColumn) after the extraction.
        skip_if_known: when True, do nothing if checkIfDataNameExistsInJson
            reports the dataset as already processed.

    Appends to ``dataset_names``, ``dataframes`` and ``dataset_answers``.
    """
    if skip_if_known and checkIfDataNameExistsInJson(name):
        return

    df = pd.read_csv('DatasetsMestrado/' + csv_file)
    df = df.rename(columns={target_column: 'target'})

    # mfe_extract labels its JSON entry with the most recently registered
    # name, so the name has to be appended before the extraction runs.
    dataset_names.append(name)

    if prepare is not None:
        df = prepare(df)

    # Meta-features are taken before categorical columns become numeric codes.
    mfe_list = mfe_extract(df)

    if encode_categoricals:
        df = toNumericalColumn(df)
    dataframes.append(df)
    dataset_answers.append((params_test(df), mfe_list))


def _encode_male_female(df):
    """Replace the strings 'Male'/'Female' with 1/0 wherever they occur."""
    return df.replace({'Male': 1, 'Female': 0})


def _drop_gender_target_last(df):
    """Drop the 'Gender' column and move 'target' to the last position."""
    target = df.pop('target')
    df.pop('Gender')
    df['target'] = target
    return df


def solveDiabetesDataset():
    """Process diabetes.csv (label column 'Outcome') as 'Diabetes'."""
    _solve_csv_dataset('Diabetes', 'diabetes.csv', 'Outcome', skip_if_known=True)


def solveBankNoteAuthenticationDataset():
    """Process BankNote_Authentication.csv (label column 'class') as 'BankNote'."""
    _solve_csv_dataset('BankNote', 'BankNote_Authentication.csv', 'class',
                       skip_if_known=True)


def solveGlassClassification():
    """Process glass.csv (label column 'Type') as 'Glass'."""
    _solve_csv_dataset('Glass', 'glass.csv', 'Type', skip_if_known=True)


def solveIndianLiverPatientClassification():
    """Process indian_liver_patient.csv (label column 'Dataset') as 'IndianLiver'.

    'Male'/'Female' strings are replaced by 1/0 before the extraction.
    """
    _solve_csv_dataset('IndianLiver', 'indian_liver_patient.csv', 'Dataset',
                       prepare=_encode_male_female)


def solveObesityClassification():
    """Process Obesity.csv (label column 'NObeyesdad').

    Source: https://archive.ics.uci.edu/ml/datasets/Estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition+
    """
    _solve_csv_dataset('ObesityLevelBasedOnEatingHabits', 'Obesity.csv', 'NObeyesdad',
                       encode_categoricals=True)


def solveWebsitePhishingClassification():
    """Process WebsitePhishing.csv (label column 'Result') as 'WebsitePhishing'."""
    _solve_csv_dataset('WebsitePhishing', 'WebsitePhishing.csv', 'Result')


def solveMuskDatasetClassification():
    """Process MuskDataset.csv (label column 'class') as 'Musk'."""
    _solve_csv_dataset('Musk', 'MuskDataset.csv', 'class', encode_categoricals=True)


def solvePenbasedClassification():
    """Process Penbased.csv as 'Penbased'; the label column is named ' 1' (leading space)."""
    _solve_csv_dataset('Penbased', 'Penbased.csv', ' 1')


def solveVehiclesSilhouetteClassification():
    """Process VehiclesSilhouette.csv (label column 'class') as 'VehiclesSilhouette'."""
    _solve_csv_dataset('VehiclesSilhouette', 'VehiclesSilhouette.csv', 'class')


def solveBankMarketingClassification():
    """Process BankMarketing.csv (label column 'deposit') as 'BankMarketing'."""
    _solve_csv_dataset('BankMarketing', 'BankMarketing.csv', 'deposit',
                       encode_categoricals=True)


def solveDevIndexClassification():
    """Process DevIndex.csv (label column 'Development Index') as 'DevIndex'."""
    _solve_csv_dataset('DevIndex', 'DevIndex.csv', 'Development Index')


def solveTravelInsuranceClasification():
    """Process TravelInsurance.csv (label column 'Claim') as 'TravelInsurance'.

    The 'Gender' column is dropped and 'target' is moved to the last position
    before the extraction.
    """
    _solve_csv_dataset('TravelInsurance', 'TravelInsurance.csv', 'Claim',
                       prepare=_drop_gender_target_last, encode_categoricals=True)


def solveIncomeEvaluationClassification():
    """Process income_evaluation.csv as 'IncomeEvaluation'; the label column is ' income' (leading space)."""
    _solve_csv_dataset('IncomeEvaluation', 'income_evaluation.csv', ' income',
                       encode_categoricals=True)


In [7]:
def build_json_object(dataset_name, default_params_result, best_params_result, params):
    """Append one dataset's tuning summary to ``json_result['datasets']``.

    Only the five tuned hyperparameters are copied out of ``params``; a
    missing one raises KeyError.
    """
    tuned_keys = ('max_depth', 'min_child_weight', 'gamma', 'subsample', 'colsample_bytree')
    entry = {
        'name': dataset_name,
        'default_params_result': default_params_result,
        'best_params_result': best_params_result,
        'params': {key: params[key] for key in tuned_keys},
    }
    json_result['datasets'].append(entry)

def build_json_mfe_object(dataset_name, mfe):
    """Print a dataset's meta-features and append them to ``json_mfe_result['datasets']``."""
    print('dataset name: ', dataset_name)
    print('mfe: ', mfe)
    record = dict(name=dataset_name, mfe=mfe)
    json_mfe_result['datasets'].append(record)

In [8]:
# More datasets will be injected here.
# Register scikit-learn's Iris dataset and extract its meta-features.
solve(load_iris(), 'Iris')

dataset name:  Iris
mfe:  {'attr_conc.mean': 0.20980476831180148, 'attr_conc.sd': 0.1195879817732128, 'attr_ent.mean': 2.2771912775084115, 'attr_ent.sd': 0.06103943244855649, 'attr_to_inst': 0.02666666666666667, 'cat_to_num': 0.0, 'class_conc.mean': 0.27347384133126745, 'class_conc.sd': 0.14091096327223987, 'class_ent': 1.584962500721156, 'eq_num_attr': 1.8780672345507194, 'freq_class.mean': 0.3333333333333333, 'freq_class.sd': 0.0, 'inst_to_attr': 37.5, 'joint_ent.mean': 3.0182209990602855, 'joint_ent.sd': 0.3821875549207214, 'mut_inf.mean': 0.8439327791692818, 'mut_inf.sd': 0.4222019352579773, 'nr_attr': 4.0, 'nr_bin': 0.0, 'nr_cat': 0.0, 'nr_class': 3.0, 'nr_inst': 150.0, 'nr_num': 4.0, 'ns_ratio': 1.698308838945616}


In [9]:
# Register scikit-learn's Wine dataset and extract its meta-features.
solve(load_wine(), 'Wine')

dataset name:  Wine
mfe:  {'attr_conc.mean': 0.07957508423380168, 'attr_conc.sd': 0.05203653464617564, 'attr_ent.mean': 2.317270671174068, 'attr_ent.sd': 0.008810632085191748, 'attr_to_inst': 0.07303370786516854, 'cat_to_num': 0.0, 'class_conc.mean': 0.1528450520627143, 'class_conc.sd': 0.07145112425991589, 'class_ent': 1.5668222768551812, 'eq_num_attr': 3.182758001633378, 'freq_class.mean': 0.3333333333333333, 'freq_class.sd': 0.06462709335856702, 'inst_to_attr': 13.692307692307692, 'joint_ent.mean': 3.391808496603444, 'joint_ent.sd': 0.2331019435755677, 'mut_inf.mean': 0.49228445142580574, 'mut_inf.sd': 0.23706149243295327, 'nr_attr': 13.0, 'nr_bin': 0.0, 'nr_cat': 0.0, 'nr_class': 3.0, 'nr_inst': 178.0, 'nr_num': 13.0, 'ns_ratio': 3.707178267488535}


In [10]:
# Register scikit-learn's Breast Cancer dataset and extract its meta-features.
solve(load_breast_cancer(), 'Breast Cancer')

dataset name:  Breast Cancer
mfe:  {'attr_conc.mean': 0.12540241009394565, 'attr_conc.sd': 0.15743690730463034, 'attr_ent.mean': 2.999955970180525, 'attr_ent.sd': 4.8835063889264575e-05, 'attr_to_inst': 0.05272407732864675, 'cat_to_num': 0.0, 'class_conc.mean': 0.05158464086252799, 'class_conc.sd': 0.03583982015886094, 'class_ent': 0.9526351224018599, 'eq_num_attr': 3.1052455419256693, 'freq_class.mean': 0.5, 'freq_class.sd': 0.18019417095263515, 'inst_to_attr': 18.966666666666665, 'joint_ent.mean': 3.6458085500942636, 'joint_ent.sd': 0.22568425948582174, 'mut_inf.mean': 0.306782542488121, 'mut_inf.sd': 0.22569828738894582, 'nr_attr': 30.0, 'nr_bin': 0.0, 'nr_cat': 0.0, 'nr_class': 2.0, 'nr_inst': 569.0, 'nr_num': 30.0, 'ns_ratio': 8.778770153770036}


In [11]:
# Load DatasetsMestrado/diabetes.csv and extract its meta-features.
solveDiabetesDataset()



dataset name:  Diabetes
mfe:  {'attr_conc.mean': 0.028840438796475666, 'attr_conc.sd': 0.03211053699372291, 'attr_ent.mean': 2.958105757854844, 'attr_ent.sd': 0.34168940624694744, 'attr_to_inst': 0.010416666666666666, 'cat_to_num': 0.0, 'class_conc.mean': 0.01248981695807782, 'class_conc.sd': 0.008523298017724572, 'class_ent': 0.9331343166407832, 'eq_num_attr': 12.954877983803843, 'freq_class.mean': 0.5, 'freq_class.sd': 0.21360517348343622, 'inst_to_attr': 96.0, 'joint_ent.mean': 3.8192105024836493, 'joint_ent.sd': 0.3357632924270967, 'mut_inf.mean': 0.07202957201197768, 'mut_inf.sd': 0.05160877219809383, 'nr_attr': 8.0, 'nr_bin': 0.0, 'nr_cat': 0.0, 'nr_class': 2.0, 'nr_inst': 768.0, 'nr_num': 8.0, 'ns_ratio': 40.06793467220582}


In [12]:
# Load DatasetsMestrado/BankNote_Authentication.csv and extract its meta-features.
solveBankNoteAuthenticationDataset()

dataset name:  BankNote
mfe:  {'attr_conc.mean': 0.07073367549921593, 'attr_conc.sd': 0.03808910374616643, 'attr_ent.mean': 3.4594097747073844, 'attr_ent.sd': 1.6114746737255955e-05, 'attr_to_inst': 0.0029154518950437317, 'cat_to_num': 0.0, 'class_conc.mean': 0.024286779730748016, 'class_conc.sd': 0.024028284464771867, 'class_ent': 0.9911281257467459, 'eq_num_attr': 4.650453395733756, 'freq_class.mean': 0.5, 'freq_class.sd': 0.07833836059792652, 'inst_to_attr': 343.0, 'joint_ent.mean': 4.2374128476031565, 'joint_ent.sd': 0.22353036055417838, 'mut_inf.mean': 0.2131250528509735, 'mut_inf.sd': 0.22354371307580517, 'nr_attr': 4.0, 'nr_bin': 0.0, 'nr_cat': 0.0, 'nr_class': 2.0, 'nr_inst': 1372.0, 'nr_num': 4.0, 'ns_ratio': 15.231830694847298}


In [13]:
# Load DatasetsMestrado/glass.csv and extract its meta-features.
solveGlassClassification()



dataset name:  Glass
mfe:  {'attr_conc.mean': 0.08646069983674642, 'attr_conc.sd': 0.06958254586990041, 'attr_ent.mean': 2.0168573913192267, 'attr_ent.sd': 0.620017868913917, 'attr_to_inst': 0.04205607476635514, 'cat_to_num': 0.0, 'class_conc.mean': 0.17215668455304822, 'class_conc.sd': 0.1535834270241933, 'class_ent': 2.1765339923982006, 'eq_num_attr': 6.175966711718482, 'freq_class.mean': 0.16666666666666663, 'freq_class.sd': 0.13898760091092968, 'inst_to_attr': 23.77777777777778, 'joint_ent.mean': 3.840971415476929, 'joint_ent.sd': 0.5697344535835459, 'mut_inf.mean': 0.35241996824049804, 'mut_inf.sd': 0.14905199270121083, 'nr_attr': 9.0, 'nr_bin': 0.0, 'nr_cat': 0.0, 'nr_class': 6.0, 'nr_inst': 214.0, 'nr_num': 9.0, 'ns_ratio': 4.722880577365256}


In [14]:
# Load DatasetsMestrado/indian_liver_patient.csv and extract its meta-features.
solveIndianLiverPatientClassification()



dataset name:  IndianLiver
mfe:  {'attr_conc.mean': 0.08824324010922074, 'attr_conc.sd': 0.1642124724453355, 'attr_ent.mean': 2.381757029490394, 'attr_ent.sd': 1.0847537410875903, 'attr_to_inst': 0.017152658662092625, 'cat_to_num': 0.0, 'class_conc.mean': 0.061421784244719856, 'class_conc.sd': 0.15432932072387165, 'class_ent': 0.8640904272487623, 'eq_num_attr': 17.118946878030346, 'freq_class.mean': 0.5, 'freq_class.sd': 0.30200615525806235, 'inst_to_attr': 58.3, 'joint_ent.mean': 3.1953717800865413, 'joint_ent.sd': 1.0639848907591336, 'mut_inf.mean': 0.050475676652615556, 'mut_inf.sd': 0.03904433027635108, 'nr_attr': 10.0, 'nr_bin': 1.0, 'nr_cat': 0.0, 'nr_class': 2.0, 'nr_inst': 583.0, 'nr_num': 10.0, 'ns_ratio': 46.18623280440909}


In [15]:
# Load DatasetsMestrado/Obesity.csv and extract its meta-features.
solveObesityClassification() 



dataset name:  ObesityLevelBasedOnEatingHabits
mfe:  {'attr_conc.mean': 0.03275971679115075, 'attr_conc.sd': 0.05579684348292799, 'attr_ent.mean': 1.8635898668556545, 'attr_ent.sd': 1.3122305874684492, 'attr_to_inst': 0.007579346281383231, 'cat_to_num': 1.0, 'class_conc.mean': 0.12463234319268883, 'class_conc.sd': 0.0918079889589204, 'class_ent': 2.802569467267562, 'eq_num_attr': 8.471445721548022, 'freq_class.mean': 0.14285714285714285, 'freq_class.sd': 0.01271347189640241, 'inst_to_attr': 131.9375, 'joint_ent.mean': 4.335333928473868, 'joint_ent.sd': 1.1225332382326942, 'mut_inf.mean': 0.3308254056493485, 'mut_inf.sd': 0.3366486378491083, 'nr_attr': 16.0, 'nr_bin': 5.0, 'nr_cat': 8.0, 'nr_class': 7.0, 'nr_inst': 2111.0, 'nr_num': 8.0, 'ns_ratio': 4.633152215736804, 'num_to_cat': 1.0}


In [16]:
# Load DatasetsMestrado/WebsitePhishing.csv and extract its meta-features.
solveWebsitePhishingClassification()



dataset name:  WebsitePhishing
mfe:  {'attr_conc.mean': 0.04381278290883977, 'attr_conc.sd': 0.11731843631950295, 'attr_ent.mean': 0.943650944548291, 'attr_ent.sd': 0.7495535228418994, 'attr_to_inst': 0.002804161013116237, 'cat_to_num': 0.0, 'class_conc.mean': 0.05596835509741612, 'class_conc.sd': 0.11972100376898279, 'class_ent': 0.9906239227414301, 'eq_num_attr': 19.6717107240766, 'freq_class.mean': 0.5, 'freq_class.sd': 0.08052894052590348, 'inst_to_attr': 356.61290322580646, 'joint_ent.mean': 1.8839170749941592, 'joint_ent.sd': 0.7338870473115354, 'mut_inf.mean': 0.05035779229556205, 'mut_inf.sd': 0.12202448954841856, 'nr_attr': 31.0, 'nr_bin': 22.0, 'nr_cat': 0.0, 'nr_class': 2.0, 'nr_inst': 11055.0, 'nr_num': 31.0, 'ns_ratio': 17.73892602379738}


In [17]:
# Load DatasetsMestrado/MuskDataset.csv and extract its meta-features.
solveMuskDatasetClassification() 



dataset name:  Musk
mfe:  {'attr_conc.mean': 0.08309144518185785, 'attr_conc.sd': 0.045152766286091314, 'attr_ent.mean': 4.0942926653441845, 'attr_ent.sd': 0.7713418729149654, 'attr_to_inst': 0.025613822370415276, 'cat_to_num': 0.011976047904191617, 'class_conc.mean': 0.0061677917834434765, 'class_conc.sd': 0.005724258538808894, 'class_ent': 0.6200985590130879, 'eq_num_attr': 8.684983449144633, 'freq_class.mean': 0.5, 'freq_class.sd': 0.489123272102971, 'inst_to_attr': 39.0414201183432, 'joint_ent.mean': 4.6429922904797225, 'joint_ent.sd': 0.7341785458402638, 'mut_inf.mean': 0.07139893387754932, 'mut_inf.sd': 0.07716648471851716, 'nr_attr': 169.0, 'nr_bin': 0.0, 'nr_cat': 2.0, 'nr_class': 2.0, 'nr_inst': 6598.0, 'nr_num': 167.0, 'ns_ratio': 56.34389076965746, 'num_to_cat': 83.5}


In [18]:
# Load DatasetsMestrado/Penbased.csv and extract its meta-features.
solvePenbasedClassification()



dataset name:  Penbased
mfe:  {'attr_conc.mean': 0.028291618377972727, 'attr_conc.sd': 0.02377735270906875, 'attr_ent.mean': 3.956291457025694, 'attr_ent.sd': 0.4153336358849079, 'attr_to_inst': 0.001455736511691384, 'cat_to_num': 0.0, 'class_conc.mean': 0.09661180583828607, 'class_conc.sd': 0.0532381704339068, 'class_ent': 3.3207767110933735, 'eq_num_attr': 5.20903766418788, 'freq_class.mean': 0.1, 'freq_class.sd': 0.004210756566990803, 'inst_to_attr': 686.9375, 'joint_ent.mean': 6.639565250080347, 'joint_ent.sd': 0.5100132727173804, 'mut_inf.mean': 0.6375029180387204, 'mut_inf.sd': 0.20676087016868827, 'nr_attr': 16.0, 'nr_bin': 0.0, 'nr_cat': 0.0, 'nr_class': 10.0, 'nr_inst': 10991.0, 'nr_num': 16.0, 'ns_ratio': 5.2059189771197225}


In [19]:
# Load DatasetsMestrado/VehiclesSilhouette.csv and extract its meta-features.
solveVehiclesSilhouetteClassification()



dataset name:  VehiclesSilhouette
mfe:  {'attr_conc.mean': 0.3778830028973923, 'attr_conc.sd': 0.21268418452012988, 'attr_ent.mean': 0.6744729797701781, 'attr_ent.sd': 1.3020160572328283, 'attr_to_inst': 0.02127659574468085, 'cat_to_num': 0.0, 'class_conc.mean': 0.39726919329925975, 'class_conc.sd': 0.19788315474925042, 'class_ent': 1.4920242566934, 'eq_num_attr': 34.11930434053296, 'freq_class.mean': 0.3333333333333333, 'freq_class.sd': 0.15089799515442506, 'inst_to_attr': 47.0, 'joint_ent.mean': 2.122767615196919, 'joint_ent.sd': 1.219740371437467, 'mut_inf.mean': 0.04372962126665955, 'mut_inf.sd': 0.08701792588693116, 'nr_attr': 18.0, 'nr_bin': 0.0, 'nr_cat': 0.0, 'nr_class': 3.0, 'nr_inst': 846.0, 'nr_num': 18.0, 'ns_ratio': 14.42370960995291}


In [20]:
# Load DatasetsMestrado/BankMarketing.csv and extract its meta-features.
solveBankMarketingClassification() 



dataset name:  BankMarketing
mfe:  {'attr_conc.mean': 0.02037326611908932, 'attr_conc.sd': 0.07165677663497294, 'attr_ent.mean': 2.2438593126161313, 'attr_ent.sd': 1.5127765077719693, 'attr_to_inst': 0.0014334348683031715, 'cat_to_num': 1.2857142857142858, 'class_conc.mean': 0.015645053934445347, 'class_conc.sd': 0.01641663279613026, 'class_ent': 0.998024465973655, 'eq_num_attr': 23.27589831917426, 'freq_class.mean': 0.5, 'freq_class.sd': 0.03699609032547424, 'inst_to_attr': 697.625, 'joint_ent.mean': 3.199005757200962, 'joint_ent.sd': 1.49241480596778, 'mut_inf.mean': 0.042878021388824364, 'mut_inf.sd': 0.05574221229375209, 'nr_attr': 16.0, 'nr_bin': 3.0, 'nr_cat': 9.0, 'nr_class': 2.0, 'nr_inst': 11162.0, 'nr_num': 7.0, 'ns_ratio': 51.33122331528492, 'num_to_cat': 0.7777777777777778}


In [21]:
# Load DatasetsMestrado/DevIndex.csv and extract its meta-features.
solveDevIndexClassification() 

dataset name:  DevIndex
mfe:  {'attr_conc.mean': 0.08858722660460103, 'attr_conc.sd': 0.07998467446651997, 'attr_ent.mean': 2.5775844504542182, 'attr_ent.sd': 0.017175074562126205, 'attr_to_inst': 0.02666666666666667, 'cat_to_num': 0.0, 'class_conc.mean': 0.13739178098944485, 'class_conc.sd': 0.136764466300644, 'class_ent': 1.7804438466836467, 'eq_num_attr': 3.4762819381468155, 'freq_class.mean': 0.25, 'freq_class.sd': 0.14467218513682364, 'inst_to_attr': 37.5, 'joint_ent.mean': 3.8458592963597553, 'joint_ent.sd': 0.5016937026173055, 'mut_inf.mean': 0.5121690007781102, 'mut_inf.sd': 0.5017024568949772, 'nr_attr': 6.0, 'nr_bin': 0.0, 'nr_cat': 0.0, 'nr_class': 4.0, 'nr_inst': 225.0, 'nr_num': 6.0, 'ns_ratio': 4.032683443430266}


In [22]:
# Load DatasetsMestrado/TravelInsurance.csv and extract its meta-features.
solveTravelInsuranceClasification() 



dataset name:  TravelInsurance
mfe:  {'attr_conc.mean': 0.19041589623804792, 'attr_conc.sd': 0.25802247975279385, 'attr_ent.mean': 3.0198553292004298, 'attr_ent.sd': 1.760756998239279, 'attr_to_inst': 0.00014212171935697816, 'cat_to_num': 1.25, 'class_conc.mean': 0.003054362581661018, 'class_conc.sd': 0.0034382111582600796, 'class_ent': 0.1101720962924944, 'eq_num_attr': 13.763692764883277, 'freq_class.mean': 0.5, 'freq_class.sd': 0.6864047634952382, 'inst_to_attr': 7036.222222222223, 'joint_ent.mean': 3.1220228806244426, 'joint_ent.sd': 1.7594213627747446, 'mut_inf.mean': 0.00800454486848092, 'mut_inf.sd': 0.005083349779529738, 'nr_attr': 9.0, 'nr_bin': 2.0, 'nr_cat': 5.0, 'nr_class': 2.0, 'nr_inst': 63326.0, 'nr_num': 4.0, 'ns_ratio': 376.267587204309, 'num_to_cat': 0.8}


In [23]:
# Load DatasetsMestrado/income_evaluation.csv and extract its meta-features.
solveIncomeEvaluationClassification()



dataset name:  IncomeEvaluation
mfe:  {'attr_conc.mean': 0.04830237682447616, 'attr_conc.sd': 0.1395481952339741, 'attr_ent.mean': 2.2197528515718576, 'attr_ent.sd': 1.5139967849269165, 'attr_to_inst': 0.00042996222474739717, 'cat_to_num': 1.3333333333333333, 'class_conc.mean': 0.02703239097275378, 'class_conc.sd': 0.03320210419262275, 'class_ent': 0.7963839552022132, 'eq_num_attr': 12.006187880013178, 'freq_class.mean': 0.5, 'freq_class.sd': 0.3665506390973169, 'inst_to_attr': 2325.785714285714, 'joint_ent.mean': 2.9498056812610374, 'joint_ent.sd': 1.5033030990636995, 'mut_inf.mean': 0.0663311255130333, 'mut_inf.sd': 0.05319818541302973, 'nr_attr': 14.0, 'nr_bin': 1.0, 'nr_cat': 8.0, 'nr_class': 2.0, 'nr_inst': 32561.0, 'nr_num': 6.0, 'ns_ratio': 32.464724658346135, 'num_to_cat': 0.75}


In [21]:
# with open('result.json', 'w') as outfile:
#     json.dump(json_result, outfile)

# with open('mfe_result.json', 'w') as outfile:
#     json.dump(json_mfe_result, outfile)

In [24]:
def euclidean(v1, v2):
    """Euclidean distance between two sequences of numbers.

    Elements are paired with zip, so anything beyond the length of the
    shorter sequence is ignored.
    """
    squared_gaps = [(a - b) ** 2 for a, b in zip(v1, v2)]
    return sum(squared_gaps) ** 0.5

def calculates_metafeature_similarity(listA, listB):
    """Euclidean distance between two meta-feature dicts over their shared keys.

    Despite the name, the returned value is a distance: 0.0 means identical
    and larger values mean less similar.

    Args:
        listA, listB: dicts mapping meta-feature name -> numeric value
            (as returned by mfe_extract). Keys present in only one of them
            are ignored.

    Returns:
        float: euclidean distance between the value vectors of the shared keys.

    NOTE(review): the values are compared on their raw scales, so features
    with large magnitudes (e.g. 'nr_inst', 'inst_to_attr') dominate the
    distance — consider normalising each meta-feature before comparing.
    """
    # Shared keys in listA's order; dict membership makes this a single pass
    # instead of comparing every key of listA with every key of listB.
    shared = [name for name in listA if name in listB]

    valuesA = [listA[name] for name in shared]
    valuesB = [listB[name] for name in shared]

    # Alternative that was considered: 1 - spatial.distance.cosine(valuesA, valuesB)
    return euclidean(valuesA, valuesB)

In [25]:
# Distance between the first registered dataset and every registered dataset
# (itself included, which yields 0.0).
for i in range(len(dataset_names)):
    print("Resultado para", dataset_names[0], "e", dataset_names[i])
    reference_mfe = dataset_answers[0][1]
    candidate_mfe = dataset_answers[i][1]
    print(calculates_metafeature_similarity(reference_mfe, candidate_mfe))

Resultado para Iris e Iris
0.0
Resultado para Iris e Wine
38.97326781068882
Resultado para Iris e Breast Cancer
421.0830375962489
Resultado para Iris e Diabetes
622.0745275557308
Resultado para Iris e BankNote
1259.6864417561487
Resultado para Iris e Glass
66.12679717280945
Resultado para Iris e IndianLiver
436.1308656552664
Resultado para Iris e ObesityLevelBasedOnEatingHabits
1963.355137703559
Resultado para Iris e WebsitePhishing
10909.783673663162
Resultado para Iris e Musk
6452.40625233598
Resultado para Iris e Penbased
10860.452558652647
Resultado para Iris e VehiclesSilhouette
697.2131201102142
Resultado para Iris e BankMarketing
11031.911919808694
Resultado para Iris e DevIndex
75.11998598680881
Resultado para Iris e TravelInsurance
63563.587042628606
Resultado para Iris e IncomeEvaluation
32491.69737701376


In [32]:
# Reload the saved results; the context manager closes the handle once the
# JSON has been parsed (it used to stay open).
with open('result.json') as file:
    data = json.load(file)

# NOTE(review): file_mfe is opened but neither read nor closed in the visible
# cells. It is left open in case a later cell reads it — load it with a
# `with` block (or drop it) once that is confirmed.
file_mfe = open('mfe_result.json')

# Shared base estimator used by testHyperparams below.
estimator = xgb.XGBClassifier()    

def testHyperparams(dataframe, data):
    """Score ``dataframe`` with the hyperparameters stored for another dataset.

    Every column except the last one is used as a feature and 'target' as the
    label (assumes 'target' is the last column). The module-level
    ``estimator`` is evaluated through GridSearchCV over a grid built from
    ``data['params']``.

    Args:
        dataframe: frame to evaluate.
        data: one dataset entry holding a 'params' dict (as stored in
            result.json). Note that this parameter shadows the module-level
            ``data`` inside the function.

    Returns:
        float: best cross-validated score * 100.
    """
    X = dataframe[dataframe.columns[:-1]]
    y = dataframe['target']

    # GridSearchCV expects a list of candidates per parameter. Build a fresh
    # grid instead of rewriting data['params'] in place, so the entry loaded
    # from the results file is not modified as a side effect.
    param_grid = {
        key: value if isinstance(value, list) else [value]
        for key, value in data['params'].items()
    }

    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid
    )
    grid_search.fit(X, y)

    return grid_search.best_score_ * 100

def getDatasetInfo(name):
    """Look up ``name`` among the entries of ``data['datasets']``.

    Returns the last entry whose 'name' matches, or an empty string when no
    entry matches. (This repeats the definition from an earlier cell.)
    """
    matches = [entry for entry in data['datasets'] if entry['name'] == name]
    return matches[-1] if matches else ''
# for p in data['datasets']:
#     if(p['name'] == 'Iris'):
#         t1 = p

# For each dataset: print its stored default-parameter score, then score it
# with the parameters stored for every other dataset. Rows are tuples of
# (meta-feature distance, other dataset's name, score with its parameters)
# and are printed in ascending order of distance.
for i in range(len(dataset_names)):
    current_name = dataset_names[i]
    d1 = getDatasetInfo(current_name)
    print('Dataset: ', current_name)
    print('Resultado default: ', d1['default_params_result'])

    respostas = []
    for j in range(len(dataset_names)):
        if j != i:
            d2 = getDatasetInfo(dataset_names[j])
            distance = calculates_metafeature_similarity(dataset_answers[i][1], dataset_answers[j][1])
            other_name = dataset_names[j]
            score = testHyperparams(dataframes[i], d2)
            respostas.append((distance, other_name, score))

    respostas.sort()
    for j in respostas:
        print(j)
    print("----------")

Dataset:  Iris
Resultado default:  96.0
(38.97326781068882, 'Wine', 96.0)
(66.12679717280945, 'Glass', 96.66666666666669)
(75.11998598680881, 'DevIndex', 96.0)
(421.0830375962489, 'Breast Cancer', 96.0)
(436.1308656552664, 'IndianLiver', 95.33333333333334)
(622.0745275557308, 'Diabetes', 94.66666666666666)
(697.2131201102142, 'VehiclesSilhouette', 96.0)
(1259.6864417561487, 'BankNote', 96.0)
(1963.355137703559, 'ObesityLevelBasedOnEatingHabits', 96.0)
(6452.40625233598, 'Musk', 96.0)
(10860.452558652647, 'Penbased', 96.0)
(10909.783673663162, 'WebsitePhishing', 96.66666666666669)
(11031.911919808694, 'BankMarketing', 94.66666666666666)
(32491.69737701376, 'IncomeEvaluation', 95.33333333333334)
(63563.587042628606, 'TravelInsurance', 94.66666666666666)
----------
Dataset:  Wine
Resultado default:  94.98412698412699
(38.0779115043851, 'Glass', 96.09523809523812)
(38.97326781068882, 'Iris', 96.09523809523809)
(53.62351529651516, 'DevIndex', 97.77777777777779)
(391.8093345815493, 'Breast C

(179.07739428427595, 'BankMarketing', 95.52349670066758)
(338.54453303395223, 'WebsitePhishing', 96.54255946432761)
(4446.024710651049, 'Musk', 97.3432253307298)
(8897.337254739761, 'ObesityLevelBasedOnEatingHabits', 97.72536197071959)
(9625.171109535931, 'BankNote', 97.3432253307298)
(10165.213194695594, 'VehiclesSilhouette', 97.53429571966082)
(10240.137632443688, 'Diabetes', 95.83289368440695)
(10427.062344056452, 'IndianLiver', 97.25223351999276)
(10443.407408356017, 'Breast Cancer', 97.94372576499948)
(10785.581942722407, 'DevIndex', 97.53429158178857)
(10797.390335810756, 'Glass', 97.57978748715708)
(10833.942868861523, 'Wine', 97.53429158178857)
(10860.452558652647, 'Iris', 97.35231623605898)
(21632.192985161622, 'IncomeEvaluation', 97.50701472792869)
(52720.05042746071, 'TravelInsurance', 95.68729023573873)
----------
Dataset:  VehiclesSilhouette
Resultado default:  96.57361642882003
(99.00525416554781, 'Diabetes', 92.79081099895579)
(265.94989190105656, 'IndianLiver', 96.21789