# Creating Examples as datasets

In [1]:
!mkdir bases

mkdir: cannot create directory ‘bases’: File exists


In [20]:
from sklearn.datasets import make_blobs
import pandas as pd
for i in range(2):
    i+=1
    X, y = make_blobs(n_samples=1000, centers=5, n_features=3,cluster_std=2)
    X = pd.DataFrame(X)
    
    # Define the mapping of values to replacements
    replacement_mapping = {
        0: 'A',
        1: 'B',
        2: 'C',
        3: 'D',
        4: 'E'
    }

    # Replace values in the list
    y = [replacement_mapping.get(value, value) for value in y]
    X['string'] = [replacement_mapping[i%3] for i in range(len(X))]
    X['class'] = y
    X.to_csv(f'bases/df{i}.csv',index=False,decimal=',',sep=';')

In [28]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from experimenter import ModelRunner

# Configurando

In [29]:
runner = ModelRunner()


def custom_reader(filename):
    data = pd.read_csv(filename,decimal=',',sep=';')
    return data


#Função personalizada para leitura do dataset
runner.data.reader = custom_reader

#Bases

runner.data.add_datasets(['bases/', #Path to folder
                          'bases/df1.csv', #Path to file
                          (X,'X'), #Dataframe with name
                          X #Dataframe without name
                          ])

#Splits
runner.splits.add_holdout([.3]) #Holdouts
runner.splits.add_fold([3]) #Kfolds

pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Apply standard normalization
    ('bagging',  BaggingClassifier(estimator=knn()))
])

#Modelos
runner.models.add_models(['DecisionTreeClassifier', 
                         BaggingClassifier(estimator=knn()),
                         pipeline]
                         )

#Métricas
runner.metrics.add_score(['matrix',
                          'acc',
                          'f1_macro'] #Equivalente a  f1_score(y_true, y_pred, average='micro'), f1_score só funciona para binários
                          )

#Seeds
runner.random.add_model_seed([42])
#runner.random.add_model_seed([42,43])
#runner.random.add_split_seed([10,12])

runner.summarize()

runner.save_path ='output.pkl'

Number of dataframes: 5
Metrics used: confusion_matrix, accuracy_score, f1_score
Split method:
     - Holdouts: [0.3]
     - Folds: [3]
Seeds methods:
     - Each model will run 1 times witch seeds: [42]
     - Each split will run 1 times witch seeds: [42]
Models used: DecisionTreeClassifier, BaggingClassifier, Pipeline


In [30]:
#Acessando um dataset
runner.data.datasets[-1].get_feature()

Unnamed: 0,0,1,2,string_A,string_B,string_C
0,6.419566,3.608102,-2.575205,1,0,0
1,9.102536,4.767435,0.154750,0,1,0
2,8.295302,1.728823,-2.656761,0,0,1
3,-2.090848,-5.671698,8.577922,1,0,0
4,9.129001,8.540071,-1.147410,0,1,0
...,...,...,...,...,...,...
995,-2.603826,-6.909704,8.084810,0,0,1
996,-5.999522,7.049011,10.858013,1,0,0
997,4.682709,6.851010,-7.465668,0,1,0
998,-8.988807,-2.804796,0.008418,0,0,1


# Executando

Para os dados não serem perdidos ao acontecer um erro save_each_evaluation salva atualiza o arquivo result.pkl a cada execução.

In [31]:
runner.run(save_each_evaluation=True)

In [32]:
pd.DataFrame(runner.results)

Unnamed: 0,df_name,model_name,model_params,split_type,split_value,model_seed,split_seed,confusion_matrix,accuracy_score,f1_score
0,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",holdout,0.3,42.0,42,"[[61, 6, 0, 0, 0], [2, 58, 0, 0, 0], [0, 0, 38...",0.696667,0.683968
1,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",fold_1/3,3.0,42.0,42,"[[66, 11, 0, 0, 0], [2, 60, 0, 0, 0], [0, 0, 4...",0.682635,0.675196
2,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",fold_2/3,3.0,42.0,42,"[[54, 1, 0, 0, 0], [7, 63, 0, 1, 0], [0, 0, 38...",0.693694,0.703183
3,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",fold_3/3,3.0,42.0,42,"[[68, 4, 0, 0, 0], [3, 60, 0, 0, 0], [0, 0, 37...",0.732733,0.730356
4,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",holdout,0.3,42.0,42,"[[61, 7, 0, 0, 0], [2, 59, 0, 0, 0], [0, 0, 35...",0.736667,0.727318
5,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",fold_1/3,3.0,42.0,42,"[[64, 6, 0, 0, 0], [4, 65, 0, 0, 0], [0, 0, 43...",0.745509,0.741444
6,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",fold_2/3,3.0,42.0,42,"[[57, 0, 0, 0, 0], [4, 65, 0, 0, 0], [0, 0, 46...",0.72973,0.737694
7,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",fold_3/3,3.0,42.0,42,"[[71, 2, 0, 0, 0], [0, 62, 0, 0, 0], [0, 0, 32...",0.762763,0.757873
8,bases/df1.csv,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",holdout,0.3,,42,"[[62, 8, 0, 0, 0], [1, 58, 0, 0, 0], [0, 0, 41...",0.756667,0.748106
9,bases/df1.csv,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",fold_1/3,3.0,,42,"[[66, 8, 0, 0, 0], [2, 63, 0, 0, 0], [0, 0, 48...",0.766467,0.761614


In [33]:
pd.read_pickle("result.pkl")

Unnamed: 0,df_name,model_name,model_params,split_type,split_value,model_seed,split_seed,confusion_matrix,accuracy_score,f1_score
0,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",holdout,0.3,42,42,"[[103, 0, 0], [0, 99, 0], [0, 0, 98]]",1.0,1.0
1,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",fold_1/3,3.0,42,42,"[[110, 0, 0], [0, 110, 1], [0, 0, 113]]",0.997006,0.997006
2,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",fold_2/3,3.0,42,42,"[[118, 0, 0], [0, 113, 0], [0, 0, 102]]",1.0,1.0
3,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",fold_3/3,3.0,42,42,"[[106, 0, 0], [0, 109, 0], [0, 1, 117]]",0.996997,0.996997
4,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",holdout,0.3,42,42,"[[103, 0, 0], [0, 99, 0], [0, 0, 98]]",1.0,1.0
5,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",fold_1/3,3.0,42,42,"[[110, 0, 0], [0, 110, 0], [0, 0, 114]]",1.0,1.0
6,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",fold_2/3,3.0,42,42,"[[118, 0, 0], [0, 113, 0], [0, 0, 102]]",1.0,1.0
7,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",fold_3/3,3.0,42,42,"[[106, 0, 0], [0, 110, 0], [0, 0, 117]]",1.0,1.0
8,bases/df2.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",holdout,0.3,42,42,"[[94, 0, 0], [0, 105, 0], [0, 0, 101]]",1.0,1.0
9,bases/df2.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",fold_1/3,3.0,42,42,"[[112, 0, 0], [0, 113, 0], [0, 0, 109]]",1.0,1.0
