# Creating Examples as datasets

In [37]:
!mkdir bases

mkdir: cannot create directory ‘bases’: File exists


In [38]:
from sklearn.datasets import make_blobs
import pandas as pd
for i in range(2):
    i+=1
    X, y = make_blobs(n_samples=1000, centers=3, n_features=5)
    X = pd.DataFrame(X)
    
    # Define the mapping of values to replacements
    replacement_mapping = {
        0: 'A',
        1: 'B',
        2: 'C'
    }

    # Replace values in the list
    y = [replacement_mapping.get(value, value) for value in y]
    X['string'] = [replacement_mapping[i%3] for i in range(len(X))]
    X['class'] = y
    X.to_csv(f'bases/df{i}.csv',index=False)

In [39]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier as knn
from experimenter import ModelRunner

# Configurando

In [40]:
runner = ModelRunner()

#Bases
runner.data.add_datasets(['bases/', #Path to folder
                          'bases/df1.csv', #Path to file
                          (X,'X'), #Dataframe with name
                          X #Dataframe without name
                          ])

#Splits
runner.splits.add_holdout([.3]) #Holdouts
runner.splits.add_fold([3]) #Kfolds


#Modelos
runner.models.add_models(['DecisionTreeClassifier', 
                         BaggingClassifier(estimator=knn())]
                         )

#Métricas
runner.metrics.add_score(['matrix',
                          'acc',
                          'f1_micro'] #Equivalente a  f1_score(y_true, y_pred, average='micro'), f1_score só funciona para binários
                          )

#Seeds
runner.random.add_model_seed([42])
#runner.random.add_model_seed([42,43])
#runner.random.add_split_seed([10,12])

runner.summarize()

runner.save_path ='output.pkl'

Number of dataframes: 5
Metrics used: confusion_matrix, accuracy_score, f1_score
Split method:
     - Holdouts: [0.3]
     - Folds: [3]
Seeds methods:
     - Each model will run 1 times witch seeds: [42]
     - Each split will run 1 times witch seeds: [42]
Models used: DecisionTreeClassifier, BaggingClassifier


In [41]:
#Acessando um dataset
runner.data.datasets[-1].get_feature()

Unnamed: 0,0,1,2,3,4,string_A,string_B,string_C
0,10.894861,-10.608544,-6.132924,5.129899,-5.648601,1,0,0
1,5.869636,-8.667056,-7.466404,-5.932534,9.594145,0,1,0
2,6.401782,-9.521154,-9.141618,-4.907643,8.644634,0,0,1
3,-4.709663,9.618539,6.923200,-5.326369,8.693210,1,0,0
4,8.232318,-9.408058,-5.796311,4.753442,-5.823453,0,1,0
...,...,...,...,...,...,...,...,...
995,10.340165,-7.780962,-5.566063,4.118860,-6.678683,0,0,1
996,-5.946630,8.127786,5.825708,-4.403631,8.628880,1,0,0
997,-4.237625,8.953113,7.578973,-4.661400,8.240181,0,1,0
998,7.906152,-10.206484,-10.394781,-7.155774,8.854445,0,0,1


# Executando

Para os dados não serem perdidos ao acontecer um erro save_each_evaluation salva atualiza o arquivo result.pkl a cada execução.

In [42]:
runner.run(save_each_evaluation=True)

In [43]:
pd.DataFrame(runner.results)

Unnamed: 0,df_name,model_name,model_params,split_type,split_value,model_seed,split_seed,confusion_matrix,accuracy_score,f1_score
0,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",holdout,0.3,42,42,"[[94, 0, 0], [0, 104, 0], [0, 0, 102]]",1.0,1.0
1,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",fold_1/3,3.0,42,42,"[[112, 0, 0], [0, 112, 0], [0, 0, 110]]",1.0,1.0
2,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",fold_2/3,3.0,42,42,"[[119, 0, 0], [0, 102, 0], [0, 0, 112]]",1.0,1.0
3,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",fold_3/3,3.0,42,42,"[[103, 0, 0], [0, 119, 0], [0, 0, 111]]",1.0,1.0
4,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",holdout,0.3,42,42,"[[94, 0, 0], [0, 104, 0], [0, 0, 102]]",1.0,1.0
5,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",fold_1/3,3.0,42,42,"[[112, 0, 0], [0, 112, 0], [0, 0, 110]]",1.0,1.0
6,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",fold_2/3,3.0,42,42,"[[119, 0, 0], [0, 102, 0], [0, 0, 112]]",1.0,1.0
7,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",fold_3/3,3.0,42,42,"[[103, 0, 0], [0, 119, 0], [0, 0, 111]]",1.0,1.0
8,bases/df2.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",holdout,0.3,42,42,"[[100, 0, 0], [0, 103, 0], [0, 0, 97]]",1.0,1.0
9,bases/df2.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",fold_1/3,3.0,42,42,"[[109, 0, 0], [0, 114, 0], [0, 0, 111]]",1.0,1.0


In [44]:
pd.read_pickle("result.pkl")

Unnamed: 0,df_name,model_name,model_params,split_type,split_value,model_seed,split_seed,confusion_matrix,accuracy_score,f1_score
0,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",holdout,0.3,42,42,"[[103, 0, 0], [0, 99, 0], [0, 0, 98]]",1.0,1.0
1,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",fold_1/3,3.0,42,42,"[[110, 0, 0], [0, 110, 1], [0, 0, 113]]",0.997006,0.997006
2,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",fold_2/3,3.0,42,42,"[[118, 0, 0], [0, 113, 0], [0, 0, 102]]",1.0,1.0
3,bases/df1.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",fold_3/3,3.0,42,42,"[[106, 0, 0], [0, 109, 0], [0, 1, 117]]",0.996997,0.996997
4,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",holdout,0.3,42,42,"[[103, 0, 0], [0, 99, 0], [0, 0, 98]]",1.0,1.0
5,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",fold_1/3,3.0,42,42,"[[110, 0, 0], [0, 110, 0], [0, 0, 114]]",1.0,1.0
6,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",fold_2/3,3.0,42,42,"[[118, 0, 0], [0, 113, 0], [0, 0, 102]]",1.0,1.0
7,bases/df1.csv,BaggingClassifier,"{'base_estimator': 'deprecated', 'bootstrap': ...",fold_3/3,3.0,42,42,"[[106, 0, 0], [0, 110, 0], [0, 0, 117]]",1.0,1.0
8,bases/df2.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",holdout,0.3,42,42,"[[94, 0, 0], [0, 105, 0], [0, 0, 101]]",1.0,1.0
9,bases/df2.csv,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",fold_1/3,3.0,42,42,"[[112, 0, 0], [0, 113, 0], [0, 0, 109]]",1.0,1.0
