In [5]:

from dynapipe.utilis_func import pipeline_splitting_rule, update_parameters,reset_parameters
reset_parameters()

update_parameters(mode = "cls", estimator_name = "mlp", hidden_layer_sizes = [10],activation=["relu"],learning_rate = ["constant"],solver = ["sgd"])
update_parameters(mode = "cls", estimator_name = "svm", C=[0.1],kernel=["linear"])
update_parameters(mode = "cls", estimator_name = "ada", n_estimators =[50],learning_rate=[1])
update_parameters(mode = "cls", estimator_name = "rf", n_estimators =[50],max_depth=[2])
update_parameters(mode = "cls", estimator_name = "gb", n_estimators =[50],max_depth=[2],learning_rate=[1])
update_parameters(mode = "cls", estimator_name = "xgb", n_estimators =[50],max_depth=[2],learning_rate=[1])

from dynapipe.autoPipe import autoPipe
import pandas as pd
from dynapipe.funcPP import PPtools
from dynapipe.autoPP import dynaPreprocessing

from dynapipe.autoFS import dynaFS_clf
from dynapipe.autoCV import evaluate_model,dynaClassifier,dynaRegressor
df = pd.read_csv('./data/preprocessing/breast_cancer.csv')
custom_parameters = {
    "scaler" : ["None", "standard"],
    # threshold number of category dimension
    "encode_band" : [10],
    # low dimension encoding
    "low_encode" : ["onehot","label"], 
    # high dimension encoding
    "high_encode" : ["frequency", "mean"],
    "winsorizer" : [(0.05,0.05)],
    "sparsity" : [0.75],
    "cols" : [50]
}

Done with the parameters reset.
Previous Parameters are: {'hidden_layer_sizes': [10, 50, 100], 'activation': ['identity', 'relu', 'tanh', 'logistic'], 'learning_rate': ['constant', 'invscaling', 'adaptive'], 'solver': ['lbfgs', 'sgd', 'adam']}
Current Parameters are updated as: {'hidden_layer_sizes': [10], 'activation': ['relu'], 'learning_rate': ['constant'], 'solver': ['sgd']}
Done with the parameters update.
Previous Parameters are: {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [0.1, 1, 10]}
Current Parameters are updated as: {'C': [0.1], 'kernel': ['linear']}
Done with the parameters update.
Previous Parameters are: {'n_estimators': [50, 100, 150], 'learning_rate': [0.1, 1, 10, 100]}
Current Parameters are updated as: {'n_estimators': [50], 'learning_rate': [1]}
Done with the parameters update.
Previous Parameters are: {'n_estimators': [5, 50, 250], 'max_depth': [2, 4, 8, 16, 32]}
Current Parameters are updated as: {'n_estimators': [50], 'max_depth': [2]}
Done with the para

In [6]:
pipe = autoPipe(
[("autoPP",dynaPreprocessing(custom_parameters = custom_parameters, label_col = 'diagnosis', model_type = "cls")),
("datasets_splitting",pipeline_splitting_rule(val_size = 0.2, test_size = 0.2, random_state = 13)),
("autoFS",dynaFS_clf(fs_num = 5, random_state=13, cv = 5, in_pipeline = True, input_from_file = False)),
("autoCV",dynaClassifier(random_state = 13,cv_num = 5,in_pipeline = True, input_from_file = False)),
("model_evaluate",evaluate_model(model_type = "cls"))])

In [7]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_columns',None,'display.max_rows',None)
pd.set_option('max_colwidth', -1)

DICT_PREPROCESSING,DICT_FEATURE_SELECTION,DICT_MODELS_EVALUATION,DICT_DATA,dyna_report= pipe.fit(df)

Now in Progress - autoFS & autoCV Iteration: Estimate about 0.0 minutes left  [####################] 100.0%
The top 5 Models with Best Performance Metrics:
      Dataset Model_Name  \
1   Dataset_0  svm         
6   Dataset_0  xgb         
55  Dataset_7  xgb         
8   Dataset_1  svm         
22  Dataset_3  svm         

                                                                         Best_Parameters  \
1   [('C', 0.1), ('kernel', 'linear')]                                                     
6   [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)]   
55  [('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)]   
8   [('C', 0.1), ('kernel', 'linear')]                                                     
22  [('C', 0.1), ('kernel', 'linear')]                                                     

    Accuracy  Precision  Recall  Latency  
1   0.930     0.889      0.96    3.0      
6   0.912     0.955      0.84    2.4

In [8]:

DICT_MODELS_EVALUATION['Dataset_0']

Unnamed: 0,Model_Name,Accuracy,Precision,Recall,Latency,Best_Parameters,Dataset
0,lgr,0.877,0.875,0.84,5.0,"[('C', 1000), ('random_state', 13)]",Dataset_0
0,svm,0.93,0.889,0.96,3.0,"[('C', 0.1), ('kernel', 'linear')]",Dataset_0
0,mlp,0.596,0.583,0.28,3.0,"[('activation', 'relu'), ('hidden_layer_sizes', (10,)), ('learning_rate', 'constant'), ('random_state', 13), ('solver', 'sgd')]",Dataset_0
0,rf,0.877,0.821,0.92,11.0,"[('max_depth', 2), ('n_estimators', 50), ('random_state', 13)]",Dataset_0
0,ada,0.895,0.913,0.84,16.0,"[('learning_rate', 1), ('n_estimators', 50), ('random_state', 13)]",Dataset_0
0,gb,0.895,0.913,0.84,1.0,"[('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)]",Dataset_0
0,xgb,0.912,0.955,0.84,2.4,"[('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)]",Dataset_0


In [9]:

dyna_report

Unnamed: 0,Dataset,Model_Name,Best_Parameters,Accuracy,Precision,Recall,Latency
1,Dataset_0,svm,"[('C', 0.1), ('kernel', 'linear')]",0.93,0.889,0.96,3.0
6,Dataset_0,xgb,"[('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)]",0.912,0.955,0.84,2.4
55,Dataset_7,xgb,"[('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)]",0.912,0.955,0.84,3.0
8,Dataset_1,svm,"[('C', 0.1), ('kernel', 'linear')]",0.912,0.917,0.88,2.0
22,Dataset_3,svm,"[('C', 0.1), ('kernel', 'linear')]",0.912,0.917,0.88,4.0
9,Dataset_1,mlp,"[('activation', 'relu'), ('hidden_layer_sizes', (10,)), ('learning_rate', 'constant'), ('random_state', 13), ('solver', 'sgd')]",0.895,0.952,0.8,3.0
5,Dataset_0,gb,"[('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)]",0.895,0.913,0.84,1.0
49,Dataset_7,lgr,"[('C', 0.1), ('random_state', 13)]",0.895,0.913,0.84,3.0
50,Dataset_7,svm,"[('C', 0.1), ('kernel', 'linear')]",0.895,0.913,0.84,3.0
54,Dataset_7,gb,"[('learning_rate', 1), ('max_depth', 2), ('n_estimators', 50), ('random_state', 13)]",0.895,0.913,0.84,3.0


In [15]:
DICT_DATA['Dataset_0']['DICT_TEST']["X"].head(10)

Unnamed: 0,concavity_mean,concave points_mean,perimeter_mean,radius_mean,texture_mean
264,0.09061,0.06527,111.6,17.19,22.07
231,0.01633,0.006588,71.76,11.32,26.6
197,0.1103,0.05778,117.4,18.08,21.84
172,0.2032,0.1097,102.5,15.46,13.04
54,0.05253,0.03334,97.26,15.1,22.02
33,0.1657,0.07593,127.9,19.27,26.47
68,0.2508,0.04375,60.73,9.72,17.33
237,0.09042,0.06022,132.5,20.48,21.46
51,0.01857,0.01723,87.21,13.64,16.34
196,0.1385,0.06526,90.63,13.77,22.29


In [21]:
test = DICT_PREPROCESSING['Dataset_0']
test

"winsor_0-Scaler_None-- Encoded Features:['diagnosis', 'Size_3', 'area_mean', 'compactness_mean', 'concave points_mean', 'concavity_mean', 'fractal_dimension_mean', 'perimeter_mean', 'radius_mean', 'smoothness_mean', 'symmetry_mean', 'texture_mean', 'onehot_Age_20-29', 'onehot_Age_30-39', 'onehot_Age_40-49', 'onehot_Age_50-59', 'onehot_Age_60-69', 'onehot_Age_70-79', 'Label_Position_1', 'onehot_Position_2_NaN', 'onehot_Position_2_central', 'onehot_Position_2_left_low', 'onehot_Position_2_left_up', 'onehot_Position_2_right_low', 'onehot_Position_2_right_up', 'Frequency_Size_1', 'onehot_Size_2_0-2', 'onehot_Size_2_12-14', 'onehot_Size_2_15-17', 'onehot_Size_2_24-26', 'onehot_Size_2_3-5', 'onehot_Size_2_6-8', 'onehot_Size_2_9-11', 'Label_Treatment', 'onehot_Type_1_ge40', 'onehot_Type_1_lt40', 'onehot_Type_1_premeno', 'onehot_Type_2_NaN', 'onehot_Type_2_no', 'onehot_Type_2_yes', 'Label_Type_3']"

In [44]:
import re
import pandas as pd

columns = ["Dataset","Encode_low_dimension","Encode_high_dimension","Winsorize","Scale"]
df_pp = pd.DataFrame(columns=columns)

for i in list(DICT_PREPROCESSING.keys()):
    row_pp = [i]
    s = DICT_PREPROCESSING[i]
    ext = re.search("Encoded Features:(.*)']", s).group(1)
    if "onehot_" in ext:
        row_pp.append('onehot')
    elif "label_" in ext:
        row_pp.append('label')
    else:
        row_pp.append('None')

    if "frequency_" in ext:
        row_pp.append('frequency')
    elif "mean_" in ext:
        row_pp.append('mean')
    else:
        row_pp.append('None')

    row_pp.append(re.search('winsor_(.*)-Scaler', s).group(1))
    row_pp.append(re.search('winsor_0-Scaler_(.*)-- ', s).group(1))
    df_pp.loc[len(df_pp)] = row_pp

df_report_Accuracy = df_pp.merge(dyna_report[['Dataset','Accuracy']], how = 'left', on = 'Dataset')
bins = [0, 0.70, 0.90, 1]
labels = ["Low Accuracy","High Accuracy","Top Accuracy"]
df_report_Accuracy['Level'] = pd.cut(df_report_Accuracy['Accuracy'], bins=bins, labels=labels)
df_report_Accuracy.head(3)

df_report_Precision = df_pp.merge(dyna_report[['Dataset','Precision']], how = 'left', on = 'Dataset')
bins = [0, 0.70, 0.90, 1]
labels = ["Low Precision","High Precision","Top Precision"]
df_report_Precision['Level'] = pd.cut(df_report_Precision['Precision'], bins=bins, labels=labels)
df_report_Precision.head(3)

df_report_Recall = df_pp.merge(dyna_report[['Dataset','Recall']], how = 'left', on = 'Dataset')
bins = [0, 0.70, 0.90, 1]
labels = ["Low Recall","High Recall","Top Recall"]
df_report_Recall['Level'] = pd.cut(df_report_Recall['Recall'], bins=bins, labels=labels)
df_report_Recall.head(3)


Unnamed: 0,Dataset,Encode_low_dimension,Encode_high_dimension,Winsorize,Scale,Accuracy,Level
0,Dataset_0,onehot,,0,,0.93,Top Accuracy
1,Dataset_0,onehot,,0,,0.912,Top Accuracy
2,Dataset_0,onehot,,0,,0.895,High Accuracy


Unnamed: 0,Dataset,Encode_low_dimension,Encode_high_dimension,Winsorize,Scale,Precision,Level
0,Dataset_0,onehot,,0,,0.889,High Precision
1,Dataset_0,onehot,,0,,0.955,Top Precision
2,Dataset_0,onehot,,0,,0.913,Top Precision


Unnamed: 0,Dataset,Encode_low_dimension,Encode_high_dimension,Winsorize,Scale,Recall,Level
0,Dataset_0,onehot,,0,,0.96,Top Recall
1,Dataset_0,onehot,,0,,0.84,High Recall
2,Dataset_0,onehot,,0,,0.84,High Recall


In [39]:
import plotly.graph_objects as go

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = ["A1", "A2", "B1", "B2", "C1", "C2"],
      color = "blue"
    ),
    link = dict(
      source = [0, 1, 0, 2, 3, 3], # indices correspond to labels, eg A1, A2, A2, B1, ...
      target = [2, 3, 3, 4, 4, 5],
      value = [8, 4, 2, 8, 4, 2]
  ))])

fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)

'pip' is not recognized as an internal or external command,
operable program or batch file.


In [13]:
a = {'learning_rate': 1, 'max_depth': 2, 'n_estimators': 50, 'random_state': 13}
lis = a.items()
[i for i in lis]

[('learning_rate', 1),
 ('max_depth', 2),
 ('n_estimators', 50),
 ('random_state', 13)]