## Modelling

### Install Pycaret

In [None]:
# if on Colab, install pycaret and other dependencies
%pip install --pre pycaret
%pip install xgboost
%pip install shap
%pip install explainerdashboard
%pip install deepchecks

Collecting explainerdashboard
  Downloading explainerdashboard-0.4.7-py3-none-any.whl.metadata (3.8 kB)
Collecting dash-auth (from explainerdashboard)
  Downloading dash_auth-2.3.0-py3-none-any.whl.metadata (10 kB)
Collecting dash-bootstrap-components>=1 (from explainerdashboard)
  Downloading dash_bootstrap_components-1.6.0-py3-none-any.whl.metadata (5.2 kB)
Collecting dtreeviz>=2.1 (from explainerdashboard)
  Downloading dtreeviz-2.2.2-py3-none-any.whl.metadata (2.4 kB)
Collecting flask-simplelogin (from explainerdashboard)
  Downloading flask_simplelogin-0.2.0-py3-none-any.whl.metadata (3.3 kB)
Collecting Flask-WTF>=1.1 (from explainerdashboard)
  Downloading flask_wtf-1.2.2-py3-none-any.whl.metadata (3.4 kB)
Collecting jupyter-dash>=0.4.1 (from explainerdashboard)
  Downloading jupyter_dash-0.4.2-py3-none-any.whl.metadata (3.6 kB)
Collecting oyaml (from explainerdashboard)
  Downloading oyaml-1.0-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting waitress (from explainerdashboard)
 

### Mount Google Drive (Colab)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be read by path (the dataset is loaded from there below).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Imports

In [None]:
import numpy as np
import pandas as pd

### Load Dataset

In [None]:
# Read the cleaned dataset from the mounted Drive folder, then preview the
# first rows as the cell output.
df = pd.read_csv(
    "/content/drive/MyDrive/Datasets/cleaned_data.csv",
)
df.head()

Unnamed: 0,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,relative_velocity,miss_distance,is_hazardous
0,19.14,0.394962,0.883161,71745.401048,58143620.0,0
1,18.5,0.530341,1.185878,109949.757148,55801050.0,1
2,21.45,0.136319,0.304818,24865.506798,67206890.0,0
3,20.63,0.198863,0.444672,78890.076805,30396440.0,0
4,22.7,0.076658,0.171412,56036.519484,63118630.0,0


### Set up a new experiment

The only required parameters are 'data' and 'target'.

However, it is good practice to name your experiment. You can also specify a `session_id`; if you don't, a random seed is generated and shown in the information grid. This number is then used as the seed in all functions called during the experiment, which makes the entire experiment reproducible later.

In [None]:
# Initialise the PyCaret classification experiment on the loaded DataFrame.
# NOTE(review): the wildcard import is what brings setup() and the other PyCaret
# helpers called in later cells into scope; it is kept as-is because cells
# outside this view may rely on names it provides.
from pycaret.classification import *

# NOTE(review): despite its name, exp_name presumably receives whatever setup()
# returns rather than the experiment's name string -- confirm before reusing it.
exp_name = setup(
    data=df,
    target='is_hazardous',
    session_id=123,            # fixed seed so the experiment can be reproduced
    log_experiment=False,
    experiment_name='exp_1',
    use_gpu=False,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,is_hazardous
2,Target type,Binary
3,Original data shape,"(338171, 6)"
4,Transformed data shape,"(338171, 6)"
5,Transformed train set shape,"(236719, 6)"
6,Transformed test set shape,"(101452, 6)"
7,Numeric features,5
8,Preprocess,True
9,Imputation type,simple


### Automatically compare models
Get the list of models available in the library, along with the IDs used to refer to them in other functions.


In [None]:
# Display the table of available estimators; the ID column holds the short
# identifiers (e.g. 'et', 'rf') that are passed to create_model() below.
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


`compare_models` trains all the models available in the model library and scores them using cross-validation. Its output is a scoring grid with the average cross-validated scores.

In [None]:
# compare models
# `exclude` is documented as a list of model IDs, so pass a list rather than a
# bare string (a string would be treated as a sequence of characters wherever
# the library iterates over it).
best = compare_models(exclude=['catboost'])  # catboost will take too long to train.

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9166,0.9472,0.5891,0.7084,0.6432,0.5965,0.5997,19.059
rf,Random Forest Classifier,0.9139,0.9452,0.563,0.7032,0.6253,0.5773,0.5819,53.739
xgboost,Extreme Gradient Boosting,0.8918,0.9132,0.2648,0.7017,0.3845,0.3382,0.387,2.072
lightgbm,Light Gradient Boosting Machine,0.8886,0.9067,0.2066,0.7216,0.3212,0.2803,0.3467,9.866
gbc,Gradient Boosting Classifier,0.8856,0.8956,0.16,0.7412,0.2631,0.2282,0.3095,57.364
dt,Decision Tree Classifier,0.885,0.7451,0.5572,0.549,0.553,0.487,0.4871,2.023
ada,Ada Boost Classifier,0.8838,0.8856,0.1537,0.707,0.2525,0.2168,0.2933,12.203
dummy,Dummy Classifier,0.8724,0.5,0.0,0.0,0.0,0.0,0.0,0.127
ridge,Ridge Classifier,0.8723,0.8347,0.0,0.0333,0.0001,-0.0001,-0.0014,0.146
qda,Quadratic Discriminant Analysis,0.8721,0.8512,0.0042,0.442,0.0083,0.0058,0.0287,0.173


Processing:   0%|          | 0/33 [00:00<?, ?it/s]

### Manually train most promising models with custom parameters

Retrain some of the best models. With the default configuration, the scores will match those obtained with the `compare_models` function. However, here it is possible to provide custom hyperparameters to improve the model.

In [None]:
# Cross-validate an Extra Trees classifier without custom hyperparameters.
# return_train_score=True adds the CV-Train rows to the scoring grid, so train
# and validation metrics can be compared to check for overfitting.
et = create_model(
    'et',
    fold=10,
    return_train_score=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Print the estimator returned by create_model('et') to inspect its hyperparameters.
print(et)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     monotonic_cst=None, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=123, verbose=0,
                     warm_start=False)


In [None]:
# Cross-validate a Random Forest with custom hyperparameters: 100 trees and a
# maximum depth of 6.
# return_train_score=True adds the CV-Train rows to the scoring grid, so train
# and validation metrics can be compared to check for overfitting.
rf = create_model(
    'rf',
    fold=10,
    n_estimators=100,
    max_depth=6,
    return_train_score=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8847,0.8904,0.1406,0.7634,0.2375,0.2059,0.2956
CV-Train,1,0.8849,0.8903,0.1443,0.7571,0.2424,0.2101,0.2979
CV-Train,2,0.885,0.8902,0.1473,0.753,0.2464,0.2136,0.2999
CV-Train,3,0.8848,0.8903,0.1415,0.7619,0.2387,0.207,0.2962
CV-Train,4,0.8847,0.8902,0.1435,0.7535,0.2411,0.2088,0.2961
CV-Train,5,0.8851,0.8904,0.1467,0.7586,0.2459,0.2134,0.3008
CV-Train,6,0.885,0.8904,0.1456,0.7563,0.2442,0.2117,0.299
CV-Train,7,0.8848,0.8908,0.1451,0.7533,0.2433,0.2108,0.2977
CV-Train,8,0.8849,0.8905,0.1438,0.7579,0.2418,0.2096,0.2976
CV-Train,9,0.885,0.8901,0.1453,0.7579,0.2438,0.2114,0.2991


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Print the estimator returned by create_model('rf') to confirm the custom
# hyperparameters (n_estimators, max_depth) were applied.
print(rf)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=6, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       monotonic_cst=None, n_estimators=100, n_jobs=-1,
                       oob_score=False, random_state=123, verbose=0,
                       warm_start=False)


In [None]:
# Cross-validate an XGBoost classifier with the tree depth set to 4.
# return_train_score=True adds the CV-Train rows to the scoring grid, so train
# and validation metrics can be compared to check for overfitting.
xgb = create_model(
    'xgboost',
    fold=10,
    max_depth=4,
    return_train_score=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8903,0.9115,0.2172,0.7399,0.3359,0.295,0.3621
CV-Train,1,0.8903,0.9114,0.2215,0.7331,0.3403,0.2987,0.3634
CV-Train,2,0.8905,0.9119,0.2219,0.7359,0.341,0.2996,0.3647
CV-Train,3,0.8899,0.9108,0.216,0.7334,0.3337,0.2927,0.3588
CV-Train,4,0.891,0.9127,0.2286,0.7338,0.3486,0.3065,0.3696
CV-Train,5,0.8911,0.9121,0.2272,0.7385,0.3475,0.3058,0.3701
CV-Train,6,0.8909,0.9122,0.2294,0.7309,0.3492,0.3069,0.3693
CV-Train,7,0.8901,0.9112,0.2178,0.7337,0.3359,0.2947,0.3605
CV-Train,8,0.8911,0.9127,0.2323,0.7308,0.3525,0.31,0.3717
CV-Train,9,0.8913,0.9126,0.2285,0.7394,0.3491,0.3074,0.3715


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Print the estimator returned by create_model('xgboost') to confirm the custom
# max_depth was applied.
print(xgb)

XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device='cpu', early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=-1,
              num_parallel_tree=None, objective='binary:logistic', ...)


### Automatically optimize models

`tune_model` tunes the hyperparameters of a model. Its output is a scoring grid with cross-validated scores by fold. The best model is selected based on the metric defined in the `optimize` parameter.

In [None]:
# Search for better hyperparameters for the Extra Trees model trained above.
# n_iter is the number of candidate models to be tested; return_train_score=True
# adds the CV-Train rows to the scoring grid.
tuned_et = tune_model(
    et,
    n_iter=100,
    return_train_score=True,
)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 10.3 µs


Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8864,0.9054,0.1566,0.7716,0.2603,0.227,0.3147
CV-Train,1,0.8867,0.9052,0.1599,0.7713,0.2649,0.2312,0.318
CV-Train,2,0.8865,0.9061,0.155,0.7784,0.2585,0.2257,0.3149
CV-Train,3,0.8867,0.9051,0.1562,0.78,0.2603,0.2274,0.3167
CV-Train,4,0.8864,0.9061,0.1559,0.7715,0.2594,0.2262,0.314
CV-Train,5,0.8868,0.9063,0.1585,0.7782,0.2633,0.2301,0.3185
CV-Train,6,0.8867,0.9063,0.1571,0.7782,0.2615,0.2284,0.3171
CV-Train,7,0.8865,0.9055,0.1591,0.7668,0.2635,0.2297,0.3159
CV-Train,8,0.8866,0.9059,0.1562,0.7786,0.2602,0.2273,0.3163
CV-Train,9,0.8872,0.9061,0.1637,0.7762,0.2703,0.2364,0.3232


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [None]:
# compare the hyperparameters of the default and the tuned estimator
# NOTE(review): when tuning does not beat the original model, tune_model reports
# that the original is returned; in that case both prints show identical
# hyperparameters -- check the message in the tune_model output.
print(et) # default model
print(tuned_et) # tuned model

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     monotonic_cst=None, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=123, verbose=0,
                     warm_start=False)
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     monotonic_cst=None, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=123, ve