## Modelling

### Install PyCaret and other dependencies

In [None]:
# If running on Colab, install pycaret and the other dependencies.
# `--pre` allows pip to select a pre-release build of pycaret.
# NOTE(review): none of the packages are version-pinned, so a fresh run may
# resolve different releases than the ones that produced the recorded outputs.
%pip install --pre pycaret
%pip install xgboost
%pip install deepchecks



### Mount Google Drive (Colab)

In [12]:
# Expose Google Drive inside the Colab runtime so files stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'  # directory under which Drive is mounted
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Imports

In [13]:
import numpy as np
import pandas as pd

### Load Dataset

In [14]:
# Read the cleaned dataset from the mounted Drive and preview the first rows.
DATASET_PATH = "/content/drive/MyDrive/Datasets/cleaned_data.csv"
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,relative_velocity,miss_distance,is_hazardous
0,19.14,0.394962,0.883161,71745.401048,58143620.0,0
1,18.5,0.530341,1.185878,109949.757148,55801050.0,1
2,21.45,0.136319,0.304818,24865.506798,67206890.0,0
3,20.63,0.198863,0.444672,78890.076805,30396440.0,0
4,22.7,0.076658,0.171412,56036.519484,63118630.0,0


## Feature Engineering
Removing the less important feature `estimated_diameter_min`.

In [15]:
# Build the modelling frame without `estimated_diameter_min`.
# `drop` returns a new frame, so `df` keeps all of its original columns.
dropped_columns = ['estimated_diameter_min']
new_df = df.drop(columns=dropped_columns)
new_df.head()

Unnamed: 0,absolute_magnitude,estimated_diameter_max,relative_velocity,miss_distance,is_hazardous
0,19.14,0.883161,71745.401048,58143620.0,0
1,18.5,1.185878,109949.757148,55801050.0,1
2,21.45,0.304818,24865.506798,67206890.0,0
3,20.63,0.444672,78890.076805,30396440.0,0
4,22.7,0.171412,56036.519484,63118630.0,0


Log-transforming the values of the `miss_distance` column with `log1p`, i.e. $\log(1 + x)$.

In [16]:
# Log-transform `miss_distance` with log1p, i.e. log(1 + x).
# The input is read from the untouched `df` rather than from `new_df` itself, so
# re-running this cell yields the same values instead of applying the log twice.
# `assign` returns a new frame with the column replaced in its existing position.
new_df = new_df.assign(miss_distance=np.log1p(df['miss_distance']))
new_df

Unnamed: 0,absolute_magnitude,estimated_diameter_max,relative_velocity,miss_distance,is_hazardous
0,19.140,0.883161,71745.401048,17.878427,0
1,18.500,1.185878,109949.757148,17.837303,1
2,21.450,0.304818,24865.506798,18.023286,0
3,20.630,0.444672,78890.076805,17.229836,0
4,22.700,0.171412,56036.519484,17.960526,0
...,...,...,...,...,...
338166,28.580,0.011430,56646.985988,17.975416,0
338167,28.690,0.010865,21130.768947,17.199522,0
338168,21.919,0.245607,11832.041031,17.794459,0
338169,23.887,0.099229,56198.382733,15.461231,0


# Set up a new experiment

The only required parameters are 'data' and 'target'.

However, it is good practice to name your experiment. You can also specify a `session_id`; if you don't, a random seed is generated by default and returned in the Information grid. This number is then used as the seed in all functions called during the experiment, which makes the entire experiment reproducible later.

In [17]:
# Initialise the PyCaret classification experiment on the engineered frame.
# The star import is presumably what brings setup(), models(), compare_models(),
# create_model() and tune_model() into the notebook namespace for the cells below.
from pycaret.classification import *

exp_name = setup(
    data=new_df,
    target='is_hazardous',
    session_id=123,  # fixed seed so the experiment can be reproduced
    log_experiment=False,
    experiment_name='exp_1',
    use_gpu=False,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,is_hazardous
2,Target type,Binary
3,Original data shape,"(338171, 5)"
4,Transformed data shape,"(338171, 5)"
5,Transformed train set shape,"(236719, 5)"
6,Transformed test set shape,"(101452, 5)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


### Automatically compare models
Get the list of models available in the library, together with the IDs used to refer to them in other functions.


In [18]:
# Show the model library: one row per model ID with its name, reference class and Turbo flag.
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


This function trains all the models available in the model library and scores them using cross-validation. The output of this function is a scoring grid with average cross-validated scores.

In [19]:
# compare models
# Train and cross-validate the library models, keeping the top-ranked one in `best`.
# NOTE(review): PyCaret documents `exclude` as a list of model IDs — confirm that a
# bare string is handled as intended, or pass ['catboost'].
best = compare_models(exclude='catboost') # catboost will take too long to train.

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9154,0.9464,0.5649,0.7125,0.6301,0.5831,0.5881,20.775
rf,Random Forest Classifier,0.912,0.9432,0.5365,0.7036,0.6087,0.5601,0.5667,58.853
xgboost,Extreme Gradient Boosting,0.8919,0.9129,0.2636,0.705,0.3837,0.3377,0.3874,1.771
lightgbm,Light Gradient Boosting Machine,0.8886,0.9067,0.2073,0.7217,0.322,0.2811,0.3472,4.839
gbc,Gradient Boosting Classifier,0.8856,0.8955,0.1599,0.7413,0.263,0.228,0.3094,48.555
dt,Decision Tree Classifier,0.8852,0.7458,0.5586,0.5497,0.5541,0.4882,0.4883,1.528
ada,Ada Boost Classifier,0.8838,0.8856,0.1537,0.707,0.2525,0.2168,0.2933,10.698
lr,Logistic Regression,0.8758,0.8411,0.0925,0.5863,0.1598,0.1295,0.1971,3.673
ridge,Ridge Classifier,0.8724,0.8355,0.0003,0.425,0.0006,0.0004,0.0082,0.241
dummy,Dummy Classifier,0.8724,0.5,0.0,0.0,0.0,0.0,0.0,0.19


Processing:   0%|          | 0/33 [00:00<?, ?it/s]

### Manually train most promising models with custom parameters

Train some of the best models again. With the default configuration, the accuracy will be the same as that obtained with the `compare_models` function. However, here it is possible to provide custom hyperparameters to improve the model.

In [20]:
# Fit an Extra Trees classifier ('et') with 10-fold cross-validation.
# return_train_score=True adds the CV-Train rows to the score grid, which
# allows checking for overfitting.
et = create_model(
    'et',
    fold=10,
    return_train_score=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CV-Train,9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [21]:
# Print the fitted estimator to inspect the hyperparameters it was built with.
print(et)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     monotonic_cst=None, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=123, verbose=0,
                     warm_start=False)


In [22]:
# Fit a Random Forest ('rf') with 10-fold cross-validation, using 100 trees
# capped at depth 6.
# return_train_score=True adds the CV-Train rows to the score grid, which
# allows checking for overfitting.
rf = create_model(
    'rf',
    fold=10,
    n_estimators=100,
    max_depth=6,
    return_train_score=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8848,0.8905,0.1434,0.7565,0.241,0.2089,0.2967
CV-Train,1,0.8848,0.8905,0.146,0.7513,0.2445,0.2117,0.2981
CV-Train,2,0.885,0.8898,0.1466,0.7537,0.2454,0.2127,0.2993
CV-Train,3,0.8849,0.8903,0.1435,0.7605,0.2414,0.2094,0.2979
CV-Train,4,0.8848,0.8896,0.146,0.7506,0.2445,0.2117,0.2979
CV-Train,5,0.8852,0.8901,0.1491,0.7553,0.249,0.216,0.3024
CV-Train,6,0.885,0.8902,0.1448,0.7586,0.2432,0.2109,0.2988
CV-Train,7,0.8849,0.89,0.1461,0.7527,0.2447,0.212,0.2986
CV-Train,8,0.8851,0.8904,0.1468,0.7559,0.2459,0.2132,0.3002
CV-Train,9,0.8851,0.8901,0.1476,0.7564,0.247,0.2143,0.3012


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [23]:
# Print the fitted estimator to confirm the custom n_estimators / max_depth were applied.
print(rf)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=6, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       monotonic_cst=None, n_estimators=100, n_jobs=-1,
                       oob_score=False, random_state=123, verbose=0,
                       warm_start=False)


In [24]:
# Fit an XGBoost classifier ('xgboost') with 10-fold cross-validation and
# trees capped at depth 4.
# return_train_score=True adds the CV-Train rows to the score grid, which
# allows checking for overfitting.
xgb = create_model(
    'xgboost',
    fold=10,
    max_depth=4,
    return_train_score=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8909,0.9124,0.2241,0.7404,0.3441,0.3028,0.3682
CV-Train,1,0.8907,0.9126,0.2258,0.7329,0.3452,0.3033,0.367
CV-Train,2,0.8906,0.9116,0.225,0.7328,0.3443,0.3025,0.3663
CV-Train,3,0.8901,0.9109,0.2166,0.7367,0.3347,0.2938,0.3604
CV-Train,4,0.8911,0.9128,0.232,0.7322,0.3524,0.31,0.372
CV-Train,5,0.8909,0.9128,0.2279,0.7331,0.3477,0.3057,0.3688
CV-Train,6,0.891,0.9123,0.2314,0.7304,0.3514,0.309,0.3708
CV-Train,7,0.8904,0.9116,0.2244,0.7305,0.3434,0.3015,0.365
CV-Train,8,0.8914,0.9125,0.2324,0.7368,0.3533,0.3111,0.3739
CV-Train,9,0.8909,0.9122,0.2291,0.7328,0.3491,0.307,0.3698


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [25]:
# Print the fitted estimator to confirm the custom max_depth was applied.
print(xgb)

XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device='cpu', early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=-1,
              num_parallel_tree=None, objective='binary:logistic', ...)


### Automatically optimize models

This function tunes the hyperparameters of the model. The output of this function is a scoring grid with cross-validated scores by fold. The best model is selected based on the metric defined in the `optimize` parameter.

In [26]:
# Search for better hyperparameters starting from the XGBoost model above.
# n_iter is the number of candidate models to be tested; return_train_score=True
# adds the CV-Train rows to the score grid.
tuned_xgb = tune_model(
    xgb,
    n_iter=100,
    return_train_score=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Split,Fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CV-Train,0,0.8953,0.9221,0.278,0.7393,0.4041,0.3594,0.4116
CV-Train,1,0.896,0.9233,0.2937,0.7301,0.4188,0.3729,0.42
CV-Train,2,0.8957,0.9218,0.2834,0.7374,0.4095,0.3644,0.4151
CV-Train,3,0.895,0.9209,0.2765,0.7358,0.4019,0.3571,0.4091
CV-Train,4,0.8965,0.9222,0.2902,0.7414,0.4171,0.372,0.4218
CV-Train,5,0.8959,0.9225,0.2847,0.739,0.411,0.366,0.4166
CV-Train,6,0.896,0.9233,0.287,0.7388,0.4134,0.3683,0.4183
CV-Train,7,0.8961,0.9227,0.2939,0.732,0.4194,0.3736,0.4209
CV-Train,8,0.8967,0.9231,0.2933,0.7409,0.4202,0.375,0.424
CV-Train,9,0.8964,0.9226,0.2869,0.7447,0.4142,0.3695,0.4205


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [27]:
# Compare the hyperparameters: the default model is printed first, then the
# model returned by tune_model.
# NOTE: the recorded tune_model output reports that the original model was
# returned, so the two printouts may show the same estimator.
print(xgb, tuned_xgb, sep='\n')

XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device='cpu', early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=-1,
              num_parallel_tree=None, objective='binary:logistic', ...)
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device='cpu', early_stopping_rounds=None,
              enable_catego