In [6]:
# Code Source From: https://towardsdatascience.com/pycaret-and-streamlit-how-to-create-and-deploy-data-science-web-app-273d205271a3
# Data From: https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Use the FiveThirtyEight matplotlib style for any figures drawn in this notebook.
plt.style.use('fivethirtyeight')

# Read the red-wine quality CSV from the local data folder and preview its
# first five rows.
wine_df = pd.read_csv(filepath_or_buffer='./data/winequality_red.csv')
wine_df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [10]:
wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [11]:
# Binarize the target: scores of 6 and above become 'Good', everything else 'Bad'.
# The column is overwritten in place, so only relabel while it still holds the
# numeric scores. Without this guard, re-running the cell after the conversion
# would compare the 'Good'/'Bad' strings against 6, which is not a valid
# comparison.
if pd.api.types.is_numeric_dtype(wine_df['quality']):
    wine_df['quality'] = np.where(wine_df['quality'] >= 6, 'Good', 'Bad')
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,Bad
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,Bad
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,Bad
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,Good
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,Bad


In [13]:
from pycaret.classification import *

In [14]:
# Baseline experiment: only the data, the target column and a fixed session_id
# are passed; every other setup option is left at its default.
classifier_setup = setup(
    data=wine_df,
    target='quality',
    session_id=2020,
)

Unnamed: 0,Description,Value
0,session_id,2020
1,Target,quality
2,Target Type,Binary
3,Label Encoded,"Bad: 0, Good: 1"
4,Original Data,"(1599, 12)"
5,Missing Values,False
6,Numeric Features,11
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [15]:
# Benchmark the candidate classifiers under the setup configured above; the
# table below ranks them. The call's return value (presumably the top-ranked
# model -- confirm against the PyCaret docs) is kept in `best`.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7998,0.8903,0.8123,0.8154,0.8125,0.5977,0.5998,0.106
catboost,CatBoost Classifier,0.7989,0.8646,0.7907,0.8292,0.8079,0.5971,0.6001,1.341
lightgbm,Light Gradient Boosting Machine,0.7846,0.862,0.7972,0.8033,0.7979,0.5672,0.571,0.041
rf,Random Forest Classifier,0.7837,0.8786,0.7858,0.8087,0.7943,0.5663,0.5702,0.138
xgboost,Extreme Gradient Boosting,0.773,0.8558,0.779,0.7955,0.7846,0.5447,0.5484,13.845
gbc,Gradient Boosting Classifier,0.7721,0.844,0.7757,0.7957,0.7846,0.5428,0.5446,0.087
ridge,Ridge Classifier,0.7525,0.0,0.7624,0.7748,0.7676,0.5028,0.5042,0.01
lr,Logistic Regression,0.7489,0.8175,0.7557,0.7728,0.7634,0.4959,0.4972,0.317
lda,Linear Discriminant Analysis,0.7489,0.8167,0.7574,0.7718,0.7635,0.4959,0.4973,0.012
nb,Naive Bayes,0.7292,0.7996,0.7327,0.7579,0.7432,0.4569,0.4596,0.011


In [16]:
# Second experiment: same data and target as before, now with the `normalize`
# and `transformation` options switched on.
# NOTE(review): session_id is 2021 here versus 2020 in the first setup, so the
# two compare_models tables differ in seed as well as in preprocessing.
classifier_setup2 = setup(
    data=wine_df,
    target='quality',
    session_id=2021,
    normalize=True,
    transformation=True,
)

Unnamed: 0,Description,Value
0,session_id,2021
1,Target,quality
2,Target Type,Binary
3,Label Encoded,"Bad: 0, Good: 1"
4,Original Data,"(1599, 12)"
5,Missing Values,False
6,Numeric Features,11
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [18]:
# Re-run the classifier benchmark, this time under the second setup
# (normalize + transformation); the ranking is shown in the table below.
# The return value (presumably the top-ranked model) is kept in `best2`.
best2 = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8052,0.8923,0.8203,0.8162,0.8162,0.6089,0.6126,0.132
rf,Random Forest Classifier,0.7989,0.881,0.8136,0.8102,0.81,0.5964,0.5998,0.142
catboost,CatBoost Classifier,0.782,0.8673,0.7932,0.7998,0.7935,0.5624,0.5674,1.29
xgboost,Extreme Gradient Boosting,0.7793,0.8484,0.7949,0.7915,0.7913,0.557,0.5601,15.027
gbc,Gradient Boosting Classifier,0.7784,0.8504,0.7881,0.795,0.7895,0.5554,0.5586,0.116
lightgbm,Light Gradient Boosting Machine,0.7757,0.8574,0.7915,0.7889,0.7884,0.5497,0.5526,0.041
qda,Quadratic Discriminant Analysis,0.7453,0.8019,0.7644,0.7551,0.7587,0.4889,0.4905,0.012
dt,Decision Tree Classifier,0.7436,0.743,0.7542,0.7617,0.756,0.4857,0.4886,0.015
ada,Ada Boost Classifier,0.7408,0.8217,0.7458,0.7634,0.7522,0.4804,0.4837,0.053
nb,Naive Bayes,0.739,0.8008,0.7085,0.7789,0.74,0.4793,0.4839,0.017


In [19]:
# Create the model registered under the id 'et' -- the Extra Trees Classifier,
# which is the top row of the comparison table above.
et_model = create_model('et')
# Open PyCaret's evaluation display for that model.
evaluate_model(et_model)

Unnamed: 0,Parameters
bootstrap,False
ccp_alpha,0.0
class_weight,
criterion,gini
max_depth,
max_features,auto
max_leaf_nodes,
max_samples,
min_impurity_decrease,0.0
min_impurity_split,


In [20]:
# Score the Extra Trees model. No data argument is passed, so this presumably
# runs on PyCaret's hold-out split (the output below has 479 rows, roughly 30%
# of the 1599 samples). The result shows a one-row metrics summary and the
# scored rows with added `Label` and `Score` columns.
predict_model(et_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.8042,0.8945,0.7887,0.8462,0.8164,0.6071,0.6089


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Label,Score
0,-2.706462,0.010633,-0.429436,-0.195550,-2.791245,0.326027,1.508685,-2.448639,1.019987,1.347832,1.843980,Good,Good,0.78
1,0.466025,-1.659935,0.962406,-0.195550,0.670013,0.696846,-0.399400,-1.013998,-0.307747,-0.564099,0.941140,Good,Good,0.81
2,0.570909,0.605422,-1.265337,-0.649846,1.692985,1.181637,1.346695,0.899116,-0.994931,0.570440,-1.076594,Bad,Bad,0.82
3,0.988681,0.183954,0.918214,0.180841,-0.429410,0.326027,0.115215,1.260245,0.561882,0.003472,-1.076594,Bad,Bad,1.00
4,1.590187,-2.388119,1.179388,1.542798,0.932440,-1.535347,-0.731366,1.515651,-0.171999,1.677392,-0.129665,Good,Good,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,-1.634880,-2.140179,0.692758,-2.167319,-0.750600,1.833554,2.183508,-3.158095,-0.375860,-0.265100,1.320264,Good,Good,0.71
476,0.945865,0.010633,-0.372900,1.146996,0.113770,-0.788070,-0.347578,0.899116,-0.512529,-0.361419,0.297983,Good,Good,0.55
477,-0.464589,0.799460,-1.517498,-0.908179,-0.196595,-0.030173,0.115215,0.585753,1.344170,-0.564099,-1.076594,Bad,Bad,0.83
478,1.487314,0.799460,-0.097694,0.485768,0.295664,0.326027,0.568023,1.362633,-1.692915,0.684374,0.723708,Good,Good,0.72


In [21]:
# Persist the Extra Trees model under the name 'extra_tree_model'. The call is
# the cell's last expression, so its return value is echoed as the output below.
save_model(et_model, model_name='extra_tree_model')

Transformation Pipeline and Model Succesfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='quality',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_stra...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight=None, criterion='gini',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nod