In [None]:
# Mount Google Drive at /gdrive so later cells can read and write files under it.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
!pip install pycaret --quiet

[K     |████████████████████████████████| 288 kB 5.2 MB/s 
[K     |████████████████████████████████| 261 kB 51.6 MB/s 
[K     |████████████████████████████████| 167 kB 61.2 MB/s 
[K     |████████████████████████████████| 56 kB 4.1 MB/s 
[K     |████████████████████████████████| 1.3 MB 57.9 MB/s 
[K     |████████████████████████████████| 113 kB 57.3 MB/s 
[K     |████████████████████████████████| 2.0 MB 29.5 MB/s 
[K     |████████████████████████████████| 1.7 MB 65.7 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 15.5 MB 636 kB/s 
[K     |████████████████████████████████| 6.8 MB 49.2 MB/s 
[K     |████████████████████████████████| 86 kB 4.9 MB/s 
[K     |████████████████████████████████| 62 kB 715 kB/s 
[K     |████████████████████████████████| 596 kB 44

In [None]:
import os
import pandas as pd
import numpy as np

from pycaret.classification import *
from time import time
from sklearn.preprocessing import StandardScaler

# Make the competition folder on the mounted drive the working directory;
# the data-loading cell builds its dataset path from os.getcwd().
os.chdir('/gdrive/MyDrive/Dacon/Wine/')

In [None]:
# Folder holding the competition CSVs, relative to the current working directory.
# NOTE: the name `dir` shadows the builtin; it is kept because the submission
# cell at the end of the notebook reads it.
dir = f"{os.getcwd()}/Dataset/"

# Load the raw train and test tables from that folder.
train, test = (pd.read_csv(dir + csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
class Pipeline:
    """Preprocessing steps applied to a wine data frame before modelling.

    ``fit`` runs, in order: drop the ``id`` column, log-transform a fixed set of
    columns, standardize every column except ``type`` / ``quality``, and map
    ``type`` to 1 (red) / 0 (white).

    Attributes:
        data: the working copy of the frame; each step updates it.
        scaler_: the scaler object used by the last ``scaler()`` call, or
            ``None`` before it has run. Pass it to another instance's ``fit``
            to standardize a second frame with the same statistics.
    """

    # Columns replaced by their natural logarithm in ``logarithmic``.
    LOG_COLS = ['fixed acidity', 'chlorides', 'sulphates', 'residual sugar', 'volatile acidity']
    # Columns that are never standardized (categorical flag and the target).
    UNSCALED_COLS = ('type', 'quality')

    def __init__(self, data):
        """Store a private copy of ``data`` so no step can modify the caller's frame."""
        self.data = data.copy()
        self.scaler_ = None

    def drop_cols(self):
        """Remove the ``id`` column. Raises ``KeyError`` if it is not present."""
        self.data = self.data.drop(columns=['id'])

    def onehot(self):
        """Encode ``type`` as an integer flag: 'red' -> 1, 'white' -> 0.

        Values outside the mapping become NaN (behaviour of ``Series.map``).
        """
        mapping = {
            'red' : 1, 'white': 0
        }
        self.data['type'] = self.data['type'].map(mapping)

    def logarithmic(self):
        """Replace each column in ``LOG_COLS`` with its natural logarithm."""
        cols = list(self.LOG_COLS)
        # Plain column assignment replaces the columns outright instead of
        # writing into the existing ones through ``.loc``.
        self.data[cols] = np.log(self.data[cols])

    def scaler(self, fitted_scaler=None):
        """Standardize every column except ``type`` and ``quality``.

        Args:
            fitted_scaler: optional, already-fitted object exposing
                ``transform``. When given, it is applied as-is (e.g. to scale a
                test frame with the training statistics). When ``None``, a new
                ``StandardScaler`` is fitted on this frame, as before.
        """
        # Select the feature columns explicitly instead of relying on a bare
        # ``except`` to detect that ``quality`` is absent (as in a test frame).
        feature_cols = [col for col in self.data.columns if col not in self.UNSCALED_COLS]
        features = self.data[feature_cols]

        if fitted_scaler is None:
            fitted_scaler = StandardScaler()
            scaled = fitted_scaler.fit_transform(features)
        else:
            scaled = fitted_scaler.transform(features)

        self.scaler_ = fitted_scaler
        self.data[feature_cols] = scaled

    def fit(self, fitted_scaler=None):
        """Run all preprocessing steps and return the processed frame.

        Args:
            fitted_scaler: forwarded to ``scaler``; leave as ``None`` to fit a
                fresh ``StandardScaler`` on this frame.

        Returns:
            The processed ``DataFrame`` (also available as ``self.data``).
        """
        # Call through ``self`` so subclasses can override individual steps.
        self.drop_cols()
        self.logarithmic()
        self.scaler(fitted_scaler)
        self.onehot()

        return self.data

In [None]:
# Preprocess the training frame. This rebinds `train` to the processed result, so
# re-running this cell without reloading the CSV fails: the 'id' column that
# Pipeline.drop_cols() removes is already gone.
train = Pipeline(train).fit()

In [None]:
# Apply the same steps to the test frame. NOTE(review): called this way,
# Pipeline.scaler() fits a fresh StandardScaler on the frame it is given, so the
# test data is standardized with its own statistics, not the training ones.
test = Pipeline(test).fit()

In [None]:
# Initialize the PyCaret classification experiment on the preprocessed training frame.
# session_id pins the random seed so the experiment is reproducible; 8037 is the
# seed PyCaret picked at random in the recorded run (see the setup summary output).
clf = setup(train, target = 'quality',
            session_id=8037,
            remove_outliers=True, outliers_threshold=0.05,  # outlier threshold: 0.025 (2.5%) at each tail
            fold=10, fold_shuffle=True, n_jobs=-1,
            feature_selection=True)

Unnamed: 0,Description,Value
0,session_id,8037
1,Target,quality
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(3231, 13)"
5,Missing Values,False
6,Numeric Features,11
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [None]:
# Cross-validate the available classifiers with 5 folds, rank them by accuracy
# (metrics rounded to 3 decimals) and keep the best five.
# NOTE(review): `top5_models` is not referenced again in the cells shown here;
# the following cell re-creates the models from a hard-coded list of IDs.
top5_models = compare_models(fold = 5, round = 3, sort = 'Accuracy', n_select = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.62,0.804,0.407,0.617,0.603,0.401,0.406,0.652
rf,Random Forest Classifier,0.615,0.794,0.397,0.619,0.596,0.391,0.397,0.796
lightgbm,Light Gradient Boosting Machine,0.589,0.77,0.394,0.586,0.575,0.355,0.359,0.526
gbc,Gradient Boosting Classifier,0.579,0.758,0.381,0.569,0.563,0.338,0.342,1.814
lda,Linear Discriminant Analysis,0.564,0.728,0.342,0.534,0.543,0.311,0.315,0.026
lr,Logistic Regression,0.561,0.729,0.327,0.531,0.535,0.296,0.303,0.218
qda,Quadratic Discriminant Analysis,0.553,0.726,0.367,0.534,0.541,0.318,0.319,0.026
ridge,Ridge Classifier,0.549,0.0,0.296,0.515,0.502,0.261,0.274,0.022
knn,K Neighbors Classifier,0.546,0.699,0.344,0.524,0.531,0.294,0.295,0.152
dt,Decision Tree Classifier,0.522,0.642,0.394,0.523,0.522,0.285,0.285,0.032


In [None]:
# IDs of the five models that topped the accuracy leaderboard above.
model_name = ['et', 'rf', 'lightgbm', 'gbc', 'lda']

# For each ID: train a baseline with 5-fold CV, then tune it for accuracy.
# choose_better=True is passed so tune_model can fall back to the baseline.
models = {
    name: tune_model(
        create_model(name, fold=5),
        fold=5,
        optimize='Accuracy',
        choose_better=True,
    )
    for name in model_name
}

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.5581,0.7303,0.36,0.5442,0.5424,0.3045,0.3078
1,0.5744,0.7347,0.3369,0.5341,0.5498,0.3253,0.3301
2,0.593,0.7426,0.3615,0.5557,0.5709,0.3584,0.3621
3,0.5478,0.719,0.3351,0.5229,0.5282,0.287,0.29
4,0.5548,0.7156,0.3329,0.5225,0.5337,0.2923,0.2961
Mean,0.5656,0.7284,0.3453,0.5359,0.545,0.3135,0.3172
SD,0.0162,0.0099,0.0127,0.0127,0.0149,0.026,0.0263


In [None]:
# Blend the tuned models into a single ensemble using method='soft',
# with accuracy as the metric to optimize.
model_voting = blend_models(estimator_list = list(models.values()),
                            method = 'soft', optimize='Accuracy')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6512,0.8242,0.4476,0.6415,0.636,0.4501,0.456
1,0.6512,0.7938,0.4204,0.6265,0.6322,0.4538,0.459
2,0.6698,0.8383,0.4275,0.6639,0.6495,0.4802,0.4856
3,0.6326,0.8192,0.4315,0.614,0.6129,0.4222,0.4273
4,0.6465,0.8267,0.429,0.6184,0.6312,0.4554,0.4566
5,0.6512,0.8229,0.441,0.6556,0.6365,0.4523,0.4563
6,0.614,0.8018,0.3568,0.5718,0.5878,0.3871,0.3926
7,0.6465,0.8109,0.4712,0.6637,0.6367,0.4434,0.4484
8,0.5981,0.7799,0.3942,0.5912,0.5849,0.363,0.3698
9,0.6075,0.7821,0.3681,0.5743,0.5864,0.3772,0.3816


In [None]:
# Open PyCaret's interactive plot selector for the blended model.
evaluate_model(model_voting)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [None]:
# Produce the final model from the blended ensemble. Per the PyCaret documentation,
# finalize_model refits the estimator on the whole dataset, hold-out split included.
final_model = finalize_model(model_voting)

In [None]:
# Score the hold-out split with the pre-finalize ensemble. `final_model` has been
# refit on all rows (hold-out included), so evaluating it here reports perfect
# metrics (1.0 across the board) that say nothing about generalization.
prediction = predict_model(model_voting, raw_score=True)
prediction.head()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,residual sugar,volatile acidity,free sulfur dioxide,chlorides,total sulfur dioxide,density,alcohol,sulphates,pH,citric acid,quality,Label,Score_0,Score_1,Score_2,Score_3,Score_4
0,-1.220969,-0.547957,0.138991,-0.147021,-0.183973,-0.546035,0.253757,0.267397,0.036047,-0.065113,6,6,0.003,0.1023,0.8204,0.0674,0.0069
1,0.612229,0.384465,0.311559,-0.701831,1.466238,-0.087508,-0.500246,0.73447,-0.272884,-1.093696,6,6,0.0105,0.2136,0.7593,0.0147,0.0019
2,-1.779356,-1.20952,-0.32119,-1.038531,-0.686211,-1.187973,0.505092,-1.174149,-1.385036,-0.270829,5,5,0.0065,0.6525,0.266,0.064,0.011
3,-0.081903,0.626756,-0.493758,-0.196897,0.551447,-1.449988,1.259095,-1.967928,-0.396457,-0.065113,7,7,0.0158,0.0735,0.1926,0.7094,0.0086
4,1.102272,-0.454081,1.116876,-0.050375,0.156831,0.341542,-0.33269,-0.514965,-0.087526,-0.682263,6,6,0.0013,0.1444,0.8181,0.0297,0.0065


In [None]:
# Predict on the preprocessed test frame with the finalized model. With
# raw_score=True the result carries per-class Score_* columns next to `Label`.
pred = predict_model(final_model,data=test, raw_score=True)
pred

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type,Label,Score_0,Score_1,Score_2,Score_3,Score_4
0,0.154583,0.008212,-0.470589,0.660311,-1.061068,-0.022119,0.403475,-0.775939,-2.026277,-1.271218,0.515259,0,6,0.0150,0.2720,0.5355,0.1576,0.0199
1,1.222912,1.857632,-0.539777,-0.480735,3.507027,-1.470535,-1.770665,1.489262,0.613957,0.604571,-1.248324,1,6,0.0188,0.2321,0.7454,0.0036,0.0001
2,-0.188557,1.618401,-2.131099,-0.528346,0.960311,-0.949105,-1.346442,0.169595,0.739683,0.123223,0.263319,1,6,0.0569,0.2621,0.6293,0.0497,0.0020
3,-0.742496,-0.669546,1.259108,1.181878,-0.370609,1.773918,0.297419,0.237619,-2.277728,-0.426409,-1.164343,0,6,0.0139,0.2683,0.6610,0.0482,0.0086
4,-0.188557,1.268386,-1.923535,-0.268089,0.990859,-1.354662,-1.558554,1.353214,2.939878,2.220729,-0.912403,1,6,0.0346,0.2804,0.6643,0.0200,0.0008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3226,-0.014571,0.080300,-0.539777,1.451074,-0.018157,1.252488,1.570086,1.115130,-0.769023,-0.181262,-1.416284,0,5,0.0270,0.7818,0.1699,0.0186,0.0027
3227,-1.459692,-0.222896,0.567229,0.559624,-0.919397,0.325501,0.049956,-1.116059,-0.077533,-0.688094,1.355061,0,7,0.0049,0.0291,0.3575,0.5442,0.0643
3228,0.154583,0.008212,2.573679,1.159231,-0.481909,1.600108,0.792345,1.013094,-1.020474,-0.511691,-1.584244,0,5,0.0049,0.9004,0.0898,0.0042,0.0007
3229,-1.910945,0.217954,0.359666,0.578589,-1.135261,0.673121,0.315095,-1.347341,0.173918,-0.872900,1.690981,0,7,0.0010,0.0164,0.3112,0.5712,0.1001


In [None]:
# Fill the sample submission with the predicted labels and save it next to the data.
sub = pd.read_csv(dir + 'sample_submission.csv')

# Series assignment aligns on the index and would silently write NaN for any
# mismatch, so check the row count and assign by position instead.
assert len(sub) == len(pred), "submission template and predictions differ in length"
sub['quality'] = pred['Label'].to_numpy()

sub.to_csv(dir + 'sample_submission_automl_4.csv', index=False)