# Wine Quality Prediction Model

In [52]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import sys
import warnings
warnings.filterwarnings("ignore")
sys.path.append('../scripts')

from data_load import data_reading
from data_load import feature_generation

In [53]:
# data_reading() is imported from the local data_load module; it returns two
# frames, used from here on as the training set and the test set.
train_df, test_df = data_reading()

In [54]:
# Run the shared feature-engineering helper over both splits, train first.
train_df, test_df = [feature_generation(frame) for frame in (train_df, test_df)]

In [55]:
# Discard incomplete training rows, announcing it only when some exist.
train_has_missing = train_df.isnull().values.any()
if train_has_missing:
    print('dropped missing values in train_df.')
    train_df = train_df.dropna()

dropped missing values in train_df.


In [56]:
# Target is the 'quality' column; every other column is a model feature.
y = train_df['quality']
X = train_df.drop(columns='quality')

In [57]:
from sklearn.preprocessing import LabelEncoder

# Encode the quality labels as consecutive integers starting at 0. The fitted
# encoder is kept so predictions can be mapped back to quality values later.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [58]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation, stratified on the encoded label,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=2019,
)

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import xgboost as xgb

## Model 1: Logistic Regression

In [23]:
# Baseline: logistic regression with scikit-learn's default settings,
# fitted on the training split and scored on both splits below.
model = LogisticRegression().fit(X_train, y_train)
y_tr_pred, y_te_pred = model.predict(X_train), model.predict(X_test)

In [24]:
# Accuracy on the train and held-out splits.
train_acc, test_acc = [
    accuracy_score(truth, guess)
    for truth, guess in ((y_train, y_tr_pred), (y_test, y_te_pred))
]
print(f'LogisticRegression Accuracy - Train: {train_acc:#.2f}, Test: {test_acc:#.2f}')

# Quadratic weighted kappa on the same two splits.
train_qwk, test_qwk = [
    cohen_kappa_score(truth, guess, weights='quadratic')
    for truth, guess in ((y_train, y_tr_pred), (y_test, y_te_pred))
]
print(f'LogisticRegression QWK - Train: {train_qwk:#.2f}, Test: {test_qwk:#.2f}')

LogisticRegression Accuracy - Train: 0.58, Test: 0.55
LogisticRegression QWK - Train: 0.46, Test: 0.41


## Model 2: KNN

In [9]:
def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between `y_true` and `y_pred` using quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa


# Scorer wrapper so the grid searches below can rank candidates by QWK.
qwk_scorer = make_scorer(quadratic_weighted_kappa)

In [26]:
# Tune k over the odd values 1..11 with 5-fold CV, ranking candidates by QWK.
params = {'n_neighbors': list(range(1, 13, 2))}
model = GridSearchCV(
    KNeighborsClassifier(),
    params,
    scoring=qwk_scorer,
    cv=5,
)
model.fit(X_train, y_train)

# Keep the winning estimator and report its setting and mean CV score.
best_model = model.best_estimator_
print(model.best_params_, model.best_score_, sep='\n')

y_tr_pred, y_te_pred = best_model.predict(X_train), best_model.predict(X_test)

{'n_neighbors': 9}
0.26152574974887133


In [27]:
# (truth, prediction) pairs for the train and held-out splits, in that order.
train_acc, test_acc = [
    accuracy_score(truth, guess)
    for truth, guess in ((y_train, y_tr_pred), (y_test, y_te_pred))
]
print(f'KNeighborsClassifier Accuracy - Train: {train_acc:#.2f}, Test: {test_acc:#.2f}')

# Same pairs, scored with quadratic weighted kappa.
train_qwk, test_qwk = [
    cohen_kappa_score(truth, guess, weights='quadratic')
    for truth, guess in ((y_train, y_tr_pred), (y_test, y_te_pred))
]
print(f'KNeighborsClassifier QWK - Train: {train_qwk:#.2f}, Test: {test_qwk:#.2f}')

KNeighborsClassifier Accuracy - Train: 0.59, Test: 0.49
KNeighborsClassifier QWK - Train: 0.35, Test: 0.20


## Model 3: Random Forest

In [28]:
# Best parameters after first run (obtained before the forest was seeded, so
# a re-run may select a different combination):
# {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
params = {'n_estimators': [100, 200],
          'max_depth': [5, 10],
          'min_samples_split': [2, 5],
          'min_samples_leaf': [1, 2],
          'max_features': ['sqrt', 'log2'],
          'bootstrap': [True, False]}

# Seed the forest: without random_state every fit draws different bootstrap
# samples and feature subsets, so the CV ranking, the selected parameters and
# the reported scores change from run to run. 2019 matches the seed already
# used for the train/test split.
model = GridSearchCV(estimator = RandomForestClassifier(random_state = 2019),
                     param_grid = params,
                     cv = 5,
                     verbose = True,
                     scoring=qwk_scorer)

model.fit(X_train, y_train)

# Keep the winning estimator and report its setting and mean CV score (QWK).
best_model = model.best_estimator_
print(model.best_params_)
print(model.best_score_)

y_tr_pred = best_model.predict(X_train)
y_te_pred = best_model.predict(X_test)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
{'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
0.4907716429914262


In [29]:
# Accuracy of the selected forest on the train and held-out splits.
train_acc, test_acc = [
    accuracy_score(truth, guess)
    for truth, guess in ((y_train, y_tr_pred), (y_test, y_te_pred))
]
print(f'RandomForestClassifier Accuracy - Train: {train_acc:#.2f}, Test: {test_acc:#.2f}')

# Quadratic weighted kappa on the same two splits.
train_qwk, test_qwk = [
    cohen_kappa_score(truth, guess, weights='quadratic')
    for truth, guess in ((y_train, y_tr_pred), (y_test, y_te_pred))
]
print(f'RandomForestClassifier QWK - Train: {train_qwk:#.2f}, Test: {test_qwk:#.2f}')

RandomForestClassifier Accuracy - Train: 0.91, Test: 0.58
RandomForestClassifier QWK - Train: 0.86, Test: 0.47


## Model 4: XGBoost

In [30]:
# Best parameters at first run
# {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 0, 'subsample': 0.8}
params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'learning_rate': [0.1, 0.01],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [0, 0.1],
}

# Exhaustive search over the grid above: 5-fold CV, candidates ranked by QWK.
model = GridSearchCV(
    xgb.XGBClassifier(objective='multi:softmax'),
    params,
    scoring=qwk_scorer,
    cv=5,
    verbose=True,
)
model.fit(X_train, y_train)

# Keep the winning estimator and report its setting.
best_model = model.best_estimator_
print(model.best_params_)

y_tr_pred, y_te_pred = best_model.predict(X_train), best_model.predict(X_test)

Fitting 5 folds for each of 128 candidates, totalling 640 fits
{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 0, 'subsample': 0.8}


In [31]:
# Accuracy of the selected booster on the train and held-out splits.
train_acc, test_acc = [
    accuracy_score(truth, guess)
    for truth, guess in ((y_train, y_tr_pred), (y_test, y_te_pred))
]
print(f'XGBClassifier Accuracy - Train: {train_acc:#.2f}, Test: {test_acc:#.2f}')

# Quadratic weighted kappa on the same two splits.
train_qwk, test_qwk = [
    cohen_kappa_score(truth, guess, weights='quadratic')
    for truth, guess in ((y_train, y_tr_pred), (y_test, y_te_pred))
]
print(f'XGBClassifier QWK - Train: {train_qwk:#.2f}, Test: {test_qwk:#.2f}')

XGBClassifier Accuracy - Train: 1.00, Test: 0.57
XGBClassifier QWK - Train: 1.00, Test: 0.47


## Model 5: XGBoost with Stratified KFold CV

In [27]:
# Best parameters after first run
# {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 0, 'subsample': 0.8}
# NOTE(review): the saved output of this cell reports a single candidate, so it
# was presumably produced with a reduced grid rather than the one below - confirm.
params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'learning_rate': [0.1, 0.01],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [0, 0.1],
}

# Same search as the previous model, but with explicitly shuffled, seeded
# stratified folds instead of the integer cv shortcut.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
model = xgb.XGBClassifier(objective='multi:softmax')
grid_search = GridSearchCV(
    model,
    params,
    scoring=qwk_scorer,
    cv=skf,
    verbose=True,
)
grid_search.fit(X_train, y_train)

# Keep the winning estimator and report its setting.
best_model = grid_search.best_estimator_
print(grid_search.best_params_)

y_tr_pred, y_te_pred = best_model.predict(X_train), best_model.predict(X_test)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 0, 'subsample': 0.8}


In [28]:
# Accuracy of the selected booster on the train and held-out splits.
train_acc, test_acc = [
    accuracy_score(truth, guess)
    for truth, guess in ((y_train, y_tr_pred), (y_test, y_te_pred))
]
print(f'XGBClassifier Accuracy - Train: {train_acc:#.2f}, Test: {test_acc:#.2f}')

# Quadratic weighted kappa on the same two splits.
train_qwk, test_qwk = [
    cohen_kappa_score(truth, guess, weights='quadratic')
    for truth, guess in ((y_train, y_tr_pred), (y_test, y_te_pred))
]
print(f'XGBClassifier QWK - Train: {train_qwk:#.2f}, Test: {test_qwk:#.2f}')

XGBClassifier Accuracy - Train: 1.00, Test: 0.57
XGBClassifier QWK - Train: 1.00, Test: 0.47


In [60]:
# Predict from the feature columns only. Dropping any existing 'predictions'
# column keeps this cell re-runnable: without it, a second run would pass the
# previous predictions to the model as an extra feature.
test_features = test_df.drop(columns='predictions', errors='ignore')
encoded_predictions = best_model.predict(test_features)

# Map the encoded class indices back to the original quality labels.
test_df['predictions'] = label_encoder.inverse_transform(encoded_predictions)
test_df['predictions'].value_counts(dropna = False)

6    621
5    592
7    155
8      2
4      2
Name: predictions, dtype: int64

In [61]:
# Check that the decoded predictions are stored with an integer dtype.
test_df['predictions'].dtype

dtype('int64')

In [62]:
# Build the submission frame: keep only the 'Id' column of the raw test file
# and attach the predicted quality as integers.
df = pd.read_csv('../../data/test.csv')
non_id_columns = [col for col in df.columns if col != 'Id']
df = df.drop(columns=non_id_columns)
# NOTE(review): this assignment aligns on the row index, so it assumes test_df
# still carries the same index as the freshly read CSV - confirm.
df['quality'] = test_df['predictions'].astype(int)

Unnamed: 0,Id,quality
0,2056,6
1,2057,7
2,2058,6
3,2059,6
4,2060,6
...,...,...
1367,3423,5
1368,3424,6
1369,3425,5
1370,3426,5


In [63]:
# Write the submission frame (Id and quality columns) to CSV without the row index.
df.to_csv('../../data/submission_file.csv', index=False)

In [64]:
# Sanity check: (rows, columns) of the submission frame.
df.shape

(1372, 2)