# Red Wine Quality Prediction
 By: Sanket Sharma, Email:ur.sanketsharma@gmail.com, Linkedin: https://www.linkedin.com/in/ursanketsharma

In [20]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import sklearn.metrics as matrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

In [21]:
# Read the red-wine quality CSV into a DataFrame and preview the first rows.
csv_path = "Data/winequality-red.csv"
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [22]:
# Inspect column dtypes / non-null counts, then the number of missing values per column.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("Missing Values\n")
print(df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
None
Missing Values

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
c

All features are numeric, and there are no missing values.

In [23]:
# Count how many wines fall into each raw quality score.
print(df.value_counts(subset="quality"))

quality
5    681
6    638
7    199
4     53
8     18
3     10
dtype: int64


In [24]:
# Collapse the raw scores into three grades: 0 = bad (< 5), 1 = average (5-6), 2 = good (>= 7).
# A wine's grade is simply the number of thresholds (5 and 7) that its score reaches.
# NOTE: the column is overwritten in place, so re-running this cell on the already
# binned values (0-2) would map every row to 0; reload the data before re-running.
raw_quality = df['quality']
df['quality'] = (raw_quality >= 5).astype('int64') + (raw_quality >= 7).astype('int64')

In [25]:
# Class sizes after binning; the output shows how imbalanced the three grades are.
df.value_counts(subset='quality')

quality
1    1319
2     217
0      63
dtype: int64

In [26]:
# Separate the target column (y) from the remaining feature columns (X).
target_col = 'quality'
y = df[target_col]
X = df.drop(columns=target_col)

In [27]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
# and is reused below as random_state for the models.
# NOTE(review): the classes are imbalanced and the split is not stratified;
# consider passing stratify=y so each grade keeps its share in both subsets.
seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

In [28]:
# Sanity check: number of rows in the training target after the split.
y_train.shape

(1279,)

In [29]:
# Let's build a dictionary of all candidate models

In [30]:
# Candidate classifiers keyed by a short display name. Every estimator that takes
# a random_state receives the shared seed; GaussianNB is built with its defaults.
Models = dict(
    DecisionTree=DecisionTreeClassifier(random_state=seed),
    RandomForest=RandomForestClassifier(random_state=seed),
    ExtraTrees=ExtraTreesClassifier(random_state=seed),
    AdaBoost=AdaBoostClassifier(random_state=seed),
    GBC=GradientBoostingClassifier(random_state=seed),
    GNB=GaussianNB(),
)

In [31]:
# Let's fit each model in the Models dictionary and collect their scores in the scores list

In [32]:
# Fit every candidate on the training split, print its test-set metrics and
# record its accuracy in `scores` so the models can be ranked afterwards.
# NOTE(review): with average='micro' on a single-label multiclass target the F1
# score coincides with accuracy, so the two metric lines report the same value;
# consider average='macro' if per-class performance should be reflected.
scores = []
for name, model in Models.items():
    # fit() returns the estimator itself, so training and prediction can be chained.
    pred = model.fit(X_train, y_train).predict(X_test)
    accuracy_score = matrics.accuracy_score(y_test, pred)
    micro_f1 = matrics.f1_score(y_test, pred, average='micro')

    # print() applies str() to each argument; sep="" reproduces plain concatenation.
    print("Model Name: ", model, "\n", sep="")
    print("F1_score: ", micro_f1, "\n", sep="")
    print("Accuray:", accuracy_score, "\n", sep="")
    print("Confusin Matrix: \n")
    print(matrics.confusion_matrix(y_test, pred))

    scores.append({'model': name, 'accuracy_score': accuracy_score})
    print("\n*****************************\n")

Model Name: DecisionTreeClassifier(random_state=42)

F1_score: 0.784375

Accuray:0.784375

Confusin Matrix: 

[[  1  10   0]
 [ 13 222  27]
 [  1  18  28]]

*****************************

Model Name: RandomForestClassifier(random_state=42)

F1_score: 0.865625

Accuray:0.865625

Confusin Matrix: 

[[  0  11   0]
 [  0 251  11]
 [  0  21  26]]

*****************************

Model Name: ExtraTreesClassifier(random_state=42)

F1_score: 0.8843749999999999

Accuray:0.884375

Confusin Matrix: 

[[  0  11   0]
 [  0 257   5]
 [  0  21  26]]

*****************************

Model Name: AdaBoostClassifier(random_state=42)

F1_score: 0.7281249999999999

Accuray:0.728125

Confusin Matrix: 

[[  2   9   0]
 [ 28 205  29]
 [  0  21  26]]

*****************************

Model Name: GradientBoostingClassifier(random_state=42)

F1_score: 0.85

Accuray:0.85

Confusin Matrix: 

[[  1  10   0]
 [  3 249  10]
 [  0  25  22]]

*****************************

Model Name: GaussianNB()

F1_score: 0.815625

Accu

In [33]:
# Rank the models from highest to lowest test accuracy to find the best one.
scores_df = pd.DataFrame.from_records(scores)
scores_df.sort_values('accuracy_score', ascending=False)

Unnamed: 0,model,accuracy_score
2,ExtraTrees,0.884375
1,RandomForest,0.865625
4,GBC,0.85
5,GNB,0.815625
0,DecisionTree,0.784375
3,AdaBoost,0.728125


    ExtraTrees is the best model here by accuracy.

In [34]:
# Use ExtraTrees as the final model: train a fresh instance on the training split.
# fit() returns the estimator itself, so `classifier` and `final_model` refer to
# the same fitted object.
final_model = ExtraTreesClassifier(random_state=seed).fit(X_train, y_train)
classifier = final_model

In [35]:
# Persist the fitted classifier to disk so it can be reloaded without retraining.

import pickle

# The context manager closes the file even if pickle.dump raises, so the handle
# is never leaked and the data is flushed before the file is read back.
with open("PKL/classifier.pkl", "wb") as pickle_out:
    pickle.dump(classifier, pickle_out)

In [36]:
# Reload the classifier from the pickle file to check that it round-trips.
# The context manager closes the handle as soon as loading finishes instead of
# leaving it open across cells (closing an already-closed file later is a no-op).
# SECURITY: pickle.load can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open('PKL/classifier.pkl', 'rb') as pickle_in:
    classifier = pickle.load(pickle_in)

In [37]:
# Predict the grade of a single wine with the reloaded classifier.
# The model was fitted on a DataFrame, so the sample is wrapped in a DataFrame
# that reuses the training column names: this ties each value to its feature
# explicitly and avoids scikit-learn's "X does not have valid feature names"
# warning that a bare nested list triggers in recent versions.
sample = pd.DataFrame(
    [[11.2, 0.28, 0.56, 1.9, 0.075, 17.0, 60.0, 0.998, 3.16, 0.58, 9.8]],
    columns=X.columns,
)
prediction = classifier.predict(sample)
print(prediction)

[1]


In [38]:
# Release the file handle that was used to read the pickled classifier.
pickle_in.close()