# Wine Quality prediction

### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Importing Dataset

In [2]:
df = pd.read_csv('WineQT.csv')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,4


### Droping the 'id' column

In [3]:
df.drop(['Id'], axis=1, inplace=True)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Check for missing values
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 107.3 KB


### Checking if the data is balanced

In [6]:
print(df['quality'].unique())
print(df['quality'].value_counts())

[5 6 7 4 8 3]
5    483
6    462
7    143
4     33
8     16
3      6
Name: quality, dtype: int64


###### We can see that 5 and 6 occur more than 80% of the times. This means our data is highly imbalanced and if we train on this data, our model will be highly biased. We will utilize oversampling to overcome this challenge

### Let's scale our data, it will be easy for model to train

In [7]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc_df = sc.fit_transform(df.drop(['quality'], axis=1))
sc_df = pd.DataFrame(sc_df, columns=df.columns[:-1])

In [8]:
sc_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.52158,0.939332,-1.365027,-0.466421,-0.231395,-0.450467,-0.36361,0.555854,1.270695,-0.573658,-0.963382
1,-0.292593,1.941813,-1.365027,0.05006,0.234247,0.91592,0.643477,0.036165,-0.708928,0.130881,-0.593601
2,-0.292593,1.273492,-1.161568,-0.171289,0.107253,-0.060071,0.246745,0.140103,-0.325775,-0.045254,-0.593601
3,1.653789,-1.399789,1.4834,-0.466421,-0.25256,0.135127,0.429852,0.659792,-0.964363,-0.456235,-0.593601
4,-0.52158,0.939332,-1.365027,-0.466421,-0.231395,-0.450467,-0.36361,0.555854,1.270695,-0.573658,-0.963382


In [9]:
X = sc_df
y = df['quality']

### For oversampling I will use imblearn library

In [10]:
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X, y = sm.fit_resample(X, y)

In [11]:
print(X.shape, y.shape)

(2898, 11) (2898,)


In [12]:
print(y.value_counts())

7    483
5    483
3    483
8    483
6    483
4    483
Name: quality, dtype: int64


### Train Test Split

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [15]:
# To try bunch of models, lets create a method so that we can try all of them
def eval_model(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print("-"*30)
    print("Accuracy Score:", accuracy_score(y_test, y_pred))

### Model 1: Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
eval_model(LogisticRegression())

[[109   7   0   0   0   0]
 [  6  58  20  21   3   0]
 [  6  24  61  27   4   4]
 [  4  18  24  38  30  19]
 [  1   2   1  23  68  31]
 [  0   0   0   0  11 105]]
------------------------------
Accuracy Score: 0.6055172413793104


### Model 2: KNN

In [17]:
from sklearn.neighbors import KNeighborsClassifier
eval_model(KNeighborsClassifier())

[[116   0   0   0   0   0]
 [  0 104   2   2   0   0]
 [  2  23  57  31  11   2]
 [  0  17  26  47  28  15]
 [  0   1   0   4 115   6]
 [  0   0   0   0   1 115]]
------------------------------
Accuracy Score: 0.7641379310344828


### Model 3: SVM

In [18]:
from sklearn.svm import SVC
eval_model(SVC())

[[116   0   0   0   0   0]
 [  3  90   8   6   1   0]
 [  3  15  76  27   4   1]
 [  1  13  32  52  25  10]
 [  0   0   0  29  90   7]
 [  0   0   0   0   0 116]]
------------------------------
Accuracy Score: 0.7448275862068966


### Model 4: Kernel SVM

In [19]:
from sklearn.svm import SVC
eval_model(SVC(kernel='rbf'))

[[116   0   0   0   0   0]
 [  3  90   8   6   1   0]
 [  3  15  76  27   4   1]
 [  1  13  32  52  25  10]
 [  0   0   0  29  90   7]
 [  0   0   0   0   0 116]]
------------------------------
Accuracy Score: 0.7448275862068966


### Model 5: Naive Bayes

In [20]:
from sklearn.naive_bayes import GaussianNB
eval_model(GaussianNB())

[[110   6   0   0   0   0]
 [ 37  42  14  11   4   0]
 [ 18  24  58  10  11   5]
 [ 14  27  16  25  26  25]
 [  1  11   1  10  48  55]
 [  0   0   0   6  11  99]]
------------------------------
Accuracy Score: 0.526896551724138


### Model 6: Decision Tree Classifier

In [21]:
from sklearn.tree import DecisionTreeClassifier
eval_model(DecisionTreeClassifier())

[[113   1   0   2   0   0]
 [  0  82  15   8   3   0]
 [  0   8  77  30  10   1]
 [  0  11  32  61  24   5]
 [  0   0   2  13 104   7]
 [  0   0   1   2   3 110]]
------------------------------
Accuracy Score: 0.7544827586206897


### Model 7: Random Forest Classifier

In [22]:
from sklearn.ensemble import RandomForestClassifier
eval_model(RandomForestClassifier())

[[116   0   0   0   0   0]
 [  0 100   7   1   0   0]
 [  0   8  86  29   3   0]
 [  0   6  32  67  25   3]
 [  0   1   0   8 115   2]
 [  0   0   0   0   1 115]]
------------------------------
Accuracy Score: 0.8262068965517242


### Model 8: XGBoost

In [23]:
from xgboost import XGBClassifier
eval_model(XGBClassifier())





[[116   0   0   0   0   0]
 [  0 101   6   1   0   0]
 [  1   5  88  30   1   1]
 [  0   4  29  71  26   3]
 [  0   1   0  11 112   2]
 [  0   0   0   0   1 115]]
------------------------------
Accuracy Score: 0.8317241379310345


#### Conclusion: Random Forest Regression and XGboost is the best model to predict wine quality

#### Note: These models can be improved for better results