# WINE QUALITY PREDICTION

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

## 1. READING FROM DATASET

In [2]:
df = pd.read_csv(r'C:\Users\vaish\Downloads\Wine Quality Detection\winequality-red.csv')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
df.shape

(1599, 12)

In [4]:
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [5]:
df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [6]:
df['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

## 2. SPLITTING OF DATASET

In [7]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1]
print(x.shape)
print(y.shape)

(1599, 11)
(1599,)


In [8]:
y.head()

0    5
1    5
2    5
3    6
4    5
Name: quality, dtype: int64

In [9]:
x_tr,x_te,y_tr,y_te = train_test_split(x,y,test_size=0.25)
print(x_tr.shape)
print(x_te.shape)
print(y_tr.shape)
print(y_te.shape)

(1199, 11)
(400, 11)
(1199,)
(400,)


## MODELING

### 1. Decision Tree Classifier

In [10]:
m1 = DecisionTreeClassifier(criterion='gini',max_depth=5,min_samples_split=10)
m1.fit(x_tr,y_tr)

DecisionTreeClassifier(max_depth=5, min_samples_split=10)

In [11]:
print('Training score',m1.score(x_tr,y_tr))
print('Testing score',m1.score(x_te,y_te))

Training score 0.6713928273561302
Testing score 0.58


In [12]:
ypred_m1 = m1.predict(x_te)
print(ypred_m1)

[5 7 6 5 6 6 4 7 5 5 5 5 6 5 6 5 6 6 7 6 6 5 5 5 6 6 5 5 6 6 6 5 6 7 5 7 6
 5 6 6 5 5 6 5 6 6 6 6 4 5 5 5 6 5 5 6 5 6 5 5 6 5 5 5 7 6 5 5 5 5 5 6 6 6
 6 5 5 6 5 5 6 7 5 6 5 6 6 5 5 6 5 6 5 5 5 4 6 5 5 6 6 7 6 5 5 7 6 5 5 5 5
 6 6 4 6 4 6 5 5 7 6 6 5 5 5 6 6 4 5 7 5 7 6 5 6 6 5 6 5 6 6 5 5 5 6 7 5 5
 6 5 5 6 7 6 5 5 6 6 6 5 6 5 6 7 5 5 5 5 5 5 5 6 6 5 6 6 5 6 5 5 6 7 5 5 5
 5 6 5 5 6 6 6 6 6 6 5 5 5 6 6 5 5 6 5 6 6 7 5 6 5 5 5 5 7 5 5 6 7 6 5 6 5
 7 5 5 6 6 6 5 6 6 6 5 6 7 5 5 5 7 6 6 5 6 5 6 5 7 6 5 5 5 6 6 5 6 6 6 6 7
 5 5 6 6 7 7 5 5 5 5 5 5 5 6 5 6 5 7 6 6 6 5 6 6 6 7 6 6 5 6 5 6 6 6 6 6 6
 6 6 6 6 5 5 7 5 7 5 7 7 5 6 6 6 7 6 6 6 6 5 6 5 6 5 6 7 6 7 5 5 7 6 5 5 6
 6 5 5 6 6 5 5 5 5 6 7 5 5 6 6 5 5 6 6 5 6 6 6 6 5 6 5 5 6 5 6 6 6 5 5 6 5
 7 6 5 5 7 5 6 7 5 6 5 6 5 5 5 6 6 5 6 5 5 5 5 6 6 6 6 5 5 6]


In [13]:
cm_m1 = confusion_matrix(y_te,ypred_m1)
print(cm_m1)
print(classification_report(y_te,ypred_m1))

[[  0   0   2   1   0   0]
 [  0   0   5   4   0   0]
 [  0   6 116  48   2   0]
 [  0   0  55  93  12   0]
 [  0   0   3  27  23   0]
 [  0   0   0   0   3   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         9
           5       0.64      0.67      0.66       172
           6       0.54      0.58      0.56       160
           7       0.57      0.43      0.49        53
           8       0.00      0.00      0.00         3

    accuracy                           0.58       400
   macro avg       0.29      0.28      0.29       400
weighted avg       0.57      0.58      0.57       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Random Forest Classifier

In [14]:
m2 = RandomForestClassifier(n_estimators=50, criterion='gini',max_depth=5,
                            min_samples_split=10)
m2.fit(x_tr,y_tr)

RandomForestClassifier(max_depth=5, min_samples_split=10, n_estimators=50)

In [15]:
print('Training score',m2.score(x_tr,y_tr))
print('Testing score',m2.score(x_te,y_te))

Training score 0.6864053377814846
Testing score 0.6275


In [16]:
ypred_m2 = m2.predict(x_te)
print(ypred_m2)

[5 6 6 5 6 6 5 6 5 5 5 5 6 5 5 5 6 6 6 6 6 5 5 5 6 6 5 5 6 6 6 5 5 7 5 6 6
 5 6 5 5 6 6 5 6 6 6 6 5 5 6 5 6 6 5 6 5 5 6 5 6 5 5 5 6 6 5 5 5 5 5 6 6 6
 5 5 5 6 6 5 6 6 5 6 5 6 6 5 5 6 5 7 5 5 5 5 5 5 5 6 6 6 6 5 5 6 6 5 5 5 5
 6 6 5 6 5 5 5 5 6 5 6 5 6 5 6 6 5 5 6 5 6 5 5 6 6 5 6 5 6 5 5 5 5 6 6 5 5
 6 5 5 6 6 6 5 5 5 6 6 5 6 5 6 6 6 5 5 5 5 5 5 6 5 5 6 6 5 6 5 5 6 6 5 5 5
 5 5 6 5 6 6 6 6 6 5 5 5 5 6 5 5 6 6 5 6 6 7 6 5 5 5 5 5 6 5 5 5 7 6 5 6 5
 6 5 5 6 6 5 5 5 6 6 5 6 6 5 5 5 6 5 5 6 6 5 6 5 6 6 5 5 5 6 5 5 6 6 6 5 6
 5 5 6 6 6 6 5 5 6 5 5 5 6 6 5 6 5 6 6 6 6 5 5 6 6 6 6 6 5 6 5 6 5 6 6 6 6
 6 5 6 6 5 5 7 5 6 5 7 6 5 6 6 6 6 6 6 6 6 5 6 5 5 5 6 6 6 6 5 5 6 6 6 5 6
 6 5 5 5 6 5 5 6 5 6 7 5 5 6 6 5 5 6 6 6 6 6 6 5 5 6 5 5 6 5 5 6 6 5 5 6 5
 6 6 5 5 6 5 5 7 5 6 5 5 5 5 5 6 6 5 6 5 5 5 5 6 5 6 5 5 5 6]


In [17]:
cm_m2 = confusion_matrix(y_te,ypred_m2)
print(cm_m2)
print(classification_report(y_te,ypred_m2))

[[  0   0   3   0   0   0]
 [  0   0   6   3   0   0]
 [  0   0 140  32   0   0]
 [  0   0  53 105   2   0]
 [  0   0   3  44   6   0]
 [  0   0   0   3   0   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         9
           5       0.68      0.81      0.74       172
           6       0.56      0.66      0.61       160
           7       0.75      0.11      0.20        53
           8       0.00      0.00      0.00         3

    accuracy                           0.63       400
   macro avg       0.33      0.26      0.26       400
weighted avg       0.62      0.63      0.59       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Logistic Regression

In [18]:
m3 = LogisticRegression(solver='liblinear')
m3.fit(x_tr,y_tr)

LogisticRegression(solver='liblinear')

In [19]:
print('Training score',m3.score(x_tr,y_tr))
print('Testing score',m3.score(x_te,y_te))

Training score 0.5829858215179317
Testing score 0.57


In [20]:
ypred_m3 = m3.predict(x_te)
print(ypred_m3)

[5 6 6 5 6 6 5 6 5 5 5 5 5 5 5 5 7 6 6 6 6 5 5 5 6 6 6 5 6 6 6 5 5 7 5 6 5
 6 6 5 5 6 6 5 5 6 5 5 5 5 6 5 6 5 5 6 5 6 5 5 6 5 5 5 6 5 5 5 5 5 5 6 6 6
 5 5 5 5 6 5 6 6 5 5 5 6 5 5 5 6 5 6 5 5 5 5 5 5 5 6 6 6 6 5 5 7 6 6 5 5 5
 6 6 5 6 5 5 5 5 6 5 6 5 5 5 6 6 5 5 6 6 6 5 5 6 5 5 6 5 6 5 6 5 5 6 6 5 5
 6 5 5 6 6 6 5 5 5 6 6 5 6 5 6 6 5 5 5 5 6 5 5 5 6 6 6 6 5 6 5 5 6 7 5 5 5
 6 5 5 5 5 5 6 6 6 6 5 5 5 6 6 6 6 6 5 5 6 6 6 6 5 5 5 5 6 5 5 5 6 6 6 6 6
 6 5 5 5 6 5 5 5 6 6 6 5 6 5 5 5 6 5 6 6 6 5 6 5 6 6 6 5 5 6 6 6 6 6 6 5 6
 5 6 6 6 6 6 5 5 6 5 5 5 6 6 5 6 5 6 6 6 6 5 6 5 6 6 6 5 6 6 5 6 5 6 5 6 6
 6 5 6 6 5 6 6 5 6 5 7 6 5 6 5 6 6 6 6 6 5 5 6 5 5 5 6 6 6 6 5 5 6 6 6 6 6
 6 5 5 5 6 5 6 6 5 6 6 5 5 6 6 5 5 5 6 5 6 6 6 5 5 6 5 5 6 6 5 5 6 5 5 6 5
 6 5 5 5 6 5 5 7 5 6 5 5 5 5 5 6 6 5 6 5 5 5 5 6 5 5 6 5 5 5]


In [21]:
cm_m3 = confusion_matrix(y_te,ypred_m3)
print(cm_m3)
print(classification_report(y_te,ypred_m3))

[[  0   0   3   0   0   0]
 [  0   0   7   2   0   0]
 [  0   0 132  39   1   0]
 [  0   0  63  94   3   0]
 [  0   0   3  48   2   0]
 [  0   0   0   3   0   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         9
           5       0.63      0.77      0.69       172
           6       0.51      0.59      0.54       160
           7       0.33      0.04      0.07        53
           8       0.00      0.00      0.00         3

    accuracy                           0.57       400
   macro avg       0.25      0.23      0.22       400
weighted avg       0.52      0.57      0.53       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### KNN Classifier

In [22]:
from sklearn.neighbors import KNeighborsClassifier

In [23]:
x_tr.shape

(1199, 11)

In [24]:
m4 = KNeighborsClassifier(n_neighbors=27)
m4.fit(x_tr,y_tr)

KNeighborsClassifier(n_neighbors=27)

In [25]:
print('Training score',m4.score(x_tr,y_tr))
print('Testing score',m4.score(x_te,y_te))

Training score 0.5629691409507923
Testing score 0.545


In [26]:
ypred_m4 = m4.predict(x_te)
print(ypred_m4)

[5 5 6 5 6 6 5 6 6 5 5 5 5 5 6 5 7 6 6 6 6 5 5 5 6 6 5 6 6 6 6 5 5 7 5 7 6
 7 5 6 6 6 6 6 6 6 5 6 5 5 6 5 6 5 5 5 5 6 5 5 6 5 5 5 5 6 5 5 5 5 6 6 6 6
 5 6 5 5 6 5 6 6 5 6 5 6 6 5 6 5 5 6 5 6 5 5 6 5 5 6 6 6 5 5 5 5 6 5 5 5 5
 5 5 5 6 5 5 6 6 6 6 6 6 6 5 6 5 6 6 6 5 5 6 5 6 5 5 5 5 5 5 6 5 5 6 6 5 6
 6 5 5 6 5 6 5 5 5 6 6 5 6 5 5 6 5 5 5 5 5 6 6 6 6 5 6 6 5 6 6 5 5 5 5 5 5
 6 6 6 5 5 5 6 6 5 5 6 5 5 6 7 5 6 5 5 5 7 6 5 6 5 5 5 5 7 5 5 5 5 6 6 6 5
 6 5 6 5 6 5 5 6 6 6 6 5 6 6 6 5 6 5 5 6 6 5 5 5 6 5 6 5 5 6 5 5 6 6 6 6 6
 5 6 6 5 6 5 5 6 5 5 5 5 5 6 5 5 5 6 6 5 5 6 6 6 6 6 6 6 6 5 5 6 6 6 5 6 6
 5 5 6 6 6 6 6 5 5 5 6 6 5 6 6 6 6 7 6 6 6 5 6 6 5 5 5 5 6 5 5 5 7 6 5 5 5
 5 5 5 5 6 6 5 6 5 6 6 5 5 7 6 5 5 5 6 5 6 6 5 5 5 6 5 5 6 6 5 5 6 5 5 5 5
 6 6 5 6 6 5 6 6 5 6 5 5 5 5 5 6 6 5 6 6 5 5 5 5 5 5 5 5 5 5]


In [27]:
cm_m4 = confusion_matrix(y_te,ypred_m4)
print(cm_m4)
print(classification_report(y_te,ypred_m4))

[[  0   0   2   1   0   0]
 [  0   0   4   3   2   0]
 [  0   0 127  44   1   0]
 [  0   0  66  89   5   0]
 [  0   0  12  39   2   0]
 [  0   0   1   2   0   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         9
           5       0.60      0.74      0.66       172
           6       0.50      0.56      0.53       160
           7       0.20      0.04      0.06        53
           8       0.00      0.00      0.00         3

    accuracy                           0.55       400
   macro avg       0.22      0.22      0.21       400
weighted avg       0.48      0.55      0.50       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### SVC

In [28]:
from sklearn.svm import SVC

In [29]:
m5 = SVC(kernel='linear',C=1)
m5.fit(x_tr,y_tr)

SVC(C=1, kernel='linear')

In [30]:
m6 = SVC(kernel='rbf',gamma=0.1)
m6.fit(x_tr,y_tr)

SVC(gamma=0.1)

In [31]:
m7 = SVC(kernel='poly')
m7.fit(x_tr,y_tr)

SVC(kernel='poly')

In [32]:
print('Training score',m5.score(x_tr,y_tr))
print('Testing score',m5.score(x_te,y_te))

Training score 0.5838198498748958
Testing score 0.5825


In [33]:
ypred_m5 = m5.predict(x_te)
print(ypred_m5)

[6 6 6 5 6 6 5 6 5 5 5 5 5 5 5 5 6 6 6 6 5 5 5 5 6 6 6 5 5 6 6 5 5 6 5 6 6
 6 6 5 5 5 6 5 5 6 6 5 5 5 5 5 6 5 5 6 5 6 5 5 6 5 5 5 6 5 5 5 5 5 5 6 6 6
 5 5 5 6 6 5 6 6 5 6 5 6 6 5 5 6 5 6 5 5 5 5 5 5 5 6 6 6 6 5 5 6 6 5 5 5 5
 6 6 5 6 5 5 5 5 6 5 5 5 5 5 6 6 5 5 6 5 6 5 5 6 5 5 6 5 6 5 5 5 5 6 6 5 5
 6 5 5 6 6 6 5 5 5 6 6 5 6 5 6 6 5 5 5 5 6 5 5 5 6 5 6 6 5 6 5 5 6 6 5 5 5
 5 5 5 5 5 5 6 6 6 6 5 5 5 6 5 5 6 6 5 6 6 6 6 5 5 5 5 5 6 5 5 5 6 6 5 6 5
 6 5 5 5 6 5 5 5 6 6 5 5 6 5 5 5 6 5 6 5 6 5 6 5 6 6 6 5 5 6 6 6 6 6 6 5 6
 5 5 6 6 6 6 5 5 6 5 5 5 5 6 5 5 5 6 6 6 6 5 6 6 6 6 6 5 5 6 5 6 5 6 5 6 6
 6 5 6 6 5 6 6 6 6 5 6 6 5 6 5 6 6 6 6 6 6 5 6 5 5 5 6 6 6 6 5 5 6 6 6 5 6
 6 5 5 5 6 5 6 5 5 6 6 5 5 6 6 5 5 6 6 5 6 6 6 5 5 6 5 5 6 5 5 5 6 5 5 6 5
 6 5 5 5 6 5 5 6 5 6 5 5 5 5 5 6 6 5 6 5 5 5 5 6 5 5 6 5 5 5]


In [34]:
cm_m5 = confusion_matrix(y_te,ypred_m5)
print(cm_m5)
print(classification_report(y_te,ypred_m5))

[[  0   0   3   0   0   0]
 [  0   0   7   2   0   0]
 [  0   0 140  32   0   0]
 [  0   0  67  93   0   0]
 [  0   0   4  49   0   0]
 [  0   0   0   3   0   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         9
           5       0.63      0.81      0.71       172
           6       0.52      0.58      0.55       160
           7       0.00      0.00      0.00        53
           8       0.00      0.00      0.00         3

    accuracy                           0.58       400
   macro avg       0.19      0.23      0.21       400
weighted avg       0.48      0.58      0.53       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Comparison and Final Conclusion

In [35]:
print('DT Acc',m1.score(x_te,y_te))
print('RF Acc',m2.score(x_te,y_te))
print('LogReg Acc',m3.score(x_te,y_te))
print('KNN Acc',m4.score(x_te,y_te))
print('SVM Acc',m5.score(x_te,y_te))

DT Acc 0.58
RF Acc 0.6275
LogReg Acc 0.57
KNN Acc 0.545
SVM Acc 0.5825


In [36]:
from sklearn.metrics import precision_score,accuracy_score

In [37]:
print('DT Precision',precision_score(y_te,ypred_m1,average='macro'))
print('RF Precision',precision_score(y_te,ypred_m2,average='macro'))
print('LogReg Precision',precision_score(y_te,ypred_m3,average='macro'))
print('KNN Precision',precision_score(y_te,ypred_m4,average='macro'))
print('SVM Precision',precision_score(y_te,ypred_m5,average='macro'))

DT Precision 0.2922427053726354
RF Precision 0.3324040259119169
LogReg Precision 0.24555417700578988
KNN Precision 0.21650943396226416
SVM Precision 0.19217287258693763


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
print(classification_report(y_te,ypred_m5))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         9
           5       0.63      0.81      0.71       172
           6       0.52      0.58      0.55       160
           7       0.00      0.00      0.00        53
           8       0.00      0.00      0.00         3

    accuracy                           0.58       400
   macro avg       0.19      0.23      0.21       400
weighted avg       0.48      0.58      0.53       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
