# Predicting the Quality of Red Wine

In [39]:
import pandas as pd

## Data Ingestion

In [40]:
# Red-wine quality dataset, read as CSV straight from GitHub (needs network access).
WINE_CSV_URL = "https://raw.githubusercontent.com/aniruddhachoudhury/Red-Wine-Quality/master/winequality-red.csv"
df = pd.read_csv(WINE_CSV_URL)

In [41]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## EDA

In [42]:
# List the column labels: the feature names plus the `quality` target.
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [43]:
# Distinct quality scores that occur in the data.
df["quality"].unique()

array([5, 6, 7, 4, 8, 3], dtype=int64)

In [44]:
# Number of wines per quality score; shows how imbalanced the classes are.
quality_counts = df["quality"].value_counts()
quality_counts

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

### We can see there are six classes (six different values of quality to be predicted)
### This is a multi-class classification problem

In [45]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


### No missing data (no handling required)

In [46]:
# Summary statistics, transposed so that each column of `df` becomes one row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,8.319637,1.741096,4.6,7.1,7.9,9.2,15.9
volatile acidity,1599.0,0.527821,0.17906,0.12,0.39,0.52,0.64,1.58
citric acid,1599.0,0.270976,0.194801,0.0,0.09,0.26,0.42,1.0
residual sugar,1599.0,2.538806,1.409928,0.9,1.9,2.2,2.6,15.5
chlorides,1599.0,0.087467,0.047065,0.012,0.07,0.079,0.09,0.611
free sulfur dioxide,1599.0,15.874922,10.460157,1.0,7.0,14.0,21.0,72.0
total sulfur dioxide,1599.0,46.467792,32.895324,6.0,22.0,38.0,62.0,289.0
density,1599.0,0.996747,0.001887,0.99007,0.9956,0.99675,0.997835,1.00369
pH,1599.0,3.311113,0.154386,2.74,3.21,3.31,3.4,4.01
sulphates,1599.0,0.658149,0.169507,0.33,0.55,0.62,0.73,2.0


### Standardization

##### We standardize the features because the columns are on very different scales (some columns have values around 0.06, while others have values around 35.00)
#### We need to standardize the data

In [47]:
from sklearn.preprocessing import StandardScaler

In [57]:
# Feature matrix: every column except the `quality` target.
X = df.drop(columns=["quality"])

In [58]:
# Confirm that `quality` is no longer among the feature columns.
X.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [62]:
# Target vector: the quality score the model has to predict.
y = df["quality"]
y.head(n=5)

0    5
1    5
2    5
3    6
4    5
Name: quality, dtype: int64

### Splitting the data as train and test data 

In [63]:
from sklearn.model_selection import train_test_split

# Hold out 35% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=43,
)
# NOTE(review): the quality classes are heavily imbalanced (see the value counts
# in the EDA section); consider stratify=y so rare classes land in both splits.

In [64]:
# Peek at the training features; the shuffled index shows rows were sampled at random.
X_train.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
487,10.2,0.645,0.36,1.8,0.053,5.0,14.0,0.9982,3.17,0.42,10.0
489,9.3,0.39,0.4,2.6,0.073,10.0,26.0,0.9984,3.34,0.75,10.2
667,11.3,0.34,0.45,2.0,0.082,6.0,15.0,0.9988,2.94,0.66,9.2
237,7.2,0.645,0.0,1.9,0.097,15.0,39.0,0.99675,3.37,0.58,9.2
286,12.0,0.45,0.55,2.0,0.073,25.0,49.0,0.9997,3.1,0.76,10.3


In [65]:
# Training labels; their index matches the rows of X_train shown above.
Y_train.head(n=5)

487    6
489    6
667    6
237    6
286    6
Name: quality, dtype: int64

In [69]:
# Fit the scaler on the training split only, so that no statistics from the
# test split influence the preprocessing.
scaler = StandardScaler().fit(X_train)  # learns the per-feature mean and standard deviation
print(scaler.mean_)

[ 8.29634264  0.53381136  0.26367661  2.54643888  0.08666218 16.00529355
 46.96631376  0.99679377  3.31201155  0.65673725 10.38477703]


In [76]:
# Standardize the training features with the statistics learned by `scaler`.
X_train_transform = scaler.transform(X=X_train)

In [77]:
# Note: scaler.fit_transform(X_train) would fit and transform in a single call,
# giving the same result as the separate fit + transform steps above.

In [82]:
# Display the standardized training features (the large array is shown abbreviated).
X_train_transform

array([[ 1.09950645,  0.6234578 ,  0.49683544, ..., -0.91174157,
        -1.36147946, -0.36855433],
       [ 0.57968822, -0.80638013,  0.7031552 , ...,  0.17969126,
         0.53635549, -0.17698659],
       [ 1.73483984, -1.08674051,  0.96105489, ..., -2.38838598,
         0.01876414, -1.1348253 ],
       ...,
       [ 1.850355  , -1.98389373,  1.27053452, ..., -0.20552033,
         1.80157879, -0.27277046],
       [-0.17116034,  0.20291723, -0.17370376, ..., -0.33392419,
        -0.49882721, -1.03904143],
       [ 1.850355  , -0.63816391,  1.11579471, ..., -1.42535702,
        -0.72886781,  0.58928438]])

## SVC

In [83]:
from sklearn.svm import SVC

In [84]:
# Support-vector classifier created with no arguments, i.e. default hyperparameters.
model = SVC()

In [86]:
# Train the classifier on the standardized training features and their labels.
model.fit(X=X_train_transform, y=Y_train)

SVC()

#### Model is trained.

In [88]:
# Accuracy on the data the model was trained on (an optimistic estimate).
model.score(X=X_train_transform, y=Y_train)

0.6717998075072185

### Model training accuracy is 67 %

## Check accuracy on the test data

In [93]:
# Reuse the scaler fitted on the training split: transform only, never refit on test data.
X_test_transform = scaler.transform(X=X_test)

In [98]:
# Predict a quality class for every standardized test sample.
Y_predict = model.predict(X=X_test_transform)
Y_predict

array([6, 5, 5, 5, 5, 5, 6, 6, 6, 5, 5, 5, 5, 5, 6, 6, 5, 6, 6, 6, 5, 5,
       6, 6, 5, 5, 6, 6, 6, 7, 6, 5, 5, 6, 6, 5, 7, 6, 6, 6, 5, 5, 6, 6,
       5, 6, 5, 6, 6, 5, 6, 5, 5, 6, 6, 5, 5, 6, 6, 5, 6, 6, 5, 6, 5, 7,
       6, 5, 6, 5, 6, 5, 5, 6, 6, 6, 6, 5, 5, 6, 5, 6, 5, 6, 6, 6, 5, 5,
       5, 5, 7, 5, 6, 5, 5, 6, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 6, 5, 6, 6,
       5, 6, 6, 5, 6, 6, 6, 6, 5, 6, 6, 6, 7, 5, 6, 7, 5, 5, 5, 5, 6, 6,
       6, 5, 5, 5, 5, 6, 7, 6, 5, 6, 5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 6, 6,
       6, 6, 6, 5, 6, 6, 5, 6, 5, 5, 5, 5, 6, 5, 5, 5, 7, 6, 6, 5, 5, 5,
       6, 6, 6, 6, 6, 6, 6, 5, 6, 7, 6, 5, 6, 6, 5, 5, 6, 5, 6, 6, 5, 6,
       5, 6, 7, 6, 5, 5, 5, 5, 6, 5, 5, 6, 6, 6, 5, 6, 5, 7, 5, 5, 5, 7,
       6, 5, 5, 6, 5, 5, 5, 7, 5, 5, 5, 5, 6, 6, 5, 6, 6, 6, 5, 5, 5, 6,
       6, 5, 6, 5, 5, 5, 6, 5, 5, 5, 7, 5, 6, 6, 6, 5, 6, 5, 5, 6, 5, 7,
       5, 6, 6, 7, 5, 5, 5, 5, 6, 6, 6, 6, 5, 6, 5, 6, 6, 5, 6, 5, 6, 5,
       5, 6, 6, 5, 6, 5, 6, 5, 6, 5, 5, 6, 5, 6, 5,

In [95]:
### Compare the predictions above with the actual labels

In [96]:
# Display the true quality labels of the held-out test rows.
Y_test

547     6
1197    6
500     6
631     5
1128    5
       ..
1005    7
779     5
536     5
726     6
573     4
Name: quality, Length: 560, dtype: int64

In [97]:
from sklearn.metrics import accuracy_score

In [99]:
# Share of test samples whose predicted class matches the true class.
accuracy_score(y_true=Y_test, y_pred=Y_predict)

0.6303571428571428

#### We are getting about 63 % accuracy on the test data