In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.stats import norm, skew
import random

In [4]:
# Read the wine-quality CSV from the Kaggle input directory and preview the first rows.
data_path = "../input/wine-quality-dataset/WineQT.csv"
wineQT = pd.read_csv(data_path)
wineQT.head()

# Data Cleaning, Outlier Handling

In [5]:
# Summary statistics for every numeric column of the raw frame.
wineQT.describe()

In [6]:
# Remove the 'Id' column (presumably a row identifier with no predictive value).
# drop(..., errors='ignore') keeps this cell re-runnable: `del wineQT['Id']`
# raises KeyError the second time the cell is executed.
wineQT = wineQT.drop(columns=['Id'], errors='ignore')
wineQT.head()

In [7]:

# Number of missing values in each column.
wineQT.isnull().sum()

In [8]:
from scipy import stats

# Absolute z-score of every value, computed per column.
z = np.abs(stats.zscore(wineQT))

# Rows with any value at or beyond this many standard deviations are dropped.
threshold = 3

# Keep rows whose every column has |z| < threshold (the variable is now actually
# used instead of a second hard-coded 3). .copy() detaches the result from the
# original frame so later column assignments do not raise SettingWithCopyWarning.
wineQT = wineQT[(z < threshold).all(axis=1)].copy()
wineQT.shape

# EDA, Data Visualization

In [9]:
def randomcolor():
    """Return a random colour as a list ``[r, g, b]`` of floats in [0, 1).

    The three components are drawn in the order red, blue, green, so the
    second draw lands in the last slot of the returned list.
    """
    red, blue, green = (random.random() for _ in range(3))
    return [red, green, blue]
# Every column name except the last one, as a plain Python list.
wineQT.columns[:-1].tolist()

In [10]:
# Bar chart of how many wines fall into each quality score.
target_ax = sns.countplot(x=wineQT.quality)
target_ax.set_title('Target Distribution', size=15)
plt.show()

## Inference-
### Most of the wines are rated as either a 5 or a 6. 

In [11]:
# One line plot per feature: mean feature value (with confidence band) at each quality score.
# Iterate over every column except the target by name. The previous `columns[:-2]`
# slice also skipped the second-to-last column (presumably 'alcohol' -- confirm
# against the frame's column order), so that feature was never plotted.
for col in wineQT.columns.drop('quality'):
    plt.figure(figsize=(15,7))
    sns.lineplot(data=wineQT, x="quality",y=col,color=randomcolor())
    plt.show()

## Inferences-
### 1. clearly increases with increase in quality- fixed acidity, citric acid
### 2. clearly decreases with increase in quality- none
### 3. mostly increases - sulphates
### 4. mostly decreases- density, pH, total SO2, free SO2, chlorides, volatile acidity
### 5. mixed/no trend- residual sugar

In [12]:
# 6x2 grid of regression plots: quality on the x-axis, one column per panel.
# Panels are filled row by row in the order listed below.
fig, ax = plt.subplots(6, 2, figsize=(15, 30))
reg_columns = [
    'quality', 'volatile acidity',
    'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide',
    'total sulfur dioxide', 'density',
    'pH', 'sulphates',
    'alcohol', 'fixed acidity',
]
for panel, y_col in zip(ax.ravel(), reg_columns):
    sns.regplot(x=wineQT.quality, y=wineQT[y_col], ax=panel, color=randomcolor())

plt.show()

## Inferences-
### 1. negative correlation between volatile acidity & quality

### 2. positive correlation between citric acid & quality

### 3. slight positive correlation between residual sugar & quality

### 4. slight negative correlation between chlorides & quality

### 5.  slight negative correlation between free sulfur dioxide & quality

### 6.  negative correlation between total sulfur dioxide & quality

### 7.  slight negative correlation between density & quality

### 8.  slight negative correlation between pH & quality

### 9.  positive correlation between sulphates & quality

### 10. positive correlation between alcohol & quality

### 11.  slight  positive correlation between fixed acidity & quality

## Summary

### positive- citric acid, sulphates, alcohol
### slightly positive- residual sugar, fixed acidity
### slightly negative - chlorides, free sulfur dioxide, density, pH
### negative - volatile acidity, total sulfur dioxide

In [13]:
# 6x2 grid: target count plot in the first panel, then one box plot per feature
# (feature distribution at each quality score), filled row by row.
fig, ax = plt.subplots(6, 2, figsize=(15, 30))
panels = ax.ravel()
sns.countplot(x=wineQT.quality, ax=panels[0]).set_title('Target Distribution', size=15)
box_columns = [
    'volatile acidity', 'citric acid', 'residual sugar', 'chlorides',
    'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH',
    'sulphates', 'alcohol', 'fixed acidity',
]
for panel, feature_name in zip(panels[1:], box_columns):
    sns.boxplot(x=wineQT.quality, y=wineQT[feature_name], ax=panel)
plt.show()

## Skewness

In [14]:
# 6x2 grid of distribution plots with a fitted normal curve; each title reports
# the sample skewness of the column.
# The original cell plotted 'alcohol' twice (panels [5][0] and [5][1]); the
# duplicated last panel now shows 'quality' so all twelve panels are distinct.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept
# here because the replacement functions have no direct `fit=` equivalent.
fig, ax = plt.subplots(6, 2, figsize=(15, 30))
dist_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol', 'quality',
]
for panel, dist_col in zip(ax.ravel(), dist_columns):
    skewness = str(skew(wineQT[dist_col]))
    sns.distplot(wineQT[dist_col], fit=norm, color=randomcolor(), ax=panel).set_title(
        "Skewness of " + dist_col + ' = ' + skewness)
plt.show()


## Inferences
### 1. 3 Columns - total SO2, chlorides, residual sugar are highly skewed
### 2. rest all are moderately skewed. 

### applying log transformation to highly skewed columns 

In [15]:
# Columns judged highly skewed from the distribution plots; log-transformed in the next cell.
highly_skewed = ['total sulfur dioxide','chlorides','residual sugar']

In [16]:
# Replace each highly skewed column with its natural logarithm.
# wineQT was produced by boolean row filtering, so assigning columns into it
# directly can raise SettingWithCopyWarning; take an explicit copy first and
# transform all columns in one vectorised step.
# NOTE: not idempotent -- re-running this cell applies the log a second time.
wineQT = wineQT.copy()
wineQT[highly_skewed] = np.log(wineQT[highly_skewed])
wineQT.head()

In [17]:
# Annotated heatmap of the pairwise correlation matrix, colour scale fixed to [-1, 1].
corr_matrix = wineQT.corr()
heatmap = sns.heatmap(corr_matrix, annot=True, vmin=-1, vmax=1)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12)
# Double the default figure size so the annotations stay legible.
fig = heatmap.get_figure()
figsize = fig.get_size_inches()
fig.set_size_inches(2 * figsize)
plt.show()

## Variance Thresholding

In [18]:
from sklearn.feature_selection import VarianceThreshold

In [19]:
# Bar chart of per-column variance with a horizontal line at the candidate threshold (10).
column_variances = wineQT.var(axis=0)
ax = column_variances.plot(kind='bar', label='Variance')
ax.hlines(y=10, xmin=0, xmax=13, label='Threshold')
plt.legend()

In [20]:
# Fit a variance filter on the whole frame, keep the reduced matrix, and
# display the names of the columns that pass the threshold.
vt = VarianceThreshold(threshold=10)
vt.fit(wineQT)
x_reduced = vt.transform(wineQT)
kept_mask = vt.get_support()
wineQT.columns[kept_mask]

In [21]:
# Display the matrix of columns that passed the variance threshold.
x_reduced

In [22]:
# Per-column variance, for comparison against the threshold used above.
wineQT.var(0)

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Feature matrix (everything but the target) and target vector.
X = wineQT.drop("quality",axis=1)
Y = wineQT.quality

# For each variance threshold: filter the features, fit a linear regression on
# the surviving columns, and record the in-sample R2 and the number of columns kept.
thresholds = [0.01, 0.1, 10]
r2, n_cols = [], []
for t in thresholds:
    vt = VarianceThreshold(threshold=t)
    x_red = vt.fit_transform(X)
    n_cols.append(x_red.shape[1])
    lr = LinearRegression()
    lr.fit(x_red, Y)
    p = lr.predict(x_red)
    r2.append(r2_score(Y, p))

# R2 versus threshold on a logarithmic x-axis.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(thresholds, r2, 'ro--', label='R2')
ax.grid()
ax.set_xscale('log')
ax.set_xlabel('Threshold')
_ = ax.legend()

## Test-Train Split, Feature Selection

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# 70/30 split, stratified on the target so each quality score keeps its
# proportion in both parts; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
    stratify=Y,
)

In [25]:
# Standardization for X variables 
sc= StandardScaler()
sc.fit(X_train)
# Fitting Logistic model 
logit_model= LogisticRegression(solver='liblinear')
logit_model.fit(sc.transform(X_train), Y_train)
# Rank features by importance
feature= pd.DataFrame()
feature['column']= X_train.columns
feature['importance']= logit_model.coef_[0]
feature.sort_values('importance', ascending=False, inplace=True)
feature

## Model Building 

In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_absolute_error ,mean_squared_error, median_absolute_error, explained_variance_score
 

In [27]:
# Feature matrix: every column except the target. Target vector: the quality score.
Y = wineQT['quality']
X = wineQT.drop(columns="quality")

### Linear Regression 

In [28]:
# Treat quality as a continuous target: fit ordinary least squares on the
# training split and report regression metrics on the held-out split.
Lr = LinearRegression()
mod_train = Lr.fit(X_train, Y_train)
mod_pred = Lr.predict(X_test)

mse = mean_squared_error(Y_test, mod_pred)
mae = mean_absolute_error(Y_test, mod_pred)
medae = median_absolute_error(Y_test, mod_pred)
explained = explained_variance_score(Y_test, mod_pred)
print("Mean Squared Error: \n{}".format(mse))
print("Mean Absolute Error: \n{}".format(mae))
print("Median Absolute Error: \n{}".format(medae))
print("Explained Variance: \n{}".format(explained))

# R2 on the training and test splits.
train_r2 = Lr.score(X_train,Y_train)
test_r2 = Lr.score(X_test,Y_test)
print("Score the X-train with Y-train is : ", train_r2)
print("Score the X-test  with Y-test  is : ", test_r2)

### Classification Models (Logistic Regression, Decision Tree, Random Forest, SVC)

In [29]:
# Multiclass logistic regression on the quality score.
# The original fitted on raw, unscaled features with the default iteration
# budget, which typically stops before the solver converges, while the
# StandardScaler `sc` fitted on X_train earlier in the notebook was never
# applied. Use the standardised features and allow more iterations.
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

Logit=LogisticRegression(max_iter=1000)

Logit.fit(X_train_scaled,Y_train)

# Mean accuracy on each split.
print("Score the X-train with Y-train is : ", Logit.score(X_train_scaled,Y_train))
print("Score the X-test  with Y-test  is : ", Logit.score(X_test_scaled,Y_test))

Y_pred=Logit.predict(X_test_scaled)

# Error metrics treat the predicted class label as a number (distance in quality points).
print( " Mean absolute error is ", mean_absolute_error(Y_test,Y_pred))
print(" Mean squared  error is " , mean_squared_error(Y_test,Y_pred))
print(" Median absolute error is " ,median_absolute_error(Y_test,Y_pred)) 
print("Accuracy score " , accuracy_score(Y_test,Y_pred))


In [30]:
# Baseline decision tree with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so the reported
# scores are reproducible across kernel restarts; an unseeded tree can differ
# between runs.
DT=DecisionTreeClassifier(random_state=42)

DT.fit(X_train,Y_train)

# Mean accuracy on each split.
print("Score the X-train with Y-train is : ", DT.score(X_train,Y_train))
print("Score the X-test  with Y-test  is : ", DT.score(X_test,Y_test))

Y_pred=DT.predict(X_test)

# Error metrics treat the predicted class label as a number (distance in quality points).
print( " Mean absolute error is ", mean_absolute_error(Y_test,Y_pred))
print(" Mean squared  error is " , mean_squared_error(Y_test,Y_pred))
print(" Median absolute error is " ,median_absolute_error(Y_test,Y_pred)) 
print("Accuracy score " , accuracy_score(Y_test,Y_pred))


A perfect training score indicates overfitting, so the decision tree's hyperparameters are tuned below to reduce it.

**HYPERPARAMETER TUNING**

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold cross-validated search over decision-tree hyper-parameters.
# The estimator is seeded: the grid includes splitter="random", so an unseeded
# tree would make the selected best_params_ change from run to run.
dtc = DecisionTreeClassifier(random_state=42)
# Grid size: 2 * 2 * 48 * 14 * 18 = 48,384 candidates (241,920 fits at cv=5),
# so this cell is slow even with n_jobs=-1.
grid_param={"criterion":["gini","entropy"],
             "splitter":["best","random"],
             "max_depth":range(2,50,1),
             "min_samples_leaf":range(1,15,1),
             "min_samples_split":range(2,20,1) 
            }
grid_search=GridSearchCV(estimator=dtc,param_grid=grid_param,cv=5,n_jobs=-1)
grid_search.fit(X_train,Y_train)

In [None]:
# Show the hyper-parameter combination selected by the grid search.
print(grid_search.best_params_)

In [None]:
        
# Decision tree refitted with fixed hyper-parameters (presumably copied from a
# previous grid-search run -- confirm against grid_search.best_params_).
# splitter='random' draws split thresholds at random, so random_state is fixed
# to make the reported scores reproducible.
DT=DecisionTreeClassifier(criterion = 'gini',max_depth=36,min_samples_leaf = 4, min_samples_split = 12, splitter='random', random_state=42)

DT.fit(X_train,Y_train)

# Mean accuracy on each split.
print("Score the X-train with Y-train is : ", DT.score(X_train,Y_train))
print("Score the X-test  with Y-test  is : ", DT.score(X_test,Y_test))

Y_pred=DT.predict(X_test)

# Error metrics treat the predicted class label as a number (distance in quality points).
print( " Mean absolute error is ", mean_absolute_error(Y_test,Y_pred))
print(" Mean squared  error is " , mean_squared_error(Y_test,Y_pred))
print(" Median absolute error is " ,median_absolute_error(Y_test,Y_pred)) 
print("Accuracy score " , accuracy_score(Y_test,Y_pred))

In [None]:
# Random forest with default hyper-parameters.
# Bootstrapping and feature sub-sampling are random, so random_state is fixed
# (same seed as the train/test split) to make the scores reproducible.
RF=RandomForestClassifier(random_state=42)

RF.fit(X_train,Y_train)

# Mean accuracy on each split.
print("Score the X-train with Y-train is : ", RF.score(X_train,Y_train))
print("Score the X-test  with Y-test  is : ", RF.score(X_test,Y_test))

Y_pred=RF.predict(X_test)

# Error metrics treat the predicted class label as a number (distance in quality points).
print( " Mean absolute error is ", mean_absolute_error(Y_test,Y_pred))
print(" Mean squared  error is " , mean_squared_error(Y_test,Y_pred))
print(" Median absolute error is " ,median_absolute_error(Y_test,Y_pred)) 
print("Accuracy score " , accuracy_score(Y_test,Y_pred))

In [None]:
# Support-vector classifier with the default kernel.
# Kernel SVMs are sensitive to feature scale; the original fitted on raw
# features while the StandardScaler `sc` fitted on X_train earlier in the
# notebook was never applied. Use the standardised features for fit and scoring.
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

svc=SVC()

svc.fit(X_train_scaled,Y_train)

# Mean accuracy on each split.
print("Score the X-train with Y-train is : ", svc.score(X_train_scaled,Y_train))
print("Score the X-test  with Y-test  is : ", svc.score(X_test_scaled,Y_test))

Y_pred=svc.predict(X_test_scaled)

# Error metrics treat the predicted class label as a number (distance in quality points).
print( " Mean absolute error is ", mean_absolute_error(Y_test,Y_pred))
print(" Mean squared  error is " , mean_squared_error(Y_test,Y_pred))
print(" Median absolute error is " ,median_absolute_error(Y_test,Y_pred)) 
print("Accuracy score " , accuracy_score(Y_test,Y_pred))