In [1]:
import pandas as pd

In [2]:
wine = pd.read_csv("winequality-red.csv")

In [3]:
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [5]:
wine['alcohol'].value_counts()

9.500000     139
9.400000     103
9.800000      78
9.200000      72
10.000000     67
            ... 
9.950000       1
9.233333       1
9.250000       1
9.050000       1
10.750000      1
Name: alcohol, Length: 65, dtype: int64

In [6]:
wine['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [7]:
wine.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [8]:
# import matplotlib.pyplot as plt
# wine.hist(bins=50, figsize=(20, 15))

## Train Test Splitting

In [9]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(wine, test_size = 0.2, random_state = 42)
print(len(train_set))
print(len(test_set))

1279
320


In [10]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(wine, wine['quality']):
    strat_train_set = wine.loc[train_index]
    strat_test_set = wine.loc[test_index]

In [11]:
strat_test_set['quality'].value_counts()

5    136
6    128
7     40
4     11
8      3
3      2
Name: quality, dtype: int64

In [12]:
strat_train_set['quality'].value_counts()

5    545
6    510
7    159
4     42
8     15
3      8
Name: quality, dtype: int64

In [13]:
wine = strat_train_set.copy()

## Correlations

In [14]:
corr_matrix = wine.corr()
corr_matrix['quality'].sort_values(ascending=False)

quality                 1.000000
alcohol                 0.481197
sulphates               0.228050
citric acid             0.210802
fixed acidity           0.107940
residual sugar          0.003710
free sulfur dioxide    -0.048291
pH                     -0.052063
chlorides              -0.120231
density                -0.193009
total sulfur dioxide   -0.194511
volatile acidity       -0.383249
Name: quality, dtype: float64

In [15]:
# from pandas.plotting import scatter_matrix
# attributes = ["quality", "alcohol", "volatile acidity", "residual sugar"]
# scatter_matrix(wine[attributes], figsize = (12,8))

In [16]:
# wine.plot(kind="scatter", x="alcohol", y="quality", alpha=0.8)

In [17]:
wine = strat_train_set.drop("quality", axis=1)
wine_labels = strat_train_set["quality"].copy()

## Pipeline

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

In [19]:
wine_num_tr = my_pipeline.fit_transform(wine)

In [20]:
wine_num_tr.shape

(1279, 11)

## Selecting a model

In [21]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

In [22]:
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),('LR',LinearRegression())])))
pipelines.append(('ScaledLASSO', Pipeline([('Scaler', StandardScaler()),('LASSO', Lasso())])))
pipelines.append(('ScaledEN', Pipeline([('Scaler', StandardScaler()),('EN', ElasticNet())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsRegressor())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeRegressor())])))
pipelines.append(('ScaledGBM', Pipeline([('Scaler', StandardScaler()),('GBM', GradientBoostingRegressor())])))
pipelines.append(('ScaledRDF', Pipeline([('Scaler', StandardScaler()),('RDF', RandomForestRegressor())])))

results = []
names = []
for name, model in pipelines:
    kfold = KFold(n_splits=10)
    cv_results = cross_val_score(model, wine_num_tr, wine_labels, scoring="neg_mean_squared_error", cv=10)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

ScaledLR: -0.431639 (0.083564)
ScaledLASSO: -0.653789 (0.098001)
ScaledEN: -0.653789 (0.098001)
ScaledKNN: -0.475735 (0.071906)
ScaledCART: -0.665631 (0.147246)
ScaledGBM: -0.387066 (0.071844)
ScaledRDF: -0.350455 (0.076083)


In [23]:
model = RandomForestRegressor()
# model = GradientBoostingRegressor()
model.fit(wine_num_tr, wine_labels)

RandomForestRegressor()

## Evaluation

In [24]:
from sklearn.metrics import mean_squared_error
import numpy as np
wine_predictions = model.predict(wine_num_tr)
mse = mean_squared_error(wine_labels, wine_predictions)
rmse = np.sqrt(mse)

In [25]:
rmse

0.21440044956950519

## Cross Validation Evaluation

In [26]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, wine_num_tr, wine_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
rmse_scores

array([0.6157281 , 0.6380598 , 0.5392631 , 0.5715467 , 0.46315579,
       0.53493647, 0.52920858, 0.62436405, 0.67090948, 0.66922331])

In [27]:
def print_scores(scores):
    print("Scores:", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())

In [28]:
print_scores(rmse_scores)

Scores: [0.6157281  0.6380598  0.5392631  0.5715467  0.46315579 0.53493647
 0.52920858 0.62436405 0.67090948 0.66922331]
Mean:  0.5856395377712442
Standard deviation:  0.06520499253164623


In [29]:
from joblib import dump, load
dump(model, 'Wine.joblib') 

['Wine.joblib']

## Test On Data

In [33]:
X_test = strat_test_set.drop("quality", axis=1)
Y_test = strat_test_set["quality"].copy()
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
# print(final_predictions, list(Y_test))

[6.47 5.64 5.02 5.59 5.41 6.2  5.62 4.82 5.53 7.5  5.   5.5  5.16 5.04
 5.11 6.09 5.68 5.03 6.88 6.03 5.86 5.93 5.71 5.73 5.21 5.57 5.59 5.58
 6.21 5.02 5.37 4.99 6.22 5.91 7.43 5.48 5.19 6.25 5.24 5.89 5.25 5.47
 5.5  6.18 6.27 6.33 6.36 6.08 5.48 5.43 5.01 6.33 4.98 5.55 6.37 5.01
 4.91 5.46 5.16 5.59 5.91 5.91 5.64 6.24 6.24 5.51 6.03 5.38 5.21 6.03
 6.38 5.9  6.19 5.4  4.67 6.5  5.88 6.13 5.16 6.26 5.25 5.61 5.42 5.15
 5.   4.97 5.6  5.62 6.84 5.55 5.14 5.18 5.99 6.33 5.49 5.81 5.16 5.16
 6.1  4.7  7.01 6.64 6.51 6.54 5.81 6.22 6.18 6.89 6.66 6.1  5.05 5.04
 5.18 5.2  5.58 5.97 5.4  5.12 6.36 5.87 6.57 5.89 6.37 5.87 5.88 5.89
 6.63 5.03 5.06 6.58 5.24 5.36 4.48 5.66 6.49 6.22 5.   6.09 5.03 5.74
 5.12 5.53 5.28 5.05 5.62 6.34 5.32 6.04 5.21 5.64 6.57 5.03 5.04 5.71
 5.18 5.07 5.04 5.14 6.41 5.78 6.02 5.07 5.54 6.91 5.49 6.35 5.42 5.94
 5.46 5.93 6.35 5.12 5.42 6.41 7.02 5.28 6.08 5.08 5.67 6.99 5.66 5.24
 5.52 5.32 5.11 5.51 6.13 5.2  5.26 6.03 6.13 5.29 5.18 5.29 5.65 6.85
 5.91 

In [31]:
final_rmse

0.5668335844848997

## Using the model

In [32]:
from joblib import dump, load
import numpy as np
model = load('wine.joblib') 
features = np.array([[5.9, 0.645, 0.12, 2, 0.075, 32, 44, 0.99547, 3.57, 0.71, 10.2
]])
model.predict(features)

array([5.82])