In [22]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 

df_orig = pd.read_csv('./winequality-white.csv', sep=';')
print(df_orig.head(10))

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.0              0.27         0.36            20.7      0.045   
1            6.3              0.30         0.34             1.6      0.049   
2            8.1              0.28         0.40             6.9      0.050   
3            7.2              0.23         0.32             8.5      0.058   
4            7.2              0.23         0.32             8.5      0.058   
5            8.1              0.28         0.40             6.9      0.050   
6            6.2              0.32         0.16             7.0      0.045   
7            7.0              0.27         0.36            20.7      0.045   
8            6.3              0.30         0.34             1.6      0.049   
9            8.1              0.22         0.43             1.5      0.044   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 45.0                 170.0   1.0010  3.00       0

In [24]:
# standardization
scaler = StandardScaler()
df = pd.DataFrame(scaler.fit_transform(df_orig))

# split into explanatory and response variables. # chosen response variable: quality
X = df.iloc[:,0:11]
Y = df.iloc[:,11]

# split dataset (80/20)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# build and fit model
reg = linear_model.LinearRegression()
reg.fit(X,Y)

# output model
print("Coefficients:" ,reg.coef_)
print("Intercept:", reg.intercept_)

Coefficients: [ 0.06242977 -0.21204823  0.00301856  0.46665253 -0.00610011  0.07168122
 -0.01371181 -0.50752757  0.11702101  0.0813738   0.26884011]
Intercept: 1.7028861543364174e-14


In [25]:
# Regression equation (standardized dataset):
# y = 0.06242977 * x1 + (-0.21204823 * x2) + 0.00301856 * x3 + (0.46665253 * x4) + 
# (-0.00610011 * x5) + 0.07168122 * x6 + (-0.01371181 * x7) + (-0.50752757 * x8) +
# 0.11702101 * x9 + 0.0813738 * x10 + 0.26884011 * x11 + 1.7028861543364174e-14

# Regression equation (non-standardized dataset):
# y = 6.55199614e-02 * x1 + (-1.86317709e+00 * x2) + 2.20902007e-02 * x3 + (8.14828026e-02 * x4) + 
# (-2.47276537e-01 * x5) + 3.73276519e-03 * x6 + (-2.85747419e-04 * x7) + (-1.50284181e+02 * x8) +
# 6.86343742e-01 * x9 + 6.31476473e-01 * x10 + 1.93475697e-01 * x11 + 150.19284248121068

# Top five most useful variables:
# 1: density
# 2: residual sugar
# 3: alcohol
# 4: volatile acidity
# 5: pH

In [27]:
# use the model to compute predicted values for test set
Y_pred = reg.predict(X_test)

# compute error statistics
mse = mean_squared_error(Y_test, Y_pred)
r2s = r2_score(Y_test, Y_pred) 
print("MSE = ", mse)
print("R2s = ", r2s)

MSE =  0.8298207567223476
R2s =  0.2616765177332424
