In [58]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [4]:
# Load the dataset; sep=';' because the fields in this file are separated by
# semicolons rather than commas.
df = pd.read_csv('data/winequality-white.csv', sep=';')

In [5]:
# Display the loaded frame to eyeball the column names and a sample of rows.
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [6]:
# Swap the spaces in the multi-word column names for underscores. Only the six
# names listed below are renamed; every other column keeps its name.
df = df.rename(columns={
    spaced_name: spaced_name.replace(' ', '_')
    for spaced_name in (
        'fixed acidity',
        'volatile acidity',
        'citric acid',
        'residual sugar',
        'free sulfur dioxide',
        'total sulfur dioxide',
    )
})

In [10]:
# Summarize the column names, non-null counts and dtypes of the renamed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         4898 non-null   float64
 1   volatile_acidity      4898 non-null   float64
 2   citric_acid           4898 non-null   float64
 3   residual_sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free_sulfur_dioxide   4898 non-null   float64
 6   total_sulfur_dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [17]:
# Split off the target (the last column) from the predictors, hold out 30% of the
# rows as a test set (random_state=17), and standardize the predictors.

X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=17
)

# The scaler is fitted on the training rows only; the statistics it learned there
# are then applied to both splits, so the holdout rows never influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# Baseline: plain linear regression trained on the standardized training split.
linreg = LinearRegression()
linreg.fit(X=X_train_scaled, y=y_train)

In [31]:
# Question 1: What are the mean squared errors of the model's predictions on the train and holdout sets?

linreg_train_pred = linreg.predict(X_train_scaled)
linreg_test_pred = linreg.predict(X_test_scaled)
print('train MSE:', mean_squared_error(y_train, linreg_train_pred))
print('test MSE:', mean_squared_error(y_test, linreg_test_pred))

train MSE: 0.5580606489803572
test MSE: 0.5842473102404544


In [38]:
# Rank the features by their influence on the target feature (wine quality).
# Question 2: Which feature does this linear regression model treat as the most influential on wine quality?

# The model was fitted on standardized inputs, so the coefficient magnitudes are
# on a common scale and can be compared with each other directly.
feature_influence = pd.DataFrame(
    {
        'coef_abs': np.absolute(linreg.coef_),
        'feature_name': X_train.columns,
    }
)
# Largest magnitude first: the top row is the most influential feature.
feature_influence.sort_values('coef_abs', ascending=False)

# Answer: density

Unnamed: 0,coef_abs,feature_name
7,0.66572,density
3,0.538164,residual_sugar
1,0.19226,volatile_acidity
8,0.150036,pH
10,0.129533,alcohol
0,0.097822,fixed_acidity
9,0.062053,sulphates
5,0.04218,free_sulfur_dioxide
6,0.014304,total_sulfur_dioxide
4,0.008127,chlorides


In [40]:
# Fit a LASSO model with alpha=0.01 (weak regularization) on the scaled training data, again with random_state=17.

lasso_reg = Lasso(random_state=17, alpha=0.01)
lasso_reg.fit(X=X_train_scaled, y=y_train)

In [42]:
# Which feature is the least informative in predicting wine quality, according to this LASSO model?

# Absolute coefficient per feature; the model was fitted on standardized inputs,
# so the magnitudes are comparable across features.
lasso_feature_importance = pd.DataFrame({'coef_abs': np.abs(lasso_reg.coef_), 'feature_name': X_train.columns})
# Descending order: the MOST informative feature is in the first row, so the
# answer to the question above has to be read from the bottom of the table.
lasso_feature_importance.sort_values(by='coef_abs', ascending=False)

# Answer: fixed_acidity and citric_acid -- the recorded output shows a coefficient
# of 0.0 for both, i.e. this LASSO model does not use them at all.
# (alcohol, the top row, is the most informative feature, not the least.)

Unnamed: 0,coef_abs,feature_name
10,0.322425,alcohol
3,0.256363,residual_sugar
7,0.235492,density
1,0.188479,volatile_acidity
8,0.067277,pH
5,0.043088,free_sulfur_dioxide
9,0.029722,sulphates
4,0.002747,chlorides
0,0.0,fixed_acidity
2,0.0,citric_acid


In [44]:
# Let LassoCV (random_state=17) choose the best value of alpha by 5-fold cross-validation.

# Candidate regularization strengths: 200 log-spaced values from 1e-6 to 1e2.
alphas = np.logspace(start=-6, stop=2, num=200)
lasso_cv_reg = LassoCV(cv=5, alphas=alphas, random_state=17)
lasso_cv_reg.fit(X=X_train_scaled, y=y_train)

In [45]:
# Regularization strength that LassoCV selected from the candidate alphas.
lasso_cv_reg.alpha_

0.0002833096101839324

In [74]:
# Question 3: Which feature is the least informative in predicting wine quality, according to the tuned LASSO model?

lassoCV_feature_importance = pd.DataFrame(
    {
        'coef_abs': np.absolute(lasso_cv_reg.coef_),
        'feature_name': X_train.columns,
    }
)
# Ascending order: the least informative feature ends up in the first row.
lassoCV_feature_importance.sort_values('coef_abs', ascending=True)

# Answer: citric_acid

Unnamed: 0,coef_abs,feature_name
2,0.0,citric_acid
4,0.006933,chlorides
6,0.012969,total_sulfur_dioxide
5,0.042698,free_sulfur_dioxide
9,0.060939,sulphates
0,0.093295,fixed_acidity
10,0.137115,alcohol
8,0.146549,pH
1,0.192049,volatile_acidity
3,0.526883,residual_sugar


In [49]:
# Question 4: What are the mean squared errors of the tuned LASSO predictions on the train and holdout sets?

lasso_cv_train_pred = lasso_cv_reg.predict(X_train_scaled)
lasso_cv_test_pred = lasso_cv_reg.predict(X_test_scaled)
print('train MSE:', mean_squared_error(y_train, lasso_cv_train_pred))
print('test MSE:', mean_squared_error(y_test, lasso_cv_test_pred))

train MSE: 0.558070014187378
test MSE: 0.5832976077860635


In [51]:
# Fit a Random Forest with its default parameters; only random_state is set (to 17).

rf_reg = RandomForestRegressor(random_state=17)
rf_reg.fit(X=X_train_scaled, y=y_train)

In [57]:
# Question 5: What are the mean squared errors of the RF model on the training set,
# in cross-validation (cross_val_score with scoring='neg_mean_squared_error' and all
# other arguments left at their default values), and on the holdout set?

print('train MSE:', mean_squared_error(y_train, rf_reg.predict(X_train_scaled)))

# The 'neg_mean_squared_error' scorer reports the MSE with its sign flipped, so
# the absolute value is taken before averaging over the folds.
rf_cv_neg_mse = cross_val_score(rf_reg, X_train_scaled, y_train, scoring='neg_mean_squared_error')
print('CV MSE:', np.abs(rf_cv_neg_mse).mean())

print('test MSE:', mean_squared_error(y_test, rf_reg.predict(X_test_scaled)))

train MSE: 0.05261155192532089
CV MSE: 0.4142003732204039
test MSE: 0.37163775510204083


In [69]:
# Tune the max_features and max_depth hyperparameters with GridSearchCV, then
# re-check the mean cross-validation MSE and the MSE on the holdout set.

# 6 max_features values x 15 max_depth values = 90 candidate combinations,
# each evaluated with 5-fold cross-validation.
parametrs = {
    'max_features': range(6, 12),
    'max_depth': range(10, 25),
}
search = GridSearchCV(
    estimator=RandomForestRegressor(n_jobs=-1, random_state=17),
    param_grid=parametrs,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
search.fit(X=X_train_scaled, y=y_train)

In [70]:
# Keep a handle on the estimator that the grid search selected as best.
best_reg = search.best_estimator_

In [71]:
# Question 6: What are the mean squared errors of the tuned RF model in cross-validation
# (cross_val_score with scoring='neg_mean_squared_error' and all other arguments left at
# their default values) and on the holdout set?

# The scorer reports negated MSE values; the absolute value turns them back into MSE.
best_cv_neg_mse = cross_val_score(best_reg, X_train_scaled, y_train, scoring='neg_mean_squared_error')
print('CV MSE:', np.abs(best_cv_neg_mse).mean())

print('test MSE:', mean_squared_error(y_test, best_reg.predict(X_test_scaled)))

CV MSE: 0.39773288191505934
test MSE: 0.36572455603132475


In [73]:
# Question 7: What is the most important feature, according to the Random Forest model?

# NOTE: the column is called 'coef_abs' like in the linear-model tables, but here
# it holds the tuned forest's feature_importances_, not regression coefficients.
rf_importance = pd.DataFrame(
    {
        'coef_abs': np.absolute(best_reg.feature_importances_),
        'feature_name': X_train.columns,
    }
)
# Largest importance first: the top row is the most important feature.
rf_importance.sort_values('coef_abs', ascending=False)

# Answer: alcohol (top row of the recorded output)

Unnamed: 0,coef_abs,feature_name
10,0.206056,alcohol
1,0.117578,volatile_acidity
5,0.111556,free_sulfur_dioxide
7,0.088549,density
8,0.073659,pH
6,0.07364,total_sulfur_dioxide
4,0.073366,chlorides
3,0.072072,residual_sugar
2,0.062601,citric_acid
0,0.061813,fixed_acidity
