In [2]:
#Import Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [3]:
#Load red wine data
dataset_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url)

In [4]:
#Read first rows
print( data.head() )

  fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
0   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                                     
1   7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5                                                                                                                     
2  7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...                                                                                                                     
3  11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...                                                                                                                     
4   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                  

In [5]:
#Separate data by colons
data = pd.read_csv(dataset_url, sep=';')
print( data.head() )

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [6]:
print( data.shape )

(1599, 12)


In [8]:
#List of all features
print( data.describe() )

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.000000         

In [14]:
#Here is the list of all the features:
#quality (target)
#fixed-acidity
#volatile-acidity
#citric-acid
#residual-sugar
#chlorides
#free sulfur dioxide
#total sulfur dioxide
##density
#pH
#sulphates
#alcohol

In [16]:
# Split data into training and test sets.
y = data.quality
X = data.drop('quality', axis=1)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y)

In [18]:
#Declare data preprocessing steps.
X_train_scaled = preprocessing.scale(X_train)
print( X_train_scaled )

[[ 0.51358886  2.19680282 -0.164433   ...  1.08415147 -0.69866131
  -0.58608178]
 [-1.73698885 -0.31792985 -0.82867679 ...  1.46964764  1.2491516
   2.97009781]
 [-0.35201795  0.46443143 -0.47100705 ... -0.13658641 -0.35492962
  -0.20843439]
 ...
 [-0.98679628  1.10708533 -0.93086814 ...  0.24890976 -0.98510439
   0.35803669]
 [-0.69826067  0.46443143 -1.28853787 ...  1.08415147 -0.35492962
  -0.68049363]
 [ 3.1104093  -0.62528606  2.08377675 ... -1.61432173  0.79084268
  -0.39725809]]


In [19]:
print( X_train_scaled.mean(axis=0) )
print( X_train_scaled.std(axis=0) )

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.77772766e-18 -6.38877362e-17 -4.16659149e-18 -1.20753377e-13
 -8.70817622e-16 -4.08325966e-16 -1.16664562e-15]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [20]:
#Now, the scaler object has the saved means and standard deviations for each feature in the training set.
scaler = preprocessing.StandardScaler().fit(X_train)

In [24]:
X_train_scaled = scaler.transform(X_train)

In [27]:
print(X_train_scaled.mean(axis=0))
print(X_train_scaled.std(axis=0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.77772766e-18 -6.38877362e-17 -4.16659149e-18 -1.20753377e-13
 -8.70817622e-16 -4.08325966e-16 -1.16664562e-15]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [30]:
#Note how we’re taking the scaler object and using it to transform the training set. Later, we can transform the test set using the exact same means and standard deviations used to transform the training set:
X_test_scaled = scaler.transform(X_test)
print (X_test_scaled.mean(axis=0))
print (X_test_scaled.std(axis=0))

[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]
[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


In [31]:
pipeline = make_pipeline(preprocessing.StandardScaler(),
                         RandomForestRegressor(n_estimators=100,
                                               random_state=123))

In [34]:
#This is exactly what it looks like: a modeling pipeline that first transforms the data using StandardScaler() and then fits a model using a random forest regressor. Again, the random_state= parameter can be any number you choose. It’s simply setting the seed so that you get consistent results each time you run the code.

In [35]:
#Declare hyperparameters to tune.
print( pipeline.get_params() )

{'memory': None, 'steps': [('standardscaler', StandardScaler()), ('randomforestregressor', RandomForestRegressor(random_state=123))], 'verbose': False, 'standardscaler': StandardScaler(), 'randomforestregressor': RandomForestRegressor(random_state=123), 'standardscaler__copy': True, 'standardscaler__with_mean': True, 'standardscaler__with_std': True, 'randomforestregressor__bootstrap': True, 'randomforestregressor__ccp_alpha': 0.0, 'randomforestregressor__criterion': 'mse', 'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__max_leaf_nodes': None, 'randomforestregressor__max_samples': None, 'randomforestregressor__min_impurity_decrease': 0.0, 'randomforestregressor__min_impurity_split': None, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__min_weight_fraction_leaf': 0.0, 'randomforestregressor__n_estimators': 100, 'randomforestregressor__n_jobs': None, 'rand

In [36]:
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

In [37]:
#Tune model using a cross-validation pipeline.
#Here’s how the CV pipeline looks after including preprocessing steps:

#Split your data into k equal parts, or “folds” (typically k=10).
#Preprocess k-1 training folds.
#Train your model on the same k-1 folds.
#Preprocess the hold-out fold using the same transformations from step (2).
#Evaluate your model on the same hold-out fold.
#Perform steps (2) – (5) k times, each time holding out a different fold.
#Aggregate the performance across all k folds. This is your performance metric.
#Fortunately, Scikit-Learn makes it stupidly simple to set this up:

In [38]:
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestregressor',
                                        RandomForestRegressor(random_state=123))]),
             param_grid={'randomforestregressor__max_depth': [None, 5, 3, 1],
                         'randomforestregressor__max_features': ['auto', 'sqrt',
                                                                 'log2']})

In [39]:
#Now, you can see the best set of parameters found using CV:
print( clf.best_params_ )

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt'}


In [40]:
#Refit on the entire training set
#Conveniently, GridSearchCV from sklearn will automatically refit the model with the best set of hyperparameters using the entire training set.
#This functionality is ON by default, but you can confirm it:

print( clf.refit )

True


In [41]:
#Evaluate model pipeline on test data.
#Here’s how to predict a new set of data:
y_pred = clf.predict(X_test)
#Now we can use the metrics we imported earlier to evaluate our model performance.
print( r2_score(y_test, y_pred) )
print( mean_squared_error(y_test, y_pred) )

0.4712595193413647
0.34118218749999996


In [42]:
#Save model for future use
#But before you go, let’s save your hard work so you can use the model in the future. It’s really easy to do so:
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']

In [43]:
#And that’s it. When you want to load the model again, simply use this function:
clf2 = joblib.load('rf_regressor.pkl')
 
# Predict data set using loaded model
clf2.predict(X_test)

array([6.44, 5.76, 4.99, 5.63, 6.27, 5.61, 4.89, 4.74, 5.03, 5.83, 5.26,
       5.68, 5.85, 5.11, 5.86, 5.6 , 6.52, 5.8 , 5.75, 6.96, 5.42, 5.68,
       5.1 , 6.02, 5.95, 5.03, 5.33, 5.22, 6.04, 5.9 , 5.89, 6.48, 6.01,
       5.03, 4.95, 5.95, 5.05, 6.1 , 5.15, 6.05, 4.92, 5.92, 6.71, 5.08,
       6.23, 5.33, 5.49, 5.53, 5.22, 6.4 , 6.05, 5.25, 5.81, 5.22, 5.6 ,
       5.64, 5.38, 5.38, 5.04, 5.12, 5.28, 5.24, 5.03, 5.8 , 6.01, 5.33,
       6.43, 5.03, 5.14, 6.64, 5.78, 5.84, 5.09, 5.06, 5.34, 6.01, 5.29,
       5.06, 5.21, 5.26, 6.33, 5.66, 6.11, 6.33, 5.09, 5.94, 6.55, 6.3 ,
       5.74, 5.71, 5.86, 5.37, 6.3 , 5.69, 5.64, 5.78, 6.68, 6.77, 5.53,
       6.78, 5.16, 5.34, 5.1 , 6.49, 5.07, 4.69, 5.61, 5.05, 5.66, 5.9 ,
       5.92, 5.47, 6.01, 5.28, 4.92, 5.19, 5.92, 5.13, 5.05, 6.01, 5.87,
       5.06, 5.79, 5.99, 5.26, 5.43, 5.28, 5.89, 5.57, 5.45, 5.71, 6.01,
       5.2 , 5.33, 5.08, 6.38, 5.  , 5.22, 6.71, 5.4 , 5.13, 5.11, 5.62,
       6.04, 5.34, 5.34, 5.07, 6.48, 5.71, 5.12, 5.