In [9]:
import numpy as np
import pandas as pd
from IPython import display
from sklearn.model_selection import train_test_split
from sklearn import preprocessing  # scaling, transform, data wrangling
from sklearn.ensemble import RandomForestRegressor  # import model
# import cross-validation pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
# import evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score
# import model for saving sckit-learn models
from sklearn.externals import joblib  # alt to py pickle, joblib is more efficient with large numpy arrays

In [6]:
# load dataset
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url, sep=';')

In [11]:
display.display(data.head())
print('Shape:', data.shape)
display.display(data.describe())

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


Shape: (1599, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [12]:
# splitting
y = data.quality
X = data.drop(['quality'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)
# stratify will make sure your tarining data looks like your test data

In [14]:
# preprocessing
# defining scaler for train and test data based on train data
scaler = preprocessing.StandardScaler().fit(X_train)

# tranforming/standardizing train data
X_train_scaled = scaler.transform(X_train)

# transforming test data
X_test_scaled = scaler.transform(X_test)

# printing means and stds
print('train mean:\n', X_train_scaled.mean(axis=0))
print('train std:\n', X_train_scaled.std(axis=0))
print('test mean:\n', X_test_scaled.mean(axis=0))
print('test std:\n', X_test_scaled.std(axis=0))

train mean:
 [ 5.20823936e-17 -1.74996843e-16 -4.99990979e-17 -2.37495715e-16
  8.12485341e-17  3.33327319e-17  4.44436426e-17  4.31360273e-14
  7.49986469e-16 -3.88881873e-16  5.51378941e-16]
train std:
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
test mean:
 [-0.15354581  0.15037347 -0.13262085 -0.0425098  -0.02133579  0.00299966
 -0.00927774 -0.1267174   0.04464016 -0.05974993  0.04843223]
test std:
 [0.87868488 1.05048996 0.99931514 0.88848329 0.99309864 1.0022948
 1.02540273 0.94357895 0.99836723 0.9476855  1.02214932]


In [15]:
# in practice, we won't maually fit tranformer api, instead we setup cross-validation pipeline
pipeline = make_pipeline(preprocessing.StandardScaler(),
                        RandomForestRegressor(n_estimators=100))

In [16]:
# decalre hyperparameters to tune
# list tunable hyperparameters
print(pipeline.get_params())

{'memory': None, 'steps': [('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))], 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=Fals

In [17]:
hyperparameters = {'randomforestregressor__criterion': ['mse', 'mae'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1],
                  'randomforestregressor__max_features': ['auto', 'sqrt', 'log2']}

In [18]:
# cross-validation step, so easy in sckit-learn
clf = GridSearchCV(pipeline, hyperparameters, cv=10)

# fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decr...mators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'randomforestregressor__criterion': ['mse', 'mae'], 'randomforestregressor__max_depth': [None, 5, 3, 1], 'randomforestregressor__max_features': ['auto', 'sqrt', 'log2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [19]:
print(clf.best_params_)

{'randomforestregressor__criterion': 'mse', 'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'log2'}


In [21]:
# refit model
clf.refit

# pedict
y_pred = clf.predict(X_test)

# eval matrix
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))

0.4439121012167807
0.35882875000000003


In [24]:
# save the model
joblib.dump(clf, 'regressor_pkl')

# load data from .pkl file
clf2 = joblib.load('regressor_pkl')
clf2.predict(X_test)

array([5.11, 5.61, 5.18, 5.12, 5.88, 5.08, 5.39, 5.43, 6.73, 6.35, 5.72,
       5.64, 4.93, 5.9 , 5.68, 5.37, 5.15, 5.2 , 5.19, 5.94, 5.35, 5.08,
       6.08, 5.92, 5.18, 5.89, 5.87, 6.04, 5.27, 5.46, 5.57, 5.27, 5.44,
       5.74, 5.83, 6.65, 5.57, 6.43, 5.09, 6.43, 5.98, 5.68, 5.41, 6.33,
       6.29, 5.06, 5.16, 5.87, 5.46, 5.16, 5.4 , 4.99, 6.05, 6.45, 4.95,
       5.3 , 5.5 , 5.45, 5.12, 6.27, 6.2 , 5.4 , 5.07, 6.19, 5.23, 6.43,
       5.22, 4.99, 6.03, 5.4 , 6.09, 5.52, 5.58, 5.69, 5.35, 6.27, 5.66,
       5.26, 6.85, 5.32, 4.92, 5.25, 6.38, 5.12, 5.81, 6.98, 5.6 , 5.09,
       4.85, 5.36, 5.07, 5.7 , 5.08, 5.24, 5.66, 5.4 , 5.49, 5.29, 5.27,
       4.97, 5.04, 6.26, 5.99, 5.74, 5.69, 5.89, 5.53, 5.85, 5.11, 5.99,
       5.18, 5.04, 5.69, 5.42, 6.06, 5.6 , 5.52, 5.68, 5.88, 5.93, 6.72,
       5.56, 5.11, 6.03, 5.46, 5.44, 5.19, 5.25, 6.06, 5.01, 6.68, 5.1 ,
       6.47, 6.26, 5.59, 5.06, 6.49, 5.09, 5.42, 5.74, 6.42, 5.15, 5.24,
       5.14, 5.88, 5.86, 5.05, 5.91, 5.59, 6.13, 6.