In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib



In [5]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
df = pd.read_csv(url, sep=";")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [6]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [7]:
y = df.quality
x = df.drop('quality', axis=1)

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                   test_size=.2,
                                                   random_state=123,
                                                   stratify=y)

stratify - rozwarstwianie. This will ensure your training set looks similar to your test set, making your evaluation metrics more reliable. DAJE PRZEDZIAŁ

większość metod ML zakłada, że wartości beda w przedziale 0-1. Wiec aby to osiagnac, robi się standaryzację.
- fit the transformer on the training set (saving the means and standard deviations 
- apply the transformer to the training set (scales the data)
- apply the transformer to test set (using the same means and standard deviation)

*deviation* - odchylenie

In [24]:
scaler = preprocessing.StandardScaler().fit(x_train)

In [25]:
x_train_scaled = scaler.transform(x_train)

print(x_train_scaled.mean(axis=0))
print(x_train_scaled.std(axis=0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [26]:
x_test_scaled = scaler.transform(x_test)

print(x_test_scaled.mean(axis=0))
print(x_test_scaled.std(axis=0))

[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]
[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


w praktyce, kiedy ogarniamy cross-validation pipeline, nie musimy nawet recznie fitowac tranformer API. zamiast tego deklaruje się klasę obiektu:

In [27]:
pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators=100))

In [28]:
pipeline.get_params()

{'memory': None,
 'steps': [('standardscaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('randomforestregressor',
   RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                         max_depth=None, max_features='auto', max_leaf_nodes=None,
                         max_samples=None, min_impurity_decrease=0.0,
                         min_impurity_split=None, min_samples_leaf=1,
                         min_samples_split=2, min_weight_fraction_leaf=0.0,
                         n_estimators=100, n_jobs=None, oob_score=False,
                         random_state=None, verbose=0, warm_start=False))],
 'verbose': False,
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'randomforestregressor': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
 

In [29]:
hyperparams = {'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
              'randomforestregressor__max_depth': [None, 5, 3, 1]}

CV - crossvalidation. Trenowanie i evaluowanie modelu kilkukrotnie używjąc tej samej metody.
- splitowanie data w k rownych części (k=10)
- trenowanie modelu na k-1 'foldach' (do 9)
- evaluowanie ich na pozostalym 'hold-out'-cie (10tym)
- powtorz steps 2 i 3 k-razy, za każdym razem z innym foldem
- średnia ze wszystkich performacji w k foldach.

**CROSS VALIDATION PIPELINE**

The best practice when performing CV is to include your data preprocessing steps inside the cross-validation loop. This prevents accidentally tainting your training folds with influential data from your test fold.

Najlepszą praktyką podczas wykonywania CV jest uwzględnienie etapów wstępnego przetwarzania danych w cross-validation loop. Zapobiega to przypadkowemu skażeniu training folds wpływowymi danymi z test folds.



- split data into k equal folds (k=10)
- preprocess k-1 training folds
- train model on k-1 folds
- preprocess the hld-out (10th) fold using the same transformations from step 2
- evaluate ur model on the same hold-out fold.
- perform steps 2-5 k times, with different fold each time
- średnia z performacji

GridSearchCV essentially performs cross-validation across the entire "grid" (all possible permutations) of hyperparameters.

In [30]:
clf = GridSearchCV(pipeline, hyperparams, cv=10)

clf.fit(x_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              ccp_alpha=0.0,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              max_samples=None,
                            

In [31]:
clf.best_params_

{'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'sqrt'}

po tunowaniu hyperparametrów z użyciem CV, można ogarnąc mały performance improvement by refitting the model on the entire set.

dla naszej wygody, GridSearchCV automatycznie refituje model (tutaj pipeline) z najlepszymi wybranymi parametrami.

In [32]:
clf.refit

True

In [33]:
y_pred = clf.predict(x_test)

In [36]:
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))

0.4634813245353836
0.34620125


In [37]:
joblib.dump(clf, 'rf_regressor.pkl')
clf2 = joblib.load('rf_regressor.pkl')
clf2.predict(x_test)

array([6.47, 5.78, 4.96, 5.41, 6.19, 5.47, 4.99, 4.77, 5.01, 6.02, 5.28,
       5.74, 5.85, 5.11, 5.78, 5.65, 6.5 , 5.73, 5.79, 6.97, 5.4 , 5.65,
       5.1 , 6.01, 5.91, 4.99, 5.32, 5.21, 5.94, 5.9 , 5.82, 6.57, 6.  ,
       5.08, 5.03, 5.95, 5.04, 6.1 , 5.02, 6.05, 4.92, 5.96, 6.54, 5.11,
       6.22, 5.37, 5.49, 5.53, 5.06, 6.41, 6.11, 5.28, 5.73, 5.16, 5.58,
       5.92, 5.35, 5.42, 5.04, 5.26, 5.25, 5.16, 5.04, 5.85, 5.93, 5.21,
       6.46, 5.  , 5.15, 6.61, 5.66, 5.91, 5.08, 5.01, 5.25, 5.98, 5.42,
       5.08, 5.24, 5.24, 6.42, 5.56, 6.11, 6.35, 5.11, 6.04, 6.53, 6.4 ,
       5.93, 5.76, 5.86, 5.31, 6.29, 5.63, 5.75, 5.79, 6.74, 6.76, 5.53,
       6.77, 5.12, 5.5 , 5.11, 6.37, 5.03, 4.8 , 5.75, 4.96, 5.56, 5.98,
       5.92, 5.44, 5.97, 5.37, 5.16, 5.3 , 5.93, 5.02, 4.98, 5.94, 5.88,
       5.07, 5.73, 6.14, 5.24, 5.28, 5.38, 5.99, 5.49, 5.4 , 5.76, 6.2 ,
       5.16, 5.19, 5.1 , 6.4 , 5.  , 5.19, 6.72, 5.52, 5.14, 5.15, 5.54,
       6.05, 5.28, 5.49, 5.15, 6.57, 5.62, 5.15, 5.