In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib

In [12]:
# dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
# data = pd.read_csv(dataset_url)
data1 = pd.read_csv('train.csv')
print(data1.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name Sex_name  Sex   Age  \
0                            Braund, Mr. Owen Harris     male    0  22.0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...   female    1  38.0   
2                             Heikkinen, Miss. Laina   female    1  26.0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)   female    1  35.0   
4                           Allen, Mr. William Henry     male    0  35.0   

   SibSp  Parch            Ticket     Fare Cabin Embarked_name  Embarked  
0      1      0         A/5 21171   7.2500   NaN             S       3.0  
1      1      0          PC 17599  71.2833   C85             C       1.0  
2      0      0  STON/O2. 3101282   7.9250   NaN             S       3.0  
3      1      0           

In [13]:
# Pre-format DataFrame
data = data1.drop(['Name', 'Ticket', 'Sex_name', 'Embarked_name', 'Cabin'], axis=1)
print(data.head())

   PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0            1         0       3    0  22.0      1      0   7.2500       3.0
1            2         1       1    1  38.0      1      0  71.2833       1.0
2            3         1       3    1  26.0      0      0   7.9250       3.0
3            4         1       1    1  35.0      1      0  53.1000       3.0
4            5         0       3    0  35.0      0      0   8.0500       3.0


In [14]:
print(data.shape)
# 891 samples (rows) and 12 features (columns)

(891, 9)


In [15]:
print(data.describe())

       PassengerId    Survived      Pclass         Sex         Age  \
count   891.000000  891.000000  891.000000  891.000000  714.000000   
mean    446.000000    0.383838    2.308642    0.352413   29.699118   
std     257.353842    0.486592    0.836071    0.477990   14.526497   
min       1.000000    0.000000    1.000000    0.000000    0.420000   
25%     223.500000    0.000000    2.000000    0.000000   20.125000   
50%     446.000000    0.000000    3.000000    0.000000   28.000000   
75%     668.500000    1.000000    3.000000    1.000000   38.000000   
max     891.000000    1.000000    3.000000    1.000000   80.000000   

            SibSp       Parch        Fare    Embarked  
count  891.000000  891.000000  891.000000  889.000000  
mean     0.523008    0.381594   32.204208    2.535433  
std      1.102743    0.806057   49.693429    0.792088  
min      0.000000    0.000000    0.000000    1.000000  
25%      0.000000    0.000000    7.910400    2.000000  
50%      0.000000    0.000000   1

In [16]:
#Age and Embarked are missing values. Need to fill them in with their mean values
#this will replace all NaN values with the mean of the non null values

mean_value=data['Age'].mean()
data['Age']=data['Age'].fillna(mean_value)

mean_value=data['Embarked'].mean()
data['Embarked']=data['Embarked'].fillna(mean_value)

print(data.describe())

       PassengerId    Survived      Pclass         Sex         Age  \
count   891.000000  891.000000  891.000000  891.000000  891.000000   
mean    446.000000    0.383838    2.308642    0.352413   29.699118   
std     257.353842    0.486592    0.836071    0.477990   13.002015   
min       1.000000    0.000000    1.000000    0.000000    0.420000   
25%     223.500000    0.000000    2.000000    0.000000   22.000000   
50%     446.000000    0.000000    3.000000    0.000000   29.699118   
75%     668.500000    1.000000    3.000000    1.000000   35.000000   
max     891.000000    1.000000    3.000000    1.000000   80.000000   

            SibSp       Parch        Fare    Embarked  
count  891.000000  891.000000  891.000000  891.000000  
mean     0.523008    0.381594   32.204208    2.535433  
std      1.102743    0.806057   49.693429    0.791197  
min      0.000000    0.000000    0.000000    1.000000  
25%      0.000000    0.000000    7.910400    2.000000  
50%      0.000000    0.000000   1

In [23]:
y = data.Survived
X = data.drop('Survived', axis=1)
# Only 8 features (columns) remaining

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y)

In [53]:
scaler = preprocessing.StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
 
print(X_train_scaled.mean(axis=0))
# [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]

print(X_train_scaled.std(axis=0))
# [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]

[ 1.49692992e-17 -1.44703226e-16  3.74232480e-17  1.89611123e-16
  3.49283648e-17  4.98976640e-17 -3.49283648e-17  8.48260288e-17]
[1. 1. 1. 1. 1. 1. 1. 1.]


In [54]:
X_test_scaled = scaler.transform(X_test)
 
print(X_test_scaled.mean(axis=0))
# [ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
#  -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]
 
print(X_test_scaled.std(axis=0))
# [ 1.02160495  1.00135689  0.97456598  0.91099054  0.86716698  0.94193125
#  1.03673213  1.03145119  0.95734849  0.83829505  1.0286218 ]

[-0.07073273 -0.1549711  -0.01581775  0.1133593  -0.02248195  0.01449631
  0.07546479  0.06245481]
[0.977465   1.06416147 0.99503086 0.96507138 0.89306481 0.92643505
 0.90556969 0.93164716]


In [55]:
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

In [57]:
print(pipeline.get_params())

{'memory': None, 'steps': [('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False))], 'verbose': False, 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'randomforestregressor': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurit

In [58]:
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

In [59]:
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              ccp_alpha=0.0,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              max_samples=None,
                            

In [60]:
print(clf.best_params_)

{'randomforestregressor__max_depth': 3, 'randomforestregressor__max_features': 'auto'}


In [61]:
# Conveniently, GridSearchCV from sklearn will automatically refit the model with the best set of hyperparameters using the entire training set.
# This functionality is ON by default, but you can confirm it:

print(clf.refit)

True


In [62]:
# Predict a new set of data
y_pred = clf.predict(X_test)

In [63]:
print(r2_score(y_test, y_pred))
# between 0 and 1. 1 means perfect correlation
 
print(mean_squared_error(y_test, y_pred))
# close to 0 as possible, less error

0.4626694040860574
0.12728501679057533


In [64]:
# Save model to a .pkl filePython
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']

In [65]:
# Load model from .pkl filePython (load the model again, simply use this function:)
clf2 = joblib.load('rf_regressor.pkl')
 
# Predict data set using loaded model
clf2.predict(X_test)

array([0.10667013, 0.96684717, 0.10667013, 0.96684717, 0.64830675,
       0.28665136, 0.10667013, 0.10667013, 0.10667013, 0.94315691,
       0.64808482, 0.10667013, 0.96684717, 0.10667013, 0.39169015,
       0.96684717, 0.10667013, 0.52348282, 0.10667013, 0.10951982,
       0.11038006, 0.11038006, 0.35597591, 0.32730076, 0.93806526,
       0.22165377, 0.92203344, 0.11038006, 0.32104424, 0.96297983,
       0.33572531, 0.48190376, 0.48367496, 0.11311472, 0.10594286,
       0.10667013, 0.68316454, 0.47686053, 0.10951982, 0.5232472 ,
       0.34102002, 0.64487602, 0.96297983, 0.92382647, 0.91904105,
       0.35323216, 0.69492905, 0.63654225, 0.10866007, 0.11038006,
       0.3249175 , 0.1079328 , 0.23011092, 0.35657731, 0.10667013,
       0.11038006, 0.22566602, 0.96297983, 0.36140413, 0.11311472,
       0.10667013, 0.34367361, 0.10900506, 0.1178635 , 0.10951982,
       0.37691391, 0.30002398, 0.12448255, 0.92278922, 0.10667013,
       0.68635965, 0.47662491, 0.63906981, 0.95927601, 0.92665