# Regression

## Load pandas, NumPy and scikit-learn

In [47]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler



## Loading the R magic along with the tidyverse

In [48]:
# Load the rpy2 IPython extension so the %R line magic used below is available.
%load_ext rpy2.ipython
# Evaluate R's built-in `R.version.string` in the R session.
%R R.version.string
# Attach the tidyverse in the R session.
%R library(tidyverse)

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


0,1,2,3,4,5,6
'forcats','stringr','dplyr',...,'datasets','methods','base'




## Training and testing sets

* Load the data
* Define the model matrices
* Split into training and testing

In [49]:
# Read the tidy housing table from disk.
data = pd.read_csv('data/housing_tidy.csv')

# Predictors: every column except the MEDV target.
x = data.drop(columns=['MEDV'])
# Target: kept as a one-column DataFrame (double brackets), not a Series.
y = data[['MEDV']]

# Hold out 20% of the rows for the final evaluation; the seed pins the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [50]:
# define model
def pipe(n_components=None):
    """Build an unfitted scale -> PCA -> linear-regression pipeline.

    Args:
        n_components: Forwarded unchanged to ``PCA(n_components=...)``.

    Returns:
        A ``Pipeline`` whose steps are named 'scaler', 'pca' and 'rgr'.
        Grid-search keys such as 'pca__n_components' address these names,
        so they must stay as they are.
    """
    stages = [
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=n_components)),
        ('rgr', LinearRegression()),
    ]
    return Pipeline(steps=stages)
model = pipe()

# define cross-validation
# Five shuffled folds with a fixed seed.
fold = KFold(n_splits=5, shuffle=True, random_state=42)

# define grid search
# Candidates: keep 1, 2, ..., up to as many components as x has columns.
grid = {'pca__n_components': range(1, x.shape[1] + 1)}
gs = GridSearchCV(
    model,
    grid,
    scoring='neg_mean_absolute_percentage_error',
    cv=fold,
    verbose=1,
)

# fit model
gs.fit(x_train, y_train)

# evaluate model
# The scorer is the negated MAPE, so the best score is the one closest to zero.
print('gs.best_score_:', gs.best_score_)
print('gs.best_params_:', gs.best_params_)

def metrics(y_test, y_pred):
    """Score predictions against the true targets with four regression metrics.

    Args:
        y_test: True target values.
        y_pred: Predicted target values for the same rows.

    Returns:
        dict with keys 'mse', 'mae', 'mape' and 'r2' (in that order), each
        holding the result of the matching scikit-learn metric function.
    """
    scorers = (
        ('mse', mean_squared_error),
        ('mae', mean_absolute_error),
        ('mape', mean_absolute_percentage_error),
        ('r2', r2_score),
    )
    return {label: scorer(y_test, y_pred) for label, scorer in scorers}


# Score the fitted grid search on the held-out test split.
y_pred = gs.predict(x_test)
print(metrics(y_test, y_pred))

Fitting 5 folds for each of 13 candidates, totalling 65 fits
gs.best_score_: -0.16994613305572956
gs.best_params_: {'pca__n_components': 12}
{'mse': 26.096055263393207, 'mae': 3.3237656233710107, 'mape': 0.17833638745204228, 'r2': 0.6441468837583112}


In [62]:
# Let's try random forests
forest = RandomForestRegressor(random_state=0)

grid = {
    'n_estimators': [100, 200, 500],
    # 1.0 (use all features) stands in for 'auto': for regressors 'auto' meant
    # exactly that, but it was deprecated in scikit-learn 1.1 and removed in
    # 1.3, where it makes every fit with that candidate fail.
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 4, 5, 6, 7, 8]
}
gs = GridSearchCV(
    forest,
    grid,
    scoring='neg_mean_absolute_percentage_error',
    cv=fold,  # same KFold splitter as the linear-model search
    verbose=1,
)

# fit model
# y_train is a single-column DataFrame; forests expect a 1-D target and emit a
# DataConversionWarning otherwise, so flatten it first.
gs.fit(x_train, np.ravel(y_train))

# evaluate model
# The scorer is the negated MAPE, so the best score is the one closest to zero.
print('gs.best_score_:', gs.best_score_)
print('gs.best_params_:', gs.best_params_)

# Reuse metrics() from the linear-regression cell rather than redefining an
# identical copy here.
y_pred = gs.predict(x_test)
print(metrics(y_test, y_pred))

Fitting 5 folds for each of 54 candidates, totalling 270 fits
gs.best_score_: -0.11537667814112144
gs.best_params_: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 500}
{'mse': 10.018216780000062, 'mae': 1.9781156862745144, 'mape': 0.1015224236135359, 'r2': 0.863388790973757}
