# Lab 5

*  	Write code to load a dataset and split it into X_train, y_train, X_test, y_test
*  	Write a pipeline to preprocess your features. Apply it to your train and test set.
*  	Write code to find good hyperparameters for a given model.
*  	Write code to evaluate your model.



In [1]:
import pandas as pd
import numpy as np # start Python file with imports

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import metrics

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from imblearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier as DT

import matplotlib.pyplot as plt
%matplotlib inline

## Load Data

In [2]:
data = pd.read_csv('housing.csv', delimiter = ',')

In [3]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Train Test Split

In [4]:
x = data.drop('median_house_value', axis = 1)
y = data['median_house_value']

In [5]:
x_train, x_test, y_train, y_test = train_test_split (x, y)

## Preprocessing Pipeline

In [6]:
#select columns' indices that are categorical
cat_cols = x_train.select_dtypes(include='object').columns
indices = []
for col in cat_cols:
    indices.append(x_train.columns.get_loc(col))
indices

[8]

In [7]:
#function to encode categorical features to numeric
transformer = ColumnTransformer(transformers=[('categorical', 
                                               OneHotEncoder(handle_unknown = 'ignore'), indices)])

In [8]:
pipeline = make_pipeline(transformer, StandardScaler(with_mean = False), 
                         DT(random_state=2021))


pipeline.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  [8])])),
                ('standardscaler', StandardScaler(with_mean=False)),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(random_state=2021))])

## GridSearch for Best Parameters

In [9]:
grid = [{'decisiontreeclassifier__criterion': ['gini', 'entropy'],
         'decisiontreeclassifier__max_depth': [None, 2, 6, 10], 
         'decisiontreeclassifier__min_samples_split': [2, 3, 5, 10]}]

gridsearch = GridSearchCV(estimator=pipeline, 
                          param_grid=grid, 
                          scoring='accuracy', 
                          cv=5)

gridsearch.fit(x_train, y_train)

gridsearch.score(x_test, y_test)



0.053488372093023255

In [10]:
print('Best Criterion:', gridsearch.best_estimator_.get_params()['decisiontreeclassifier__criterion'])
print('Best Max Depth:', gridsearch.best_estimator_.get_params()['decisiontreeclassifier__max_depth'])
print('Best Min Samples Split:', gridsearch.best_estimator_.get_params()['decisiontreeclassifier__min_samples_split'])


Best Criterion: gini
Best Max Depth: None
Best Min Samples Split: 2


### The default values of parameters came out to be best. 

## Evaluation of the model

In [11]:
train_preds = pipeline.predict(x_train)
test_preds = pipeline.predict(x_test)

In [12]:
print("MSE: {}".format(np.round(mean_squared_error(y_train, train_preds),2)))
print("MAE: {}".format(np.round(mean_absolute_error(y_train, train_preds),2)))
print("R-Squared: {}".format(np.round(r2_score(y_train, train_preds),2)))

MSE: 54374272576.42
MAE: 188687.2
R-Squared: -3.11


### Bad Performance of the Model