# Scikit-learn Model tuning

In this tutorial we try to refine the decision tree model with parameter tuning.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# read data
train_dat = pd.read_csv('titanic/train.csv')
test_dat = pd.read_csv('titanic/test.csv')

# Stack both files so every preprocessing step below is applied to them
# identically; the rows are separated again later by whether `Survived` is missing.
full_dat = pd.concat([train_dat, test_dat], sort = False).reset_index(drop = True)


# drop columns---#
full_dat = full_dat.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis = 1)


# missing imputation---#
# Assign the filled column back instead of calling `fillna(inplace=True)` on
# `full_dat[col]`: that in-place call acts on a temporary Series, which pandas
# warns about and which no longer updates `full_dat` under Copy-on-Write.
full_dat['Age'] = full_dat['Age'].fillna(full_dat['Age'].median())
full_dat['Embarked'] = full_dat['Embarked'].fillna(full_dat['Embarked'].mode()[0])
full_dat['Fare'] = full_dat['Fare'].fillna(full_dat['Fare'].median())


#one-hot encoding---#
one_hot_dat = pd.get_dummies(full_dat, columns = ['Pclass','Sex','Embarked'])


#normalization---#
std_s = StandardScaler()

# Keep the target out of the scaled feature matrix.
survived_ = one_hot_dat['Survived']
one_hot_dat = one_hot_dat.drop('Survived', axis = 1)

# NOTE(review): the medians/mode above and the scaler statistics here are
# computed over the train and test rows together; fit them on the training
# rows only if information leakage matters for your use case.
normalize_dat = std_s.fit_transform(one_hot_dat)


#train test split---#
# Rows without a `Survived` value are treated as the unlabelled test set.
test_index = survived_.isna()

train_x = normalize_dat[~test_index]
test_x = normalize_dat[test_index]
train_y = survived_[~test_index]

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split reproducible.
t_x, v_x, t_y, v_y = train_test_split(train_x, train_y, test_size = 0.2, shuffle = True, random_state = 412)


## Grid Search with cross validation

We need to specify in advance the candidate hyperparameter values that the grid search will evaluate.

In [None]:
# Candidate tree depths to evaluate; each one is scored with 5-fold cross-validation.
para_dict = {'max_depth':[1,3,5,7,10]}

# Seed the tree so that tie-breaking between equally good splits, and therefore
# the selected depth, is reproducible from run to run (same seed as the split above).
model = DecisionTreeClassifier(random_state = 412)

dt_cv = GridSearchCV(model, para_dict, cv = 5)
# `fit` returns the fitted search object itself, so `result` and `dt_cv` are the same object.
result = dt_cv.fit(t_x, t_y)

In [None]:
# Parameter setting that achieved the best mean cross-validated score in the grid search.
dt_cv.best_params_

In [None]:
# `grid_scores_` was removed in scikit-learn 0.20; `cv_results_` is its replacement.
# Show the mean/std cross-validated score of every candidate, best-ranked first.
cv_table = pd.DataFrame(dt_cv.cv_results_)
cv_table[['param_max_depth', 'mean_test_score', 'std_test_score', 'rank_test_score']].sort_values('rank_test_score')

In [None]:
# Refit a tree with the parameters selected by the grid search rather than a
# hard-coded depth, so this cell stays in sync with whatever `dt_cv` found.
# The fixed seed makes the reported scores reproducible.
dt_model = DecisionTreeClassifier(random_state = 412, **dt_cv.best_params_)
dt_model.fit(t_x, t_y)

# A large gap between the two scores indicates over-fitting to the training split.
print('training score (decision tree): {:.3f}'.format(dt_model.score(t_x, t_y)))
print('validation score (decision tree): {:.3f}'.format(dt_model.score(v_x, v_y)))

In [None]:
from sklearn.metrics import confusion_matrix

# Predict the held-out validation rows, then tabulate true classes (rows)
# against predicted classes (columns).
v_pred = dt_model.predict(v_x)
confusion_matrix(v_y, v_pred)

---

## Supervised learning 3.0

After the example and practice, you should be able to
- select model parameters with grid search and cross-validation