# Scikit-learn packages

- sklearn (scikit-learn) is a Python package designed for machine learning
- Functionality includes data preprocessing, model building, and metric computation
- Accepted input data types: numpy.ndarray and pandas.DataFrame

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

This time we add preprocessing steps before building the models.

## Load Data

https://www.kaggle.com/c/titanic/data

In [None]:
# Read the Titanic train and test CSV files from the local 'titanic' folder.
train_dat = pd.read_csv('titanic/train.csv')
test_dat = pd.read_csv('titanic/test.csv')

# Report (rows, columns) for each table, train first.
for _frame in (train_dat, test_dat):
    print(_frame.shape)

In [None]:
# Preview the first five training rows.
train_dat.head(n=5)

In [None]:
# Show the dtype of every training column as the cell output.
train_dat.dtypes

In [None]:
# Stack the train and test rows (train first) under a fresh 0..n-1 index so
# the preprocessing steps are applied to both tables at once.
full_dat = pd.concat([train_dat, test_dat], sort=False, ignore_index=True)

# Drop the columns that are not used as model features.
unused_columns = ['Name', 'Ticket', 'Cabin', 'PassengerId']
full_dat = full_dat.drop(columns=unused_columns)

## Preprocess
- missing-value imputation
- one-hot / label encoding
- normalization

### missing-value imputation

In [None]:
# Count the missing values in each column.
full_dat.isnull().sum()

In [None]:
# Missing-value imputation: median for 'Age' and 'Fare', most frequent value
# (mode) for 'Embarked'.
#
# The filled column is assigned back rather than calling
# fillna(..., inplace=True) on a selected column. That chained in-place form
# raises a FutureWarning in pandas 2.x and does not modify full_dat at all
# once Copy-on-Write is enabled.
full_dat['Age'] = full_dat['Age'].fillna(full_dat['Age'].median())
full_dat['Embarked'] = full_dat['Embarked'].fillna(full_dat['Embarked'].mode()[0])
full_dat['Fare'] = full_dat['Fare'].fillna(full_dat['Fare'].median())

In [None]:
# Re-count the missing values per column after imputation.
full_dat.isnull().sum()

### one-hot encoding

In [None]:
# one-hot encoding

# One-hot encode the listed categorical columns with pandas; columns not
# listed are carried over as they are.
categorical_columns = ['Pclass', 'Sex', 'Embarked']
one_hot_dat = pd.get_dummies(full_dat, columns=categorical_columns)
one_hot_dat.head(n=5)

In [None]:
# Alternative route: one-hot encoding via sklearn's OneHotEncoder
# (not the recommended approach here).

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Work on a copy so full_dat keeps its original category values.
fdat_cp = full_dat.copy()

# Map each categorical column to integer codes, fitting a fresh encoder
# for every column.
for col in ('Pclass', 'Sex', 'Embarked'):
    encoder = LabelEncoder()
    fdat_cp[col] = encoder.fit_transform(fdat_cp[col])



In [None]:
# One-hot encode the label-encoded columns into a dense array.
# The `sparse` keyword was renamed to `sparse_output` and later removed, so
# passing `sparse=False` fails on recent scikit-learn releases. Taking the
# default sparse result and densifying it with .toarray() works on every version.
one_hot_columns = OneHotEncoder().fit_transform(fdat_cp[['Pclass','Sex','Embarked']]).toarray()

# Show the first five encoded rows.
one_hot_columns[:5,:]

In [None]:
# Label-encoded source columns, for comparison with one_hot_columns.
fdat_cp.loc[:, ['Pclass', 'Sex', 'Embarked']].head(n=5)

### normalization

In [None]:
# Standardize every feature column with sklearn's StandardScaler.
from sklearn.preprocessing import StandardScaler

# Take the target column out of the feature table (removes it in place) so
# that it is not scaled along with the features.
survived_ = one_hot_dat.pop('Survived')

std_s = StandardScaler()
normalize_dat = std_s.fit_transform(one_hot_dat)

In [None]:
# Inspect the type and shape of the scaled feature matrix.
print(type(normalize_dat), normalize_dat.shape, sep = '\n')

In [None]:
# Per-column sums; these should be close to zero if the scaler centred each column.
np.sum(normalize_dat, axis = 0)

## Train_test_split

In [None]:
from sklearn.model_selection import train_test_split

# Rows with no 'Survived' value are treated as the test rows.
test_index = survived_.isna()
train_mask = ~test_index

# Split the scaled features with the boolean masks; only the labelled rows
# keep a target.
train_x, test_x = normalize_dat[train_mask], normalize_dat[test_index]
train_y = survived_[train_mask]

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# shuffled split the same between runs.
t_x, v_x, t_y, v_y = train_test_split(
    train_x,
    train_y,
    test_size = 0.2,
    shuffle = True,
    random_state = 412,
)

# Report the shapes of the training and validation feature matrices.
for _part in (t_x, v_x):
    print(_part.shape)

## Build model

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so repeated runs give the same model and scores. The
# train/validation split is already seeded with the same value, but an
# unseeded tree can still differ between runs.
dt_model = DecisionTreeClassifier(random_state = 412)
dt_model.fit(t_x, t_y)

In [None]:
# Accuracy of the fitted tree on the training and validation splits.
# The labels now close the parenthesis that was left open in the printed text.
print('training score (decision tree) : {:.3f}'.format(dt_model.score(t_x, t_y)))
print('validation score (decision tree) : {:.3f}'.format(dt_model.score(v_x, v_y)))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the validation split: rows are the true classes,
# columns are the predicted classes.
v_pred = dt_model.predict(v_x)
confusion_matrix(y_true = v_y, y_pred = v_pred)

In [None]:
# Predict the unlabelled test rows and attach the result to the test table.
# The training labels became float when the train and test tables were
# concatenated (test rows have NaN in 'Survived'), so the raw predictions are
# 0.0/1.0. Cast them to int so the column holds plain 0/1 class labels.
test_prediction = dt_model.predict(test_x).astype(int)
test_dat['Survived'] = test_prediction

In [None]:
# Preview the first five test rows, now including the 'Survived' column.
test_dat.head(n=5)

In [None]:
# Write the test table, including the predicted 'Survived' column, to a CSV
# file without the row index.
# NOTE(review): this saves every column of test_dat; if the file is meant to
# be a Kaggle submission it presumably needs only PassengerId and Survived — confirm.
test_dat.to_csv('titanic_prediction.csv', index = False)

---

## Practice

Predict which passengers would survive using other machine learning models, then evaluate the results to see whether they are better than the decision tree's.

In [None]:
# your code starts from here

---

## Supervised learning 2.0

After the example and practice, you should be able to
- implement basic data preprocessing
- compare and evaluate the result of different models
