# Hands-on Machine Learning
- [10 Minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html)
- [scikit-learn Tutorials](https://scikit-learn.org/stable/tutorial/index.html)

## Import packages

In [None]:
#packages for data manipulation
import pandas as pd
import numpy as np

#packages for machine learning model
from sklearn import preprocessing, model_selection, tree, metrics

#packages for data visualization
import matplotlib.pyplot as plt
%matplotlib inline

## Step 0 : Load data

In [None]:
# read the training and testing data from CSV files in the working directory
train_dat = pd.read_csv(filepath_or_buffer='titanic_train.csv')
test_dat = pd.read_csv(filepath_or_buffer='titanic_test.csv')

In [None]:
# show the first 5 rows of the training data
train_dat.head(n=5)

In [None]:
# (number of rows, number of columns) of the training data
train_dat.shape

---

## Step 1 : Data exploration

In [None]:
# column names, non-null counts and dtypes -- a quick way to spot missing values
train_dat.info()

### Survived

In [None]:
# number of rows for each value of the target column
train_dat['Survived'].value_counts()

In [None]:
# same counts as above, drawn as a bar plot
train_dat['Survived'].value_counts().plot(kind='bar')

### --Exercise-- 
- visualize the 'Pclass' column of the Titanic data using a bar plot

### Fare

In [None]:
# distribution of ticket fares
train_dat['Fare'].hist()

### --Exercise--
- visualize the 'Age' column of the Titanic data using a histogram

### Survival rate vs. port of embarkation

In [None]:
# Survived is 0/1, so its mean per group is the survival rate of that group
train_dat.groupby('Embarked')['Survived'].mean()

In [None]:
# survival rate per port of embarkation, drawn as a bar plot
train_dat.groupby('Embarked')['Survived'].mean().plot(kind='bar')

### Survived vs. Age

In [None]:
# one box of Age values per value of Survived
train_dat.boxplot(column='Age', by='Survived')

### --Exercise--
- try to explore the relationships between other features and survival in the Titanic data.

### Summarize what we've found
- missing values: Age, Embarked, Cabin, Pclass
- outliers: Fare
- categorical variables: Embarked, Sex, Ticket, Name, PassengerId

---

## Step 2 : Preprocessing

### Outlier handling
- drop rows containing outliers
- drop columns with outliers
- clip values
- transform the distribution

In [None]:
# summary statistics of Fare (count, mean, std, min, quartiles, max)
train_dat['Fare'].describe()

In [None]:
# cap Fare at 200.0: every fare greater than 200.0 becomes 200.0,
# all other values (including missing ones) are left unchanged
fare_cap = 200.0
train_dat['Fare'] = train_dat['Fare'].clip(upper=fare_cap)

### Missing data handling
- drop rows containing missing data
- drop columns with missing data
- missing data imputation

In [None]:
# statistics used for imputation, computed on the training data;
# age_median and embarked_mode are reused later for the testing data
age_median = train_dat['Age'].median()
embarked_mode = train_dat['Embarked'].mode().iloc[0]
Pclass_mode = train_dat['Pclass'].mode().iloc[0]

# fill Age with its median, Embarked and Pclass with their most frequent value
train_dat = train_dat.fillna({
    'Age': age_median,
    'Embarked': embarked_mode,
    'Pclass': Pclass_mode,
})

# Cabin is dropped instead of imputed
train_dat = train_dat.drop(columns='Cabin')

### Categorical feature encoding
- label encoding
- one-hot encoding
- drop column

In [None]:
# label encoding: fixed integer codes for Sex and Embarked
sex_codes = {'female': 1, 'male': 2}
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}

train_dat['Sex'] = train_dat['Sex'].map(sex_codes)
train_dat['Embarked'] = train_dat['Embarked'].map(embarked_codes)

# Ticket, PassengerId and Name are dropped instead of encoded
train_dat = train_dat.drop(columns=['Ticket', 'PassengerId', 'Name'])

In [None]:
# first 5 rows of the training data after preprocessing
train_dat.head(n=5)

In [None]:
# the same preprocessing procedure should be done in testing data,
# reusing the statistics computed on the training data
# (age_median, embarked_mode, Pclass_mode) instead of recomputing them here

# outlier handling: cap Fare at 200.0, exactly as for the training data
test_dat.loc[test_dat.Fare>200.0,'Fare'] = 200.0

# missing data handling: impute Age, Embarked and Pclass, drop Cabin
test_dat['Age'] = test_dat['Age'].fillna(age_median)
test_dat['Embarked'] = test_dat['Embarked'].fillna(embarked_mode)
test_dat['Pclass'] = test_dat['Pclass'].fillna(Pclass_mode)
test_dat = test_dat.drop('Cabin', axis = 1)

# categorical feature encoding: same integer codes as the training data
test_dat['Sex'] = test_dat['Sex'].map({'female':1, 'male':2})
test_dat['Embarked'] = test_dat['Embarked'].map({'C':0,'Q':1,'S':2})
test_dat = test_dat.drop(['Ticket','PassengerId','Name'], axis = 1)

In [None]:
# first 5 rows of the testing data after preprocessing
test_dat.head(n=5)

In [None]:
# save the preprocessed data; index=False keeps the row index out of the files
train_dat.to_csv(path_or_buf='train_dat_processed.csv', index=False)
test_dat.to_csv(path_or_buf='test_dat_processed.csv', index=False)

### We are ready to move on to the next step ->>>

---

## Step 3 : Train-test split

In [None]:
# hold out 20% of the rows for validation;
# the fixed random_state makes the shuffled split reproducible
train_set, valid_set = model_selection.train_test_split(
    train_dat,
    test_size=0.2,
    shuffle=True,
    random_state=629,
)

# separate the target column (Survived) from the features
train_y = train_set['Survived']
train_x = train_set.drop(columns='Survived')

valid_y = valid_set['Survived']
valid_x = valid_set.drop(columns='Survived')

---

## Step 4 : Build model

In [None]:
# decision tree with default hyper-parameters.
# random_state is fixed (same seed as the train/validation split) because
# without it the fitted tree -- and the scores below -- can differ between runs
model = tree.DecisionTreeClassifier(random_state = 629)

model.fit(train_x, train_y)

---

## Step 5 : Model Evaluation

In [None]:
# evaluate the model on the data it was fitted on
train_prediction = model.predict(train_x)

# accuracy and confusion matrix of the predictions against the true labels
acc = metrics.accuracy_score(train_y, train_prediction)
cm = metrics.confusion_matrix(train_y, train_prediction)

print(' --training set--\r\n')
print(cm)
print('accuracy score : {}'.format(acc))

In [None]:
# evaluate the model on the held-out validation data
valid_prediction = model.predict(valid_x)

# accuracy and confusion matrix of the predictions against the true labels
acc = metrics.accuracy_score(valid_y, valid_prediction)
cm = metrics.confusion_matrix(valid_y, valid_prediction)

print(' --validation set--\r\n')
print(cm)
print('accuracy score : {}'.format(acc))

---

## Step 6: Find a better model

In [None]:
## refine model

# a node is split only if it holds at least 6 samples.
# random_state is fixed (same seed as the train/validation split) so that the
# scores below are reproducible and can be compared with the previous model
model = tree.DecisionTreeClassifier(min_samples_split = 6, random_state = 629)

model.fit(train_x, train_y)

# performance on the training set
train_prediction = model.predict(train_x)

cm = metrics.confusion_matrix(y_true = train_y, y_pred = train_prediction)
acc = metrics.accuracy_score(y_true = train_y, y_pred = train_prediction)

print(' --training set--\r\n')
print(cm)
print('accuracy score : {}'.format(acc))

# performance on the validation set
valid_prediction = model.predict(valid_x)

cm = metrics.confusion_matrix(y_true = valid_y, y_pred = valid_prediction)
acc = metrics.accuracy_score(y_true = valid_y, y_pred = valid_prediction)

print(' --validation set--\r\n')
print(cm)
print('accuracy score : {}'.format(acc))

In [None]:
from sklearn import ensemble

# random forest of 100 trees; a node is split only if it holds at least 5 samples.
# random_state fixes the bootstrap sampling and the per-split feature sampling,
# so repeated runs build the same forest and give the same scores
model = ensemble.RandomForestClassifier(n_estimators = 100, min_samples_split=5, random_state = 629)

model.fit(train_x, train_y)



In [None]:

# performance of the current model on the training set
train_prediction = model.predict(train_x)

# accuracy and confusion matrix of the predictions against the true labels
acc = metrics.accuracy_score(train_y, train_prediction)
cm = metrics.confusion_matrix(train_y, train_prediction)

print(' --training set--\r\n')
print(cm)
print('accuracy score : {}'.format(acc))

In [None]:

# performance of the current model on the validation set
valid_prediction = model.predict(valid_x)

# accuracy and confusion matrix of the predictions against the true labels
acc = metrics.accuracy_score(valid_y, valid_prediction)
cm = metrics.confusion_matrix(valid_y, valid_prediction)

print(' --validation set--\r\n')
print(cm)
print('accuracy score : {}'.format(acc))

---

## Something that you can try after the class
- explore data thoroughly
- create more features based on existing features
- build different kinds of ML models