# ใครรอดชีวิตจากเหตุการณ์เรือไททานิค?

**Credit:** http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets

* **pclass:** Passenger class (1 = first, 2 = second, 3 = third)
* **survived:** Survival (0 = no, 1 = yes)
* **name:** Name
* **sex:** Sex
* **age:** Age
* **sibsp:** Number of siblings/spouses aboard
* **parch:** Number of parents/children aboard
* **ticket:** Ticket number
* **fare:** Passenger fare
* **cabin:** Cabin
* **embarked:** Port of embarkation
* **boat:** Lifeboat
* **body:** Body identification number
* **home.dest:** Home/Destination

In [None]:
import pandas as pd

In [None]:
# Load the passenger table from a CSV path relative to the working directory.
titanic_df = pd.read_csv('data/titanic.csv')

In [None]:
# Preview the first three rows.
titanic_df.head(3)

In [None]:
# (number of rows, number of columns) of the loaded table.
titanic_df.shape

In [None]:
# Column dtypes and non-null counts; shows which columns have missing values.
titanic_df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
titanic_df.describe()

### เปอร์เซ็นต์ของผู้โดยสารที่รอดชีวิตจากเรือไททานิคเป็นเท่าไหร่?

In [None]:
# 'survived' is coded 0/1, so its mean is the fraction of passengers who
# survived (multiply by 100 to read it as a percentage).
titanic_df['survived'].mean()

### ผู้โดยสารคลาสไหนรอดชีวิตเยอะสุด?

In [None]:
# Per-class averages; the 'survived' column is each class's survival rate.
# numeric_only=True skips the text columns (name, sex, ticket, ...), which
# raise a TypeError on pandas >= 2.0 where they are no longer dropped silently.
titanic_df.groupby('pclass').mean(numeric_only=True)

### ในแต่ละคลาสผู้โดยสารที่เป็นชายรอดเท่าไหร่ และที่เป็นหญิงรอดเท่าไหร่?

In [None]:
# Averages for every (class, sex) pair; 'survived' is the survival rate of
# that group. numeric_only=True skips text columns, which raise a TypeError
# on pandas >= 2.0 where they are no longer dropped silently.
titanic_df.groupby(['pclass', 'sex']).mean(numeric_only=True)

### ลองนำข้อมูลมาพล็อต

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Largest recorded age (missing ages are skipped).
titanic_df.age.max()

In [None]:
# Smallest recorded age (missing ages are skipped).
titanic_df.age.min()

In [None]:
# Bucket ages into 10-year, right-inclusive intervals: (0, 10], (10, 20], ...
# The first edge is 0 rather than 10 so that children aged 10 and under get a
# bin instead of becoming NaN and silently dropping out of the grouped stats.
cut = pd.cut(titanic_df.age, [0, 10, 20, 30, 40, 50, 60, 70, 80])
cut.head(10)

In [None]:
# Averages per age bin. numeric_only=True skips text columns (a TypeError on
# pandas >= 2.0 otherwise); observed=False keeps empty bins in the result and
# makes the categorical-grouper behaviour explicit.
titanic_df.groupby(cut, observed=False).mean(numeric_only=True)

In [None]:
# Keep the per-age-bin averages for plotting. numeric_only=True skips text
# columns (a TypeError on pandas >= 2.0 otherwise); observed=False keeps empty
# bins and makes the categorical-grouper behaviour explicit.
avg_data_by_age = titanic_df.groupby(cut, observed=False).mean(numeric_only=True)

In [None]:
# Bar chart of the survival rate in each age bin.
avg_data_by_age.survived.plot(kind='bar')

In [None]:
import seaborn as sns

In [None]:
# Age distribution, with missing ages dropped first. histplot replaces
# distplot, which has been deprecated since seaborn 0.11; stat='density' with
# kde=True reproduces distplot's default normalised histogram plus KDE curve.
sns.histplot(titanic_df.age.dropna(), kde=True, stat='density')

In [None]:
# Mean of 'survived' (i.e. survival rate) per sex, split by passenger class.
sns.barplot(x='sex', y='survived', hue='pclass', data=titanic_df)

In [None]:
# Passenger counts per port of embarkation, split by class (horizontal bars).
# The trailing semicolon suppresses the cell's text output in the notebook.
sns.countplot(y='embarked', hue='pclass', data=titanic_df);

In [None]:
# Number of passengers who died / survived, split by sex.
sns.countplot(data=titanic_df, x='survived', hue='sex')

In [None]:
# Number of passengers in each class, split by survival outcome.
sns.countplot(data=titanic_df, y='pclass', hue='survived')

In [None]:
import matplotlib.pyplot as plt

# Grid of age histograms: one row per sex, one column per survival outcome.
g = sns.FacetGrid(titanic_df, row='sex', col='survived')
g.map(plt.hist, 'age')

In [None]:
# Compare the age distribution of non-survivors (0) and survivors (1).
sns.boxplot(y='age', x='survived', data=titanic_df)

In [None]:
# Grid of survival-outcome histograms: one row per sex, one column per class.
g = sns.FacetGrid(titanic_df, row='sex', col='pclass')
g.map(plt.hist, 'survived')

## เตรียมข้อมูลสำหรับสร้างโมเดล

In [None]:
# Re-check non-null counts per column before deciding what to drop.
titanic_df.info()

In [None]:
# Shape that would remain if every row with any missing value were dropped.
titanic_df.dropna(how='any').shape

In [None]:
# Same check after first removing four columns, so their missing values no
# longer cause rows to be discarded.
titanic_df.drop(['boat', 'body', 'cabin', 'home.dest'], axis=1).dropna(how='any').shape

In [None]:
# Confirm that no column has missing values once those columns and the
# remaining incomplete rows are removed.
titanic_df.drop(['boat', 'body', 'cabin', 'home.dest'], axis=1).dropna(how='any').info()

In [None]:
# Build the modelling table: remove the columns that are not used as model
# inputs, then discard every row that still contains a missing value.
titanic_df = (
    titanic_df
    .drop(columns=['name', 'ticket', 'boat', 'body', 'cabin', 'home.dest'])
    .dropna(how='any')
)

In [None]:
# Preview the cleaned table.
titanic_df.head()

In [None]:
from sklearn import preprocessing

# Replace the two text columns with integer codes. One encoder is kept per
# column so codes can later be mapped back to labels via inverse_transform.
le_for_sex = preprocessing.LabelEncoder()
titanic_df['sex'] = le_for_sex.fit_transform(titanic_df['sex'])

le_for_embarked = preprocessing.LabelEncoder()
titanic_df['embarked'] = le_for_embarked.fit_transform(titanic_df['embarked'])

In [None]:
# Preview the table after encoding 'sex' and 'embarked' as integers.
titanic_df.head()

In [None]:
# Original 'sex' labels; a label's position in this array is its integer code.
le_for_sex.classes_

In [None]:
# Original 'embarked' labels; a label's position in this array is its integer code.
le_for_embarked.classes_

In [None]:
# Map integer codes back to the original 'sex' labels.
le_for_sex.inverse_transform([0, 1, 1])

In [None]:
# Map integer codes back to the original 'embarked' labels.
le_for_embarked.inverse_transform([0, 1, 2, 2])

In [None]:
# Final look at the table that the feature matrix is built from.
titanic_df.head()

In [None]:
# Feature matrix: every remaining column except the target and 'age'.
# NOTE(review): 'age' is left out even though rows with a missing age were
# already dropped -- confirm that excluding it is intentional.
X = titanic_df.drop(columns=['age', 'survived']).values
# Target vector: 0 = did not survive, 1 = survived.
y = titanic_df.survived.values

In [None]:
from sklearn.model_selection import train_test_split

# Split into train and test sets using scikit-learn's default proportions;
# random_state=0 makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# import
from sklearn.tree import DecisionTreeClassifier

# instantiate
# random_state is fixed (as it is for the train/test split) because the tree
# breaks ties between equally good splits at random, so the fitted model and
# its score would otherwise change from run to run.
dt = DecisionTreeClassifier(max_depth=10, random_state=0)

# fit
dt.fit(X_train, y_train)

# predict
y_pred_class = dt.predict(X_test)

In [None]:
# Mean accuracy of the decision tree on the held-out test split.
dt.score(X_test, y_test)

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes: train on the training split, keep its predictions for
# the test split, and display its test accuracy.
clf = GaussianNB().fit(X_train, y_train)
y_pred_class = clf.predict(X_test)
clf.score(X_test, y_test)

In [None]:
from sklearn.svm import SVC
# Linear-kernel support vector classifier: train on the training split, keep
# its predictions for the test split, and display its test accuracy.
clf = SVC(kernel='linear').fit(X_train, y_train)
y_pred_class = clf.predict(X_test)
clf.score(X_test, y_test)

## Classification Metrics

In [None]:
from sklearn import metrics

# y_pred_class holds the predictions of whichever model was fitted last
# (the linear SVC when the notebook is run top to bottom).
# Overall accuracy, then per-class precision / recall / F1 with 0 -> 'no'
# and 1 -> 'yes'.
print(metrics.accuracy_score(y_test, y_pred_class))
print(metrics.classification_report(y_test, y_pred_class, target_names=['no', 'yes']))

## Cross-Validation with Different Models

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a depth-1 tree (a single split) on the
# full data set; prints the mean and standard deviation over the folds.
# Rebinds `dt`, replacing the depth-10 tree fitted earlier.
# NOTE(review): this cell uses cv=10 while the other models use cv=5 --
# confirm whether the fold counts are meant to differ.
dt = DecisionTreeClassifier(max_depth=1)
scores = cross_val_score(dt, X, y, cv=10, scoring='accuracy')
print(scores.mean(), scores.std())

In [None]:
from sklearn.svm import SVC

# 5-fold cross-validated accuracy of a linear-kernel SVC on the full data set;
# prints the mean and standard deviation over the folds.
svc = SVC(kernel='linear')
scores = cross_val_score(svc, X, y, cv=5, scoring='accuracy')
print(scores.mean(), scores.std())

In [None]:
from sklearn.naive_bayes import GaussianNB

# 5-fold cross-validated accuracy of Gaussian naive Bayes on the full data
# set; prints the mean and standard deviation over the folds.
gnb = GaussianNB()
scores = cross_val_score(gnb, X, y, cv=5, scoring='accuracy')
print(scores.mean(), scores.std())

In [None]:
import sklearn.ensemble as ske

In [None]:
# 5-fold cross-validated accuracy of a 50-tree random forest; prints the mean
# and standard deviation over the folds. random_state is fixed because the
# forest is built from random bootstrap samples and feature subsets, so the
# scores would otherwise differ on every run.
rf = ske.RandomForestClassifier(n_estimators=50, random_state=0)
scores = cross_val_score(rf, X, y, cv=5, scoring='accuracy')
print(scores.mean(), scores.std())

In [None]:
# 5-fold cross-validated accuracy of a 50-stage gradient-boosting classifier;
# prints the mean and standard deviation over the folds. random_state is
# fixed so that the estimator's internal randomness does not change the
# scores from run to run.
gb = ske.GradientBoostingClassifier(n_estimators=50, random_state=0)
scores = cross_val_score(gb, X, y, cv=5, scoring='accuracy')
print(scores.mean(), scores.std())

## Challenge

เราสามารถทำให้ผลลัพธ์ดีขึ้นได้อย่างไรบ้าง?