In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training set from the Kaggle input directory and preview its first rows.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
train.head()

# Data Preprocessing

* Extract Prefix from Name


The title prefix hints at a person's age and marital status. For example, "Mrs." indicates that the lady is married, and "Master" is the prefix for a young boy. This information is hidden inside the variable 'Name'.

We can find that all the data in "Name" follow the format below:

| Last Name | Prefix | First Name |
| --- | --- | --- |
| Braund | Mr. | Harris|

In [None]:
# Separate Last Name, Prefix, and First Name ("Last, Prefix. First").
# n=1 cuts only at the first comma / first period, so a later period inside
# the given names no longer truncates 'First Name'.
# The space that follows the comma is kept on purpose: later cells compare
# 'Prefix' against values such as ' Master' and ' Miss'.
Name = train['Name']
last_and_rest = Name.str.split(',', n=1)
prefix_and_first = last_and_rest.str[1].str.split('.', n=1)

train['Prefix'] = prefix_and_first.str[0]
train['Last Name'] = last_and_rest.str[0]
train['First Name'] = prefix_and_first.str[1]
train.head()

* Missing Values Imputation

In [None]:
# Number of missing values in each column
train.isna().sum()

In [None]:
# How many passengers boarded at each port
train.Embarked.value_counts()

In [None]:
# Number of missing 'Age' values for each 'Prefix'.
# Missing ages are tagged with the 'NA' sentinel that the imputation cell checks for.
# The result is assigned back to the column (rather than calling fillna with
# inplace=True on a column selection) so that `train` itself is updated.
train['Age'] = train['Age'].fillna('NA')
# Pick the sentinel row by label instead of assuming it is sorted last.
pd.crosstab(train['Age'], train['Prefix']).loc['NA']

I drop "Cabin" because about three out of four of its values are missing.

"Embarked" is a categorical variable with three outcomes (S, C, Q), and I used the mode to fill the missing values.

In [None]:
# Fill missing ports with the mode ('S'). Assign the result back to the column:
# an inplace fillna on a column selection may act on a temporary object only.
train['Embarked'] = train['Embarked'].fillna('S')


Next, I filled 'Age' using the following rules:

Prefix | Imputation | Category
--- | --- | ---
Master | Mean age of other Master | Young boys
Miss with Parch $\neq$ 0 | Mean age of other Miss with Parch $\neq$ 0 | Young girls
Miss with Parch = 0 | Mean age of the other Miss  | Young lady
Mrs | Mean age of other Mrs | Adult female
Others | Mean age of the other males who are not Master | Adult male

In [None]:
# 'Age' Imputation
# Numeric view of 'Age': the 'NA' sentinel (or a real NaN) becomes NaN.
age = pd.to_numeric(train['Age'], errors='coerce')
known = age.notna()
is_master = train['Prefix'] == ' Master'
is_miss = train['Prefix'] == ' Miss'
is_mrs = train['Prefix'] == ' Mrs'
has_parch = train['Parch'] != 0
is_male = train['Sex'] == 'male'

# Rounded group means, computed only from passengers whose age is known.
master_age = round(age[known & is_master].mean())
miss1_age = round(age[known & is_miss & has_parch].mean())
miss2_age = round(age[known & is_miss & ~has_parch].mean())
mrs_age = round(age[known & is_mrs].mean())
mr_age = round(age[known & ~is_master & is_male].mean())

# The first matching rule wins (same order as an if/elif chain);
# every passenger that matches no rule gets mr_age.
imputed_age = np.select(
    [is_master, is_miss & has_parch, is_miss & ~has_parch, is_mrs],
    [master_age, miss1_age, miss2_age, mrs_age],
    default=mr_age,
)
# One column assignment instead of per-row chained indexing (train['Age'][i] = ...).
train['Age'] = age.where(known, imputed_age)


In [None]:
# Drop 'Cabin' (mostly missing). errors='ignore' keeps the cell re-runnable
# once the column has already been removed.
train.drop(columns='Cabin', inplace=True, errors='ignore')
train.isnull().sum()

In [None]:
# data cleaning function
def cleaning(df, reference=None):
    """Apply the training-set preprocessing steps to another passenger frame.

    Steps: split 'Name' into 'Prefix' / 'Last Name' / 'First Name', fill
    missing 'Embarked' with 'S', impute missing 'Age' with rounded group
    means taken from ``reference``, and return the frame without 'Cabin'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Name', 'Embarked', 'Age', 'Parch' and 'Cabin' columns.
        The new and filled columns are written to ``df`` itself; only the
        removal of 'Cabin' is confined to the returned frame.
    reference : pandas.DataFrame, optional
        Frame that supplies the age means; needs 'Age', 'Prefix', 'Parch'
        and 'Sex'. Defaults to the notebook-level ``train`` frame.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the 'Cabin' column.
    """
    if reference is None:
        reference = train

    # "Last, Prefix. First": n=1 cuts at the first comma / first period only,
    # so later periods in the given names stay inside 'First Name'.
    # The space after the comma is kept so 'Prefix' matches ' Master', ' Miss', ...
    last_and_rest = df['Name'].str.split(',', n=1)
    prefix_and_first = last_and_rest.str[1].str.split('.', n=1)
    df['Prefix'] = prefix_and_first.str[0]
    df['Last Name'] = last_and_rest.str[0]
    df['First Name'] = prefix_and_first.str[1]

    df['Embarked'] = df['Embarked'].fillna('S')

    # Group means from the reference frame; 'NA' sentinels and NaN both count as missing.
    ref_age = pd.to_numeric(reference['Age'], errors='coerce')
    ref_known = ref_age.notna()
    ref_master = reference['Prefix'] == ' Master'
    ref_miss = reference['Prefix'] == ' Miss'
    ref_parch = reference['Parch'] != 0
    master_age = round(ref_age[ref_known & ref_master].mean())
    miss1_age = round(ref_age[ref_known & ref_miss & ref_parch].mean())
    miss2_age = round(ref_age[ref_known & ref_miss & ~ref_parch].mean())
    mrs_age = round(ref_age[ref_known & (reference['Prefix'] == ' Mrs')].mean())
    mr_age = round(ref_age[ref_known & ~ref_master & (reference['Sex'] == 'male')].mean())

    # Rules are evaluated on df's own rows (including df's own 'Parch');
    # the first matching rule wins and everyone else gets mr_age.
    age = pd.to_numeric(df['Age'], errors='coerce')
    is_miss = df['Prefix'] == ' Miss'
    has_parch = df['Parch'] != 0
    imputed_age = np.select(
        [df['Prefix'] == ' Master', is_miss & has_parch, is_miss & ~has_parch, df['Prefix'] == ' Mrs'],
        [master_age, miss1_age, miss2_age, mrs_age],
        default=mr_age,
    )
    df['Age'] = age.where(age.notna(), imputed_age)

    test_clean = df.drop('Cabin', axis = 1)
    return test_clean

# Data Exploration

When the accident happened, managers followed some criteria to select the people who could board the limited lifeboats. They had to decide very quickly because they were racing against time, and therefore we might not need that many variables.

In the beginning, I defined a new variable 'Family Size', which is the sum of 'Parch' and 'SibSp'.

In [None]:
# Family size = siblings/spouses + parents/children travelling with the passenger
train['Family Size'] = train['SibSp'].add(train['Parch'])

'PassengerId', 'Name', and 'Ticket' differ for each person, so these variables carry no useful information. I checked the correlation between 'Survived' and the other variables and got the following information:

In [None]:
# Remove the free-text columns. errors='ignore' keeps the cell re-runnable
# once the columns have already been removed.
train.drop(columns=['Name', 'Ticket', 'Last Name', 'First Name'], inplace=True, errors='ignore')

* Females have a much higher survival probability (74%) than males (19%).

This shows that women might have had priority to leave and that only a few men were allowed to board. That is, the decision criteria might differ between females and males, and we might need to **separate females and males into two groups** and **train a different model** for each group.

In [None]:
# Share of survivors within each sex (normalized per column)
pd.crosstab(index=train['Survived'], columns=train['Sex'], normalize='columns')

In [None]:
# Separate male and female passengers; each subset gets a fresh index
Male = train.loc[train['Sex'] == 'male'].reset_index(drop=True)
Female = train.loc[train['Sex'] == 'female'].reset_index(drop=True)

### Male

Most males died in this accident, so my objective is to figure out what kept those men alive.

1. The following table showed that the survival probability of 'Master' is more than 50%

In [None]:
# Male survival counts for each prefix
pd.crosstab(index=Male['Survived'], columns=Male['Prefix'])

2. Interestingly, all the young boys in class 1 and class 2 survived.

In [None]:
# Survival share of ' Master' passengers within each ticket class
Master = Male.loc[Male['Prefix'] == ' Master']
pd.crosstab(index=Master['Survived'], columns=Master['Pclass'], normalize='columns')

3. Only 1 person survived when the family size is greater than 3.

In [None]:
# Male survival counts for each family size
pd.crosstab(index=Male['Survived'], columns=Male['Family Size'])

In [None]:
# Male survival counts for each ticket class
pd.crosstab(index=Male['Survived'], columns=Male['Pclass'])

### Female

In contrast to the males, here we have to figure out why some women were not selected.

1. Women in class 1 and class 2 had more than 90% survival probability. 

Obviously, women in class 1 and class 2 had priority to board the lifeboats, so I separated females into 'class 3' and 'class 1&2'.

In [None]:
# Third-class women versus first/second-class women, then survival share per class
is_third_class = Female['Pclass'] == 3
f_c3 = Female[is_third_class]
f_c12 = Female[~is_third_class]
pd.crosstab(index=Female['Survived'], columns=Female['Pclass'], normalize='columns')

2. Most of victims embarked from port S.

In [None]:
# Third-class women: survival counts for each embarkation port
pd.crosstab(index=f_c3['Survived'], columns=f_c3['Embarked'])


In [None]:
# First/second-class women: survival counts for each embarkation port
pd.crosstab(index=f_c12['Survived'], columns=f_c12['Embarked'])

In [None]:
# First/second-class women: survival counts for each family size
pd.crosstab(index=f_c12['Survived'], columns=f_c12['Family Size'])

In [None]:
# Third-class women: survival counts for each family size
pd.crosstab(index=f_c3['Survived'], columns=f_c3['Family Size'])

# Feature Selection and Model Training

From the last step, I divided males and females into two groups and found that the major factors related to survival are different. Here I follow that result and build a model for each group.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

* **Male**

The key factors are 'Prefix', 'Pclass', and 'Family Size'.

In [None]:
# modify Prefix: flag young boys with 1 (prefix ' Master') and everyone else with 0.
# Built as one vectorized column instead of per-row chained assignment
# (Male['Master'][i] = ...), which may write to a temporary object.
Male = Male.assign(Master=(Male['Prefix'] == ' Master').astype(int))
# dummy: indicator columns for class 2 and class 3 (class 1 is the baseline).
# reindex pins the two columns regardless of which classes occur in the data.
pc = pd.get_dummies(Male['Pclass']).reindex(columns=[2, 3], fill_value=0)
xx_male = pd.concat([Male, pc], axis = 1)
# variable selection
# NOTE(review): the feature frame mixes str and int column labels; newer
# scikit-learn releases may reject that - confirm against the installed version.
x_male = xx_male[['Family Size','Master', 2, 3]]
y_male = xx_male[['Survived']].to_numpy()
# fit model
DT_male = DecisionTreeClassifier(criterion='entropy').fit(x_male, y_male)

* Female

Since the survival probability in class 1 and class 2 is more than 90%, I simply predict that all women in these classes survived.

The major factors of survival in class 3 are 'Family Size', 'Fare', 'Child', and 'Embarked'.



In [None]:
# define young girl: 1 when age <= 10, otherwise 0.
# Built as one vectorized column instead of per-row chained assignment
# (Female['Child'][i] = ...), which may write to a temporary object.
Female = Female.assign(Child=(pd.to_numeric(Female['Age'], errors='coerce') <= 10).astype(int))
# class 3
f3 = Female[Female['Pclass']==3].reset_index(drop=True)
# dummy: indicator columns for ports Q and S (port C is the baseline).
# reindex pins the two columns regardless of which ports occur in the data.
embark = pd.get_dummies(f3['Embarked']).reindex(columns=['Q', 'S'], fill_value=0)
ff3 = pd.concat([f3, embark], axis = 1)
# variable selection
x_female = ff3[['Family Size', 'Fare', 'Child', 'Q', 'S']]
y_female = ff3[['Survived']].to_numpy()
DT_female = KNeighborsClassifier(1).fit(x_female, y_female)

I built the model with KNN using k = 1 (see the figure below).

In [None]:
import matplotlib.pyplot as plt

# Accuracy on the fitting data itself for k = 1..10 neighbours
neighbor_counts = range(1, 11)
acc = np.array([
    accuracy_score(
        y_female,
        KNeighborsClassifier(k).fit(x_female, y_female).predict(x_female),
    )
    for k in neighbor_counts
])
plt.plot(neighbor_counts, acc, 'g')


In [None]:
# data separation - male
def male(df):
    """Build the model-ready frame for the male passengers of ``df``.

    Keeps the rows where 'Sex' is 'male', resets the index, adds a 'Master'
    flag (1 when 'Prefix' is ' Master', else 0) and appends indicator columns
    ``2`` and ``3`` for 'Pclass'. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least 'Sex', 'Prefix' and 'Pclass' columns.

    Returns
    -------
    pandas.DataFrame
        The male rows plus the 'Master', ``2`` and ``3`` columns.
    """
    Male = df[df['Sex'] == 'male'].reset_index(drop=True)
    # Vectorized flag instead of per-row chained assignment.
    Male['Master'] = (Male['Prefix'] == ' Master').astype(int)
    # reindex guarantees both columns exist even when a class is absent from the
    # subset (drop_first would then drop a different column).
    pc = pd.get_dummies(Male['Pclass']).reindex(columns=[2, 3], fill_value=0)
    xx_male = pd.concat([Male, pc], axis = 1)
    return xx_male

In [None]:
# data separation - female
def female(df):
    """Build the model-ready frame for third-class female passengers of ``df``.

    Keeps the rows where 'Sex' is 'female', adds a 'Child' flag (1 when
    'Age' <= 10, else 0), restricts to 'Pclass' == 3, resets the index and
    appends indicator columns 'Q' and 'S' for 'Embarked'. ``df`` itself is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least 'Sex', 'Age', 'Pclass' and 'Embarked' columns.

    Returns
    -------
    pandas.DataFrame
        The third-class female rows plus the 'Child', 'Q' and 'S' columns.
    """
    Female = df[df['Sex'] == 'female'].reset_index(drop=True)
    # Vectorized flag instead of per-row chained assignment; non-numeric ages count as "not a child".
    Female['Child'] = (pd.to_numeric(Female['Age'], errors='coerce') <= 10).astype(int)
    f3 = Female[Female['Pclass'] == 3].reset_index(drop=True)
    # reindex guarantees both columns exist even when a port is absent from the
    # subset (drop_first would then drop a different column).
    embark = pd.get_dummies(f3['Embarked']).reindex(columns=['Q', 'S'], fill_value=0)
    ff3 = pd.concat([f3, embark], axis = 1)
    return ff3

# Evaluation

In [None]:
# Repeated hold-out validation of the male decision tree: 1000 random splits
# with 40% of the rows held out. Each repetition is seeded with its own index
# so the reported average is the same on every run.
acc = []
for i in range(1000):
    x_train, x_test, y_train, y_test = train_test_split(
        x_male, y_male, test_size=0.4, random_state=i
    )
    DT = DecisionTreeClassifier(criterion='entropy', random_state=i).fit(x_train, y_train)
    yhat = DT.predict(x_test)
    score = accuracy_score(y_test, yhat)
    acc.append(score)

avg_acc = sum(acc)/len(acc)

print('The accuracy of Decision Tree mode in Male is', avg_acc)

In [None]:
# Repeated hold-out validation of the 1-nearest-neighbour model for third-class
# women: 1000 random splits with 10% of the rows held out. Each repetition is
# seeded with its own index so the reported average is the same on every run.
acc = []
for i in range(1000):
    x_train, x_test, y_train, y_test = train_test_split(
        x_female, y_female, test_size=0.1, random_state=i
    )
    DT = KNeighborsClassifier(1).fit(x_train, y_train)
    yhat = DT.predict(x_test)
    score = accuracy_score(y_test, yhat)
    acc.append(score)

print('The accuracy of model in Female is', sum(acc)/len(acc))

In [None]:
# male: predictions of the fitted tree on its own training rows
yhat_male = DT_male.predict(x_male)
out_male = pd.DataFrame({'Yhat': yhat_male})
result_male = pd.concat([xx_male, out_male], axis = 1)
# female in c3: predictions of the fitted KNN on its own training rows
yhat_f3 = DT_female.predict(x_female)
out_female = pd.DataFrame({'Yhat': yhat_f3})
result_female = pd.concat([f3, out_female], axis = 1)
# female in c1, c2: always predicted as survived.
# assign() returns a new frame, so nothing is written into a filtered slice of Female.
f12 = Female[Female['Pclass']!=3].assign(Yhat=1)

In [None]:
# Stack the three groups row-wise and score all predictions together
result_all = pd.concat([result_male, result_female, f12])
accuracy_score(y_true=result_all['Survived'], y_pred=result_all['Yhat'])

# Test result

In [None]:
# Load the test set, add the family-size feature, and count missing values per column
test = pd.read_csv('/kaggle/input/titanic/test.csv')
test['Family Size'] = test['SibSp'].add(test['Parch'])
test.isna().sum()

In [None]:
# Fill the missing fare with the mean fare of the test set. The result is
# assigned back to the column: an inplace fillna on a column selection may
# act on a temporary object only.
fare_mean = test['Fare'].mean()
test['Fare'] = test['Fare'].fillna(fare_mean)
test_clean = cleaning(test)
test_clean.isnull().sum()

In [None]:
# Model-ready frames: all men, and third-class women
test_male, test_f3 = male(test_clean), female(test_clean)

In [None]:
# prediction - c12: first/second-class women are always predicted as survived.
# assign() returns a new frame, so nothing is written into a filtered slice of test_clean.
test_f12 = test_clean[(test_clean['Sex']=='female')&(test_clean['Pclass']!=3)].assign(Survived=1)
# prediction - male
fit_male = test_male[['Family Size','Master', 2, 3]]
test_male['Survived'] = DT_male.predict(fit_male)
# prediction - c3
fit_f3 = test_f3[['Family Size', 'Fare', 'Child', 'Q', 'S']]
test_f3['Survived'] = DT_female.predict(fit_f3)

In [None]:
# Stack the three prediction groups row-wise
test_result = pd.concat([test_male, test_f12, test_f3])


In [None]:
# Submission table: id and prediction, ordered by passenger id with a fresh index
out = (
    test_result[['PassengerId', 'Survived']]
    .sort_values(by='PassengerId')
    .reset_index(drop=True)
)

In [None]:
# Write the submission file without the index column, then display the table.
out.to_csv('outcome.csv',index=False)
out