# kaggle Titanic Python/Pandas/DecisionTree

Predicting which passengers survived the sinking of the Titanic.

## Libraries

In [58]:
import re
import sys
import warnings

##set path for multiple versions
sys.path.insert(0,'/home/atom/.local/lib/python3.6/site-packages')

import numpy as np
import pandas as pd 
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

#To ignore warnings
warnings.filterwarnings("ignore")

## Data

In [59]:
# Read the train and test CSVs; all_data holds both frames so the
# feature-engineering loops below can apply the same step to each of them.
all_data = [pd.read_csv(csv_path) for csv_path in ('Data/train.csv', 'Data/test.csv')]
train_data, test_data = all_data

In [60]:
# Descriptions

#PassengerId : int     : Id
#Survived    : int     : Survival (0=No; 1=Yes)
#Pclass      : int     : Passenger Class
#Name        : object  : Name
#Sex         : object  : Sex
#Age         : float   : Age
#SibSp       : int     : Number of Siblings/Spouses Aboard
#Parch       : int     : Number of Parents/Children Aboard
#Ticket      : object  : Ticket Number
#Fare        : float   : Passenger Fare
#Cabin       : object  : Cabin
#Embarked    : object  : Port of Embarkation
#                        (C=Cherbourg; Q=Queenstown; S=Southampton)

In [61]:
# Look at the dataset: summary statistics (count, mean, std, quartiles, min/max)
# of the training frame's columns.
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Feature Engineering

In [62]:
# Pclass

## Survival rate per passenger class (mean of the 0/1 Survived flag)
pclass_survival = train_data[["Pclass", "Survived"]].groupby(["Pclass"], as_index=False).mean()
print(pclass_survival)

   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363


In [63]:
# Sex
## Survival rate per sex (mean of the 0/1 Survived flag)
sex_survival = train_data[["Sex", "Survived"]].groupby(["Sex"], as_index=False).mean()
print(sex_survival)

      Sex  Survived
0  female  0.742038
1    male  0.188908


In [64]:
# Family Size

## Two new features per data set:
##   family_size = the passenger + siblings/spouses + parents/children aboard
##   is_alone    = 1 when family_size is 1, otherwise 0
for data in all_data:
    data['family_size'] = data['SibSp'] + data['Parch'] + 1
    data['is_alone'] = 0
    data.loc[data['family_size'] == 1, 'is_alone'] = 1

family_size_survival = train_data[["family_size", "Survived"]].groupby(["family_size"], as_index=False).mean()
print(family_size_survival)

# Check whether the passenger is alone
print('is alone')
passenger_id = train_data['PassengerId']
is_alone_survival = train_data[['is_alone', 'Survived']].groupby(['is_alone'], as_index=False).mean()
print(is_alone_survival)

   family_size  Survived
0            1  0.303538
1            2  0.552795
2            3  0.578431
3            4  0.724138
4            5  0.200000
5            6  0.136364
6            7  0.333333
7            8  0.000000
8           11  0.000000
is alone
   is_alone  Survived
0         0  0.505650
1         1  0.303538


In [65]:
# Embarked

## Missing ports of embarkation are filled with 'S', the most common value
for data in all_data:
    data['Embarked'] = data['Embarked'].fillna('S')

embarked_survival = train_data[["Embarked", "Survived"]].groupby(["Embarked"], as_index=False).mean()
print(embarked_survival)

  Embarked  Survived
0        C  0.553571
1        Q  0.389610
2        S  0.339009


In [66]:
# Fare

## Missing fares are filled with the median fare of the same data set.
## The training fares are then cut into 4 quantile bins (qcut) to compare
## the survival rate per fare band.
for data in all_data:
    median_fare = data['Fare'].median()
    data['Fare'] = data['Fare'].fillna(median_fare)

train_data['category_fare'] = pd.qcut(train_data['Fare'], q=4)
fare_survival = train_data[["category_fare", "Survived"]].groupby(["category_fare"], as_index=False).mean()
print(fare_survival)

     category_fare  Survived
0   (-0.001, 7.91]  0.197309
1   (7.91, 14.454]  0.303571
2   (14.454, 31.0]  0.454955
3  (31.0, 512.329]  0.581081


In [67]:
# Age

# Missing ages are imputed with random integers drawn from
# [mean - std, mean + std) of the known ages of the same data set.
# A dedicated, seeded generator makes the imputation - and every number
# derived from it further down - reproducible on Restart & Run All.
AGE_IMPUTATION_SEED = 0
age_rng = np.random.RandomState(AGE_IMPUTATION_SEED)

for data in all_data:
    age_avg  = data['Age'].mean()
    age_std  = data['Age'].std()
    age_missing = data['Age'].isnull()
    age_null = age_missing.sum()

    # One random age per missing value; randint expects integer bounds,
    # so the float bounds are truncated explicitly.
    random_list = age_rng.randint(int(age_avg - age_std), int(age_avg + age_std), size = age_null)
    # Assign through .loc: the chained form data['Age'][mask] = ... may write to
    # a temporary copy (SettingWithCopyWarning) and is a silent no-op under
    # pandas copy-on-write, which would leave the NaNs in place.
    data.loc[age_missing, 'Age'] = random_list
    data['Age'] = data['Age'].astype(int)

# Five equal-width age bands, used to compare the survival rate per band
train_data['category_age'] = pd.cut(train_data['Age'], 5)
print( train_data[["category_age","Survived"]].groupby(["category_age"], as_index = False).mean() )

    category_age  Survived
0  (-0.08, 16.0]  0.527778
1   (16.0, 32.0]  0.359202
2   (32.0, 48.0]  0.365079
3   (48.0, 64.0]  0.434783
4   (64.0, 80.0]  0.090909


In [68]:
# Name

## Using the title to classify
def get_title(name):
    """Return the title embedded in a passenger name, or "" if none is found.

    The title is the first run of ASCII letters that is preceded by a space
    and followed by a period and a space, e.g.
    "Braund, Mr. Owen Harris" -> "Mr".

    Parameters
    ----------
    name : str
        Full passenger name.

    Returns
    -------
    str
        The matched title without the trailing period, or an empty string.
    """
    # Raw string: in a normal string literal '\.' is an invalid escape sequence
    # (DeprecationWarning, and a SyntaxWarning on newer Python versions).
    title_search = re.search(r' ([A-Za-z]+)\. ', name)
    if title_search:
        return title_search.group(1)
    return ""

# Uncommon titles are grouped under 'Rare'; Mlle and Ms are folded into Miss,
# Mme into Mrs.
rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {rare_title: 'Rare' for rare_title in rare_titles}
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for data in all_data:
    # Extract the title from each name, then normalise it
    data['title'] = data['Name'].apply(get_title).replace(title_aliases)

title_by_sex = pd.crosstab(train_data['title'], train_data['Sex'])
print(title_by_sex)
print("----------------------")
title_survival = train_data[['title','Survived']].groupby(['title'], as_index = False).mean()
print(title_survival)

Sex     female  male
title               
Master       0    40
Miss       185     0
Mr           0   517
Mrs        126     0
Rare         3    20
----------------------
    title  Survived
0  Master  0.575000
1    Miss  0.702703
2      Mr  0.156673
3     Mrs  0.793651
4    Rare  0.347826


## Mapping Data

In [69]:
# Integer codes for the categorical features
sex_map = { 'female':0 , 'male':1 }
title_map = {'Mr':1, 'Miss':2, 'Mrs':3, 'Master':4, 'Rare':5}
embark_map = {'S':0, 'C':1, 'Q':2}

for data in all_data:

    #Mapping Sex
    data['Sex'] = data['Sex'].map(sex_map).astype(int)

    #Mapping Title (titles that are not in title_map become 0)
    data['title'] = data['title'].map(title_map).fillna(0)

    #Mapping Embarked
    data['Embarked'] = data['Embarked'].map(embark_map).astype(int)

    #Mapping Fare: bin edges match the quartile bins printed for category_fare.
    #All masks are computed on the raw fares before any code is written back.
    fare = data['Fare']
    fare_codes = [
        (fare <= 7.91, 0),
        ((fare > 7.91) & (fare <= 14.454), 1),
        ((fare > 14.454) & (fare <= 31), 2),
        (fare > 31, 3),
    ]
    for in_band, code in fare_codes:
        data.loc[in_band, 'Fare'] = code
    data['Fare'] = data['Fare'].astype(int)

    #Mapping Age: 16-year-wide bands, matching the bins printed for category_age.
    #All masks are computed on the raw ages before any code is written back.
    age = data['Age']
    age_codes = [
        (age <= 16, 0),
        ((age > 16) & (age <= 32), 1),
        ((age > 32) & (age <= 48), 2),
        ((age > 48) & (age <= 64), 3),
        (age > 64, 4),
    ]
    for in_band, code in age_codes:
        data.loc[in_band, 'Age'] = code

#Feature Selection
#Columns removed from both data sets
drop_elements = ["Name", "Ticket", "Cabin", "SibSp", "Parch", "family_size"]
#Columns removed from the training set only
train_only_drops = ['PassengerId','category_fare', 'category_age']

#Drop columns from both data sets (test_data keeps PassengerId)
train_data = train_data.drop(columns=drop_elements + train_only_drops)
test_data = test_data.drop(columns=drop_elements)

#Print ready to use data
print(train_data.head(10))

   Survived  Pclass  Sex  Age  Fare  Embarked  is_alone  title
0         0       3    1    1     0         0         0      1
1         1       1    0    2     3         1         0      3
2         1       3    0    1     1         0         1      2
3         1       1    0    2     3         0         0      3
4         0       3    1    2     1         0         1      1
5         0       3    1    1     1         2         1      1
6         0       1    1    3     3         0         1      1
7         0       3    1    0     2         0         0      4
8         1       3    0    1     1         0         0      3
9         1       2    0    0     2         1         0      3


## Prediction

In [70]:
# Preparing the DataSet's

# Target and features for training; the test features are the test frame
# without its PassengerId column.
Y_train = train_data["Survived"]
X_train = train_data.drop(columns=["Survived"])
X_test = test_data.drop(columns=["PassengerId"]).copy()

In [71]:
# Running our classifier
# random_state is fixed so that ties between equally good splits are broken
# the same way on every run.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)

# NOTE: this score is measured on the same rows the tree was fitted on, so it
# is a training accuracy and an optimistic estimate for an unpruned tree.
accuracy = round(decision_tree.score(X_train, Y_train) * 100, 2)
print("Model Accuracy: ",accuracy)

# 5-fold cross-validated accuracy: each fold is scored by a clone of the tree
# that was fitted on the other four folds, i.e. on rows it has not seen.
cv_scores = cross_val_score(decision_tree, X_train, Y_train, cv=5)
cv_accuracy = round(cv_scores.mean() * 100, 2)
print("Cross-validated Accuracy: ", cv_accuracy)

Model Accuracy:  87.09


## Creating a CSV to submit

In [72]:
#Create a CSV with results

# One row per test passenger: its id and the predicted Survived value
submission_columns = {
    "PassengerId": test_data["PassengerId"],
    "Survived": Y_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)