In [4]:
# Data manipulation
import numpy as np
import pandas as pd
# Visualization
import matplotlib.pyplot as plt
from matplotlib import style 
style.use("ggplot")
# Machine learning
import sklearn as sk

## 1. Data wrangling
### 1.1 Load data

In [5]:
# Load data
df = pd.read_csv("data/train.csv")

In [6]:
# Visualize
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 1.2 Clean data

In [8]:
df_copy = df.drop(["PassengerId", "Name", "Ticket", "Cabin", "Embarked"], axis = 1)
df_copy.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [9]:
df_copy.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [20]:
mean_age = df_copy["Age"].mean()
print("Mean age is: {}".format(mean_age))

Mean age is: 29.69911764705882


In [21]:
df_copy["Age"] = df_copy["Age"].fillna(mean_age)
df_copy["Age"].describe()

count    891.000000
mean      29.699118
std       13.002015
min        0.420000
25%       22.000000
50%       29.699118
75%       35.000000
max       80.000000
Name: Age, dtype: float64

In [22]:
df_copy = pd.get_dummies(df_copy)
df_copy.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male
0,0,3,22.0,1,0,7.25,0,1
1,1,1,38.0,1,0,71.2833,1,0
2,1,3,26.0,0,0,7.925,1,0
3,1,1,35.0,1,0,53.1,1,0
4,0,3,35.0,0,0,8.05,0,1


## 2. Exploratory data analysis

Age to Categorical
* age < 18 -> children
* age >= 18 && age < 45-> young adults
* age >= 45 && age < 55 -> old adults
* age > 55 -> elder

In [37]:
# Univariate
filtered_children = df_copy.query("Survived == 1 & Age < 18")
print("Children that survived: {}".format(filtered_children.shape[0]))
filtered_children_died = df_copy.query("Survived == 0 & Age < 18")
print("Children that not survived: {}".format(filtered_children_died.shape[0]))

filtered_young_adults = df_copy.query("Survived == 1 & Age >= 18 & Age < 45")
print("\nYoung adults that survived: {}".format(filtered_young_adults.shape[0]))
filtered_young_adults_died = df_copy.query("Survived == 0 & Age >= 18 & Age < 45")
print("Young adults that not survived: {}".format(filtered_young_adults_died.shape[0]))

filtered_adults = df_copy.query("Survived == 1 & Age >= 45 & Age < 55")
print("\nAdults that survived: {}".format(filtered_adults.shape[0]))
filtered_adults_died = df_copy.query("Survived == 0 & Age >= 45 & Age < 55")
print("Adults that not survived: {}".format(filtered_adults_died.shape[0]))

filtered_elder = df_copy.query("Survived == 1 & Age >= 55")
print("\nEldey that survived: {}".format(filtered_elder.shape[0]))
filtered_elder_died = df_copy.query("Survived == 0 & Age >= 55")
print("Elder that not survived: {}".format(filtered_elder_died.shape[0]))

Children that survived: 61
Children that not survived: 52

Young adults that survived: 238
Young adults that not survived: 425

Adults that survived: 30
Adults that not survived: 43

Eldey that survived: 13
Elder that not survived: 29


In [35]:
print("Children survival ratio: {}".format(filtered_children.shape[0]/filtered_children_died.shape[0]))
print("Adults survival ratio: {}".format(filtered_adults.shape[0]/filtered_adults_died.shape[0]))

Children survival ratio: 1.1730769230769231
Adults survival ratio: 0.6976744186046512


In [36]:
print("Children/adults survival ratio: {}".format(filtered_children.shape[0]/filtered_adults.shape[0]))

Children/adults survival ratio: 2.033333333333333


### Conclusions

In [38]:
df_copy.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male
0,0,3,22.0,1,0,7.25,0,1
1,1,1,38.0,1,0,71.2833,1,0
2,1,3,26.0,0,0,7.925,1,0
3,1,1,35.0,1,0,53.1,1,0
4,0,3,35.0,0,0,8.05,0,1


In [40]:
x = np.array(df_copy.drop(["Survived"], axis = 1))
y = np.array(df_copy["Survived"])
print(x.shape, y.shape)

(891, 7) (891,)


In [41]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x, y)
# Get the score for the training set
score = clf.score(x, y)
print("Score in training set: {}".format(score))

Score in training set: 0.9820426487093153


### Submission

In [45]:
# Read test set
df_test_set = pd.read_csv("data/test.csv")
df_test_set.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [46]:
# The table has all the variables, we don't need all of them because we 
# only used a few for our model
# Let's clean the variables we don't use
df_test_set = df_test_set.drop(["Ticket", "Name", "Embarked", "Cabin"],\
                             axis = 1)
df_test_set.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
0,892,3,male,34.5,0,0,7.8292
1,893,3,female,47.0,1,0,7.0
2,894,2,male,62.0,0,0,9.6875
3,895,3,male,27.0,0,0,8.6625
4,896,3,female,22.0,1,1,12.2875


In [47]:
# Clean the data
# For the age's values
mean_value_age = df_test_set["Age"].mean()
df_test_set["Age"] = df_test_set["Age"].fillna(mean_value_age)

# For the fare's values
mean_value_fare = df_test_set["Fare"].mean()
df_test_set["Fare"] = df_test_set["Fare"].fillna(mean_value_fare)

In [48]:
df_test_set.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,418.0,418.0,418.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,12.634534,0.89676,0.981429,55.8405
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,23.0,0.0,0.0,7.8958
50%,1100.5,3.0,30.27259,0.0,0.0,14.4542
75%,1204.75,3.0,35.75,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [49]:
# Let's give format to our data
df_test_set = pd.get_dummies(df_test_set)
df_test_set.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male
0,892,3,34.5,0,0,7.8292,0,1
1,893,3,47.0,1,0,7.0,1,0
2,894,2,62.0,0,0,9.6875,0,1
3,895,3,27.0,0,0,8.6625,0,1
4,896,3,22.0,1,1,12.2875,1,0


In [50]:
# Open a file
submission_file = open("data/send_kaggle_file_v2.csv", "w")
submission_file.write("PassengerId,Survived\n")

# Now we can predict our test set
for i in range(df_test_set.shape[0]):
    # Let's locate one example
    example_test_set = df_test_set.drop(["PassengerId"], axis = 1).iloc[i]
    example_test_set_for_sklearn = np.array(example_test_set).reshape(1, -1)
    # Let's use the Machine Learning model
    classification_example_test_set = clf.predict(example_test_set_for_sklearn)
    print("**For the data \n{}, \n*we predict {}\n".format(example_test_set, classification_example_test_set))
    # Write the output file
    submission_file.write(str(df_test_set["PassengerId"].iloc[i]) \
                          + "," + str(np.squeeze(classification_example_test_set)) + "\n")
# Close the output file
submission_file.close()

**For the data 
Pclass         3.0000
Age           34.5000
SibSp          0.0000
Parch          0.0000
Fare           7.8292
Sex_female     0.0000
Sex_male       1.0000
Name: 0, dtype: float64, 
*we predict [0]

**For the data 
Pclass         3.0
Age           47.0
SibSp          1.0
Parch          0.0
Fare           7.0
Sex_female     1.0
Sex_male       0.0
Name: 1, dtype: float64, 
*we predict [0]

**For the data 
Pclass         2.0000
Age           62.0000
SibSp          0.0000
Parch          0.0000
Fare           9.6875
Sex_female     0.0000
Sex_male       1.0000
Name: 2, dtype: float64, 
*we predict [1]

**For the data 
Pclass         3.0000
Age           27.0000
SibSp          0.0000
Parch          0.0000
Fare           8.6625
Sex_female     0.0000
Sex_male       1.0000
Name: 3, dtype: float64, 
*we predict [1]

**For the data 
Pclass         3.0000
Age           22.0000
SibSp          1.0000
Parch          1.0000
Fare          12.2875
Sex_female     1.0000
Sex_male       0.0000

**For the data 
Pclass         1.0
Age           24.0
SibSp          1.0
Parch          0.0
Fare          60.0
Sex_female     0.0
Sex_male       1.0
Name: 50, dtype: float64, 
*we predict [0]

**For the data 
Pclass         2.0000
Age           27.0000
SibSp          0.0000
Parch          0.0000
Fare          15.0333
Sex_female     0.0000
Sex_male       1.0000
Name: 51, dtype: float64, 
*we predict [0]

**For the data 
Pclass         2.0
Age           20.0
SibSp          2.0
Parch          1.0
Fare          23.0
Sex_female     1.0
Sex_male       0.0
Name: 52, dtype: float64, 
*we predict [1]

**For the data 
Pclass          1.0
Age            28.0
SibSp           3.0
Parch           2.0
Fare          263.0
Sex_female      1.0
Sex_male        0.0
Name: 53, dtype: float64, 
*we predict [0]

**For the data 
Pclass         2.00000
Age           30.27259
SibSp          0.00000
Parch          0.00000
Fare          15.57920
Sex_female     0.00000
Sex_male       1.00000
Name: 54, dtype: float6


**For the data 
Pclass         3.000
Age           29.000
SibSp          0.000
Parch          0.000
Fare           7.925
Sex_female     0.000
Sex_male       1.000
Name: 97, dtype: float64, 
*we predict [1]

**For the data 
Pclass         3.0000
Age           20.0000
SibSp          0.0000
Parch          0.0000
Fare           7.8542
Sex_female     1.0000
Sex_male       0.0000
Name: 98, dtype: float64, 
*we predict [1]

**For the data 
Pclass         3.00
Age           33.00
SibSp          0.00
Parch          0.00
Fare           8.05
Sex_female     0.00
Sex_male       1.00
Name: 99, dtype: float64, 
*we predict [0]

**For the data 
Pclass         1.0000
Age           43.0000
SibSp          1.0000
Parch          0.0000
Fare          55.4417
Sex_female     1.0000
Sex_male       0.0000
Name: 100, dtype: float64, 
*we predict [1]

**For the data 
Pclass         2.0
Age           27.0
SibSp          1.0
Parch          0.0
Fare          26.0
Sex_female     0.0
Sex_male       1.0
Name: 101, dty


**For the data 
Pclass         2.0
Age            8.0
SibSp          0.0
Parch          2.0
Fare          32.5
Sex_female     0.0
Sex_male       1.0
Name: 194, dtype: float64, 
*we predict [1]

**For the data 
Pclass         3.0000
Age           33.0000
SibSp          0.0000
Parch          0.0000
Fare           7.8542
Sex_female     0.0000
Sex_male       1.0000
Name: 195, dtype: float64, 
*we predict [0]

**For the data 
Pclass          1.0
Age             6.0
SibSp           0.0
Parch           2.0
Fare          134.5
Sex_female      0.0
Sex_male        1.0
Name: 196, dtype: float64, 
*we predict [1]

**For the data 
Pclass         3.000
Age           18.000
SibSp          0.000
Parch          0.000
Fare           7.775
Sex_female     1.000
Sex_male       0.000
Name: 197, dtype: float64, 
*we predict [0]

**For the data 
Pclass         2.0
Age           23.0
SibSp          0.0
Parch          0.0
Fare          10.5
Sex_female     0.0
Sex_male       1.0
Name: 198, dtype: float64, 
*we 