In [45]:
import pandas as pd
import numpy as np

In [46]:
train_data = pd.read_csv('data/train.csv')

In [47]:
test_data = pd.read_csv('data/test.csv')
gs_data = pd.read_csv('data/gender_submission.csv')


In [48]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [49]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [50]:
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [51]:
train_data.shape

(891, 12)

In [52]:
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [53]:
train_data['Age'].count()

714

In [54]:
#let's see unique values in Embarked
np.unique(train_data['Pclass'])

array([1, 2, 3], dtype=int64)

In [55]:
def unique_str (list):
    u_list = []
    for x in list:
        if x not in u_list:
            u_list.append(x)
    return u_list

In [56]:
unique_str(train_data['Embarked'])

['S', 'C', 'Q', nan]

In [57]:
#let's try it out with a simple model, without droping out na
from sklearn.tree import DecisionTreeRegressor
titanic_dtr_model = DecisionTreeRegressor(random_state=1)

In [58]:
#set up target and features
y = train_data.Survived
X = train_data.loc[:, train_data.columns != 'Survived']

In [59]:
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [60]:
#let's train the model
titanic_dtr_model.fit(X,y)

ValueError: could not convert string to float: 'Braund, Mr. Owen Harris'

In [61]:
#okay string does work with ml models, let's drop them
X = X.drop('Name',axis=1)

In [62]:
X.columns

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [63]:
#can I drop columns that are not integer?
X = X.select_dtypes(exclude='object')

In [64]:
#this is a very rough practice, now we have a all number dataframe, let's run the model
titanic_dtr_model.fit(X,y)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [65]:
#okay, let's drop na
X.dropna()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,1,3,22.0,1,0,7.2500
1,2,1,38.0,1,0,71.2833
2,3,3,26.0,0,0,7.9250
3,4,1,35.0,1,0,53.1000
4,5,3,35.0,0,0,8.0500
...,...,...,...,...,...,...
885,886,3,39.0,0,5,29.1250
886,887,2,27.0,0,0,13.0000
887,888,1,19.0,0,0,30.0000
889,890,1,26.0,0,0,30.0000


In [66]:
X.isnull().sum()

PassengerId      0
Pclass           0
Age            177
SibSp            0
Parch            0
Fare             0
dtype: int64

In [67]:
X = X.dropna()

In [68]:
#clear of NAs, let's run it again
titanic_dtr_model.fit(X,y)

ValueError: Number of labels=891 does not match number of samples=714

In [106]:
#okay, shouldnt do all the manipulations on feature data, X and y have different sizes, let's restart
train_data = train_data.dropna()
train_data = train_data.select_dtypes(exclude = 'object')

In [107]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
1,2,1,1,38.0,1,0,71.2833
3,4,1,1,35.0,1,0,53.1
6,7,0,1,54.0,0,0,51.8625
10,11,1,3,4.0,1,1,16.7
11,12,1,1,58.0,0,0,26.55


In [108]:
y = train_data.Survived
X = train_data.loc[:, train_data.columns != 'Survived']

In [109]:
titanic_dtr_model.fit(X,y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=1, splitter='best')

In [110]:
#this is not right, but let's look at the result with the training feature data
titanic_dtr_model.predict(X.head())

array([1., 1., 0., 1., 1.])

In [111]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
1,2,1,1,38.0,1,0,71.2833
3,4,1,1,35.0,1,0,53.1
6,7,0,1,54.0,0,0,51.8625
10,11,1,3,4.0,1,1,16.7
11,12,1,1,58.0,0,0,26.55


In [112]:
#big data leakage problem, also we need a better way to test the result
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y, titanic_dtr_model.predict(X))

0.0

In [113]:
#let's use val data to test the result
test_data.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,PassengerId_GS,Survived
12,904,1,23.0,1,0,82.2667,904,1
14,906,1,47.0,1,0,61.175,906,1
24,916,1,48.0,1,3,262.375,916,1
26,918,1,22.0,0,1,61.9792,918,1
28,920,1,41.0,0,0,30.5,920,0


In [114]:
gs_data.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [115]:
test_data.isnull().sum()

PassengerId       0
Pclass            0
Age               0
SibSp             0
Parch             0
Fare              0
PassengerId_GS    0
Survived          0
dtype: int64

In [116]:
test_data=test_data.dropna()

In [117]:
test_data = test_data.select_dtypes(exclude='object')

In [118]:
#just found that test data does not contain surival, need to merge the data from gs

In [119]:
test_data = test_data.join(gs_data, rsuffix='_GS')

In [120]:
test_data = test_data.dropna()

In [121]:
test_data = test_data.select_dtypes(exclude = 'object')

In [122]:
y_test = test_data.Survived

In [146]:
X_test = test_data.drop(['Survived_GS', 'PassengerId_GS'], axis = 1)

In [147]:
X_test

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Survived
12,904,1,23.0,1,0,82.2667,1
14,906,1,47.0,1,0,61.1750,1
24,916,1,48.0,1,3,262.3750,1
26,918,1,22.0,0,1,61.9792,1
28,920,1,41.0,0,0,30.5000,0
...,...,...,...,...,...,...,...
404,1296,1,43.0,1,0,27.7208,0
405,1297,2,20.0,0,0,13.8625,0
407,1299,1,50.0,1,1,211.5000,0
411,1303,1,37.0,1,0,90.0000,1


In [148]:
p_results = titanic_dtr_model.predict(X_test)

ValueError: Number of features of the model must match the input. Model n_features is 5 and input n_features is 7 

In [126]:
y_test

12     1
14     1
24     1
26     1
28     0
      ..
404    0
405    0
407    0
411    1
414    1
Name: Survived, Length: 87, dtype: int64

In [127]:
r_list = results == y_test

In [128]:
i = 0
for r in r_list:
    if r == True:
        i= i+1

In [129]:
result_per = i/len(r_list)
result_per

0.4827586206896552

In [134]:
#the accuracy is pretty bad, what if we train the model without the IDs
X_no_pid = X.drop('PassengerId', axis=1)

In [135]:
X_no_pid

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
1,1,38.0,1,0,71.2833
3,1,35.0,1,0,53.1000
6,1,54.0,0,0,51.8625
10,3,4.0,1,1,16.7000
11,1,58.0,0,0,26.5500
...,...,...,...,...,...
871,1,47.0,1,1,52.5542
872,1,33.0,0,0,5.0000
879,1,56.0,0,1,83.1583
887,1,19.0,0,0,30.0000


In [138]:
titanic_dtr_model.fit(X_no_pid,y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=1, splitter='best')

In [144]:
X_test_nopid = X_test.drop('PassengerId', axis=1)

In [145]:
X_test_nopid

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived_GS
12,1,23.0,1,0,82.2667,1
14,1,47.0,1,0,61.1750,1
24,1,48.0,1,3,262.3750,1
26,1,22.0,0,1,61.9792,1
28,1,41.0,0,0,30.5000,0
...,...,...,...,...,...,...
404,1,43.0,1,0,27.7208,0
405,2,20.0,0,0,13.8625,0
407,1,50.0,1,1,211.5000,0
411,1,37.0,1,0,90.0000,1


In [143]:
titanic_dtr_model.predict(X_test_nopid)

ValueError: Number of features of the model must match the input. Model n_features is 5 and input n_features is 6 

In [None]:
#found that the Survived column is in in the test data, overlooked it, this is too massy, going to start another notebook with fresh data to try out other more accurate models, and better way to handle missing data. Also, from this excercise, we concluded that it would be better to create a branch for effort like this (rudimentry data explores)