In [119]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

train_data = pd.read_csv('titanic/train.csv')
unknown_data = pd.read_csv('titanic/test.csv')

train_data.fillna(0, inplace=True)
unknown_data.fillna(0, inplace=True)

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,0,S


In [104]:
train_data['Sex'] = (train_data['Sex'] == 'female').astype(int)
unknown_data['Sex'] = (unknown_data['Sex'] == 'female').astype(int)

In [105]:
feat_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare']
X = train_data.loc[:,feat_cols]
y = train_data['Survived']

for col in X:
    print('{}: {}'.format(col, X[col].unique()))

Pclass: [3 1 2]
Sex: [0 1]
Age: [22.   38.   26.   35.    0.   54.    2.   27.   14.    4.   58.   20.
 39.   55.   31.   34.   15.   28.    8.   19.   40.   66.   42.   21.
 18.    3.    7.   49.   29.   65.   28.5   5.   11.   45.   17.   32.
 16.   25.    0.83 30.   33.   23.   24.   46.   59.   71.   37.   47.
 14.5  70.5  32.5  12.    9.   36.5  51.   55.5  40.5  44.    1.   61.
 56.   50.   36.   45.5  20.5  62.   41.   52.   63.   23.5   0.92 43.
 60.   10.   64.   13.   48.    0.75 53.   57.   80.   70.   24.5   6.
  0.67 30.5   0.42 34.5  74.  ]
SibSp: [1 0 3 4 2 5 8]
Fare: [  7.25    71.2833   7.925   53.1      8.05     8.4583  51.8625  21.075
  11.1333  30.0708  16.7     26.55    31.275    7.8542  16.      29.125
  13.      18.       7.225   26.       8.0292  35.5     31.3875 263.
   7.8792   7.8958  27.7208 146.5208   7.75    10.5     82.1708  52.
   7.2292  11.2417   9.475   21.      41.5792  15.5     21.6792  17.8
  39.6875   7.8     76.7292  61.9792  27.75    46.9     80

In [106]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [107]:
tree = DecisionTreeClassifier(max_depth=10)

In [108]:
tree.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [109]:
print(tree.score(X_train, y_train))
print(tree.score(X_test, y_test))

0.9296407185628742
0.9327354260089686


In [110]:
X_new = unknown_data.loc[:, feat_cols]
X_new.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare
0,3,0,34.5,0,7.8292
1,3,1,47.0,1,7.0
2,2,0,62.0,0,9.6875
3,3,0,27.0,0,8.6625
4,3,1,22.0,1,12.2875


In [111]:
final_tree = DecisionTreeClassifier(max_depth=10)
final_tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [112]:
prediction = final_tree.predict(X_new)

In [113]:
pd.DataFrame({'PassengerId': unknown_data.PassengerId, 'Survived': prediction}).set_index('PassengerId').to_csv('sub5.csv')

In [116]:
prediction.shape

(418,)

In [171]:
forest = RandomForestClassifier(max_depth=6, n_estimators=10)

In [172]:
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [173]:
print(forest.score(X_train, y_train))
print(forest.score(X_test, y_test))

0.8607784431137725
0.820627802690583


In [174]:
prediction_2 = forest.predict(X_new)

In [175]:
pd.DataFrame({'PassengerId': unknown_data.PassengerId, 'Survived': prediction_2}).set_index('PassengerId').to_csv('sub6.csv')