In [269]:
import numpy as np
import pandas as pd
import os

# Folder that holds the Titanic CSV files.
path = "titanic"

# Walk the folder tree and print the path of every file found in it.
for folder, _, names in os.walk(path):
    for name in names:
        print(os.path.join(folder, name))


titanic\gender_submission.csv
titanic\submission.csv
titanic\test.csv
titanic\train.csv


In [270]:
from sklearn import model_selection

# Load the training set and fill each missing value from the next valid row.
# `fillna(method=...)` is deprecated in recent pandas; `bfill()` is the
# supported equivalent. Backfill cannot fill a gap in the last row, so a
# forward fill follows as a fallback for any trailing gaps.
# NOTE(review): the fill runs before the split, so hold-out rows can supply
# values to training rows -- confirm this is acceptable for the evaluation.
train_data = pd.read_csv("titanic/train.csv").bfill().ffill()

# Predictor columns and the target column.
features = ["Pclass", "Sex", "Age", "SibSp",
            "Parch", "Fare", "Embarked"]
x = train_data[features]
y = train_data["Survived"]

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.1, random_state=8)

X_train.head()


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
889,1,male,26.0,0,0,30.0,C
529,2,male,23.0,2,1,11.5,S
233,3,female,5.0,4,2,31.3875,S
253,3,male,30.0,1,0,16.1,S
348,3,male,3.0,1,1,15.9,S


In [271]:
# Preview the first five rows of the hold-out features.
X_test.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
350,3,male,23.0,0,0,9.225,S
176,3,male,50.0,3,1,25.4667,S
723,2,male,50.0,0,0,13.0,S
404,3,female,20.0,0,0,8.6625,S
306,1,female,17.0,0,0,110.8833,C


In [272]:
# Preview the first five training labels.
y_train.head(n=5)

889    1
529    0
233    1
253    0
348    1
Name: Survived, dtype: int64

In [273]:
# Preview the first five hold-out labels.
y_test.head(n=5)

350    0
176    0
723    0
404    0
306    1
Name: Survived, dtype: int64

In [274]:
# Select the Survived column for female passengers in a single .loc call
# (row mask + column label) instead of chained indexing.
women = train_data.loc[train_data.Sex == 'female', 'Survived']

# The mean of the Survived column is the share of women who survived.
# The printed value is a fraction between 0 and 1, not a percentage.
rate_women = women.mean()

print('% of women who survived:', rate_women)

% of women who survived: 0.7420382165605095


In [275]:
# Select the Survived column for male passengers in a single .loc call
# (row mask + column label) instead of chained indexing.
men = train_data.loc[train_data.Sex == 'male', 'Survived']

# The mean of the Survived column is the share of men who survived.
# The printed value is a fraction between 0 and 1, not a percentage.
rate_men = men.mean()

print('% of men who survived:', rate_men)

% of men who survived: 0.18890814558058924


In [276]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# One-hot encode the non-numeric columns of the training split.
X = pd.get_dummies(X_train)

# Random forest of 100 trees capped at depth 5; the fixed seed makes the
# fitted model repeatable.
model = RandomForestClassifier(
    random_state=1,
    max_depth=5,
    n_estimators=100,
)
model.fit(X, y_train)

RandomForestClassifier(max_depth=5, random_state=1)

In [277]:
# Evaluate the fitted model on the hold-out split.
# The hold-out frame is one-hot encoded on its own, so a category that is
# absent from this small split would leave its column out. Reindexing to the
# training columns restores any missing column as all zeros and puts the
# columns in the order the model was fitted with.
X_test = pd.get_dummies(X_test).reindex(columns=X.columns, fill_value=0)

# Predicted Survived labels for the hold-out rows.
X_pred = model.predict(X_test)
print(X_pred)

# accuracy_score takes (y_true, y_pred); the value is the share of correct labels.
accuracy_score(y_test, X_pred)

[0 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 1 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1
 0 0 1 0 1 1 0 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 1 1 0 0 1 0 1 1 1 1 1 0 0 0
 0 0 1 1 1 0 1 1 0 1 1 0 0 1 0 1]


0.7888888888888889

In [281]:
# Load the competition test set and fill each missing value from the previous
# valid row. `fillna(method=...)` is deprecated in recent pandas; `ffill()` is
# the supported equivalent. Forward fill cannot fill a gap in the first row,
# so a backfill follows as a fallback for any leading gaps.
test_data = pd.read_csv("titanic/test.csv").ffill().bfill()

# Same predictor columns the model was trained on.
features = ["Pclass", "Sex", "Age", "SibSp",
            "Parch", "Fare", "Embarked"]

# One-hot encode, then align to the training columns so the model receives
# the same columns, in the same order, that it was fitted with. A column
# missing from this file is restored as all zeros.
X_test = pd.get_dummies(test_data[features]).reindex(
    columns=X.columns, fill_value=0)
predictions = model.predict(X_test)

# Write one row per passenger: PassengerId and the predicted Survived label.
output = pd.DataFrame(
    {'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('titanic/submission.csv', index=False)
print('Your submission was successfully saved!')

Your submission was successfully saved!
