In [718]:
import pandas as pd


# Read the training and test CSVs. Both frames are also kept in one list so
# that each preprocessing cell can loop over them and apply the same transform.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('input/train.csv', 'input/test.csv')
)
full_data = [train, test]

In [719]:
# Show (rows, columns) for the training and test frames side by side.
train.shape, test.shape

((891, 12), (418, 11))

In [720]:
# Preview the first rows of the training frame as loaded, before any preprocessing.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [721]:
# Processing Name
#
# Reduce each passenger name to an integer title code stored in 'Title',
# then drop the free-text 'Name' column.

title_mapping = {
    'Mr': 0,
    'Miss': 1,
    'Mrs': 2,
    'Master': 3
}
# Every title outside the mapping (and any name where the pattern finds no
# title at all) shares a single catch-all code.
other_title_code = len(title_mapping)

for dataset in full_data:
    # The title is the run of letters that follows a space and ends with a dot.
    # The pattern is a raw string: in a normal literal '\.' is an invalid escape
    # sequence, which Python 3.12+ reports as a SyntaxWarning.
    titles = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Vectorised lookup; unmapped or missing titles come back as NaN and are
    # replaced with the catch-all code before casting to integers.
    dataset['Title'] = titles.map(title_mapping).fillna(other_title_code).astype('int64')
    dataset.drop('Name', axis=1, inplace=True)

In [722]:
# Processing Sex

# Encode the two labels as integers. Values that are not keys of the mapping
# become NaN, so re-running this cell on already encoded data blanks the column.
sex_mapping = {'male':1, 'female':0}

for dataset in full_data:
    encoded_sex = dataset['Sex'].map(sex_mapping)
    dataset['Sex'] = encoded_sex

In [723]:
# Processing Age
#
# Fill missing ages with the median age of passengers sharing the same 'Title'
# code (the column created by the name-processing cell), computed separately
# within each frame, then bucket ages into 10-year bins.

for dataset in full_data:
    median_age_by_title = dataset.groupby('Title')['Age'].transform('median')
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # dataset['Age']: that form is chained assignment, which warns on pandas 2.x
    # and leaves the frame unchanged under Copy-on-Write, so the integer cast
    # below would then fail on the remaining NaN values.
    dataset['Age'] = dataset['Age'].fillna(median_age_by_title)
    # Floor-divide the whole column at once: 0-9 -> 0, 10-19 -> 1, ...
    dataset['Age'] = (dataset['Age'] // 10).astype(int)

In [724]:
# Processing Embarked

# Integer code assigned to each embarkation label.
embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}

for dataset in full_data:
    # Missing values are treated as 'S' first, then every label is encoded.
    embarked_filled = dataset['Embarked'].fillna('S')
    dataset['Embarked'] = embarked_filled.map(embarked_mapping)

In [725]:
# Processing Fare
#
# Fill missing fares with the median fare of the passenger's 'Pclass' within
# the same frame, then bucket fares into bins 15 units wide.

for dataset in full_data:
    median_fare_by_class = dataset.groupby('Pclass')['Fare'].transform('median')
    # Assign the result back rather than using fillna(inplace=True) on
    # dataset['Fare']: the chained in-place form warns on pandas 2.x and does
    # not modify the frame under Copy-on-Write.
    dataset['Fare'] = dataset['Fare'].fillna(median_fare_by_class)
    # Vectorised floor division: [0, 15) -> 0, [15, 30) -> 1, ...
    dataset['Fare'] = (dataset['Fare'] // 15).astype(int)

In [726]:
# Processing Cabin
#
# Replace each cabin string with an integer code for its first character, and
# fill passengers with no usable cabin value from their 'Pclass' group.

cabin_mapping = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 3,
    'E': 4,
    'F': 5,
    'G': 6,
    'T': 7
}

for dataset in full_data:
    # Keep only the leading character of the cabin string and encode it.
    # Missing cabins, and letters absent from the mapping, become NaN here.
    dataset['Cabin'] = dataset['Cabin'].str[:1].map(cabin_mapping)
    median_code_by_class = dataset.groupby('Pclass')['Cabin'].transform('median')
    # Assign the filled column back rather than using fillna(inplace=True) on
    # dataset['Cabin']: the chained in-place form warns on pandas 2.x and is a
    # no-op under Copy-on-Write, which would make the integer cast fail on NaN.
    # A fractional median is truncated by the cast.
    dataset['Cabin'] = dataset['Cabin'].fillna(median_code_by_class).astype(int)

In [727]:
# Processing Family (calculate family size)

for dataset in full_data:
    # FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger).
    family_size = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['FamilySize'] = family_size
    # isAlone is 1 exactly when the family size is 1, otherwise 0.
    dataset['isAlone'] = family_size.eq(1).astype('int64')

In [728]:
# Drop the remaining columns that are not passed on as model features

unused_columns = ['SibSp', 'Parch', 'FamilySize', 'Ticket']

for dataset in full_data:
    # In-place so that `train` and `test` keep referring to the updated frames.
    dataset.drop(columns=unused_columns, inplace=True)

In [729]:
# Preview the training frame again now that the preprocessing cells have run.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,isAlone
0,1,0,3,1,2,0,5,0,0,0
1,2,1,1,0,3,4,2,1,2,0
2,3,1,3,0,2,0,5,0,1,1
3,4,1,1,0,3,3,2,0,2,0
4,5,0,3,1,3,0,5,0,0,1


In [735]:
# Separate the target from the features. PassengerId is left out of the
# feature matrices and kept aside for the submission file.
y_train = train['Survived']
X_train = train.drop(columns=['Survived', 'PassengerId'])

test_passenger_id = test['PassengerId']
X_test = test.drop(columns=['PassengerId'])

In [736]:
from sklearn.ensemble import RandomForestClassifier


# Fit a depth-limited random forest on the training features (random_state is
# fixed) and predict labels for the test features.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=1,
).fit(X_train, y_train)
y_pred = model.predict(X_test)

# NOTE: the score is computed on the same rows the model was fitted on, so it
# is training accuracy, not an estimate of performance on unseen data.
train_score = model.score(X_train, y_train)
acc = round(train_score, 3)
print(acc)

0.844


In [737]:
# Pair each test PassengerId with its predicted label, then write the table
# to disk without the DataFrame index.
submission = test_passenger_id.to_frame(name='PassengerId').assign(Survived=y_pred)
submission.to_csv('output/submission.csv', index=False)