In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Load the test and train splits (in that order) from the working directory.
test_df, train_df = (
    pd.read_csv(csv_name) for csv_name in ('aml_test.csv', 'aml_train.csv')
)

In [2]:
# Transform test data

# 'Last Name' and 'Fare' are not used as model features.
test_df = test_df.drop(columns=['Last Name', 'Fare'])

# Encode Sex numerically; any value other than 'male'/'female' becomes NaN.
test_df['Sex'] = test_df['Sex'].map({'male': 1, 'female': 0})

# Replace missing Ages with the average Age of each specific Title group.
# Only the 'Age' column is filled: calling fillna() on the whole row slice
# would also overwrite NaNs in every *other* column with the mean age.
# NOTE(review): if other columns contain NaNs that the old whole-row fill was
# masking, they now stay NaN and must be handled explicitly before fitting.
for title in ['Mr.', 'Miss.', 'Mrs.', 'Master.', 'Rare']:
    in_group = test_df['Title'] == title
    group_mean_age = test_df.loc[in_group, 'Age'].mean()
    test_df.loc[in_group, 'Age'] = test_df.loc[in_group, 'Age'].fillna(group_mean_age)

# Create dummy variables for each Title, then drop the original column.
title_dummies = pd.get_dummies(test_df['Title'])
test_df = pd.concat([test_df, title_dummies], axis=1).drop(columns=['Title'])

In [3]:
# Transform train data

# 'Last Name' and 'Fare' are not used as model features.
train_df = train_df.drop(columns=['Last Name', 'Fare'])

# Encode Sex numerically; any value other than 'male'/'female' becomes NaN.
train_df['Sex'] = train_df['Sex'].map({'male': 1, 'female': 0})

# Replace missing Ages with the average Age of each specific Title group.
# Only the 'Age' column is filled: calling fillna() on the whole row slice
# would also overwrite NaNs in every *other* column with the mean age.
# NOTE(review): if other columns contain NaNs that the old whole-row fill was
# masking, they now stay NaN and must be handled explicitly before fitting.
for title in ['Mr.', 'Miss.', 'Mrs.', 'Master.', 'Rare']:
    in_group = train_df['Title'] == title
    group_mean_age = train_df.loc[in_group, 'Age'].mean()
    train_df.loc[in_group, 'Age'] = train_df.loc[in_group, 'Age'].fillna(group_mean_age)

# Create dummy variables for each Title, then drop the original column.
title_dummies = pd.get_dummies(train_df['Title'])
train_df = pd.concat([train_df, title_dummies], axis=1).drop(columns=['Title'])

In [5]:
# Split data for model

# Keep the passenger ids aside: they label rows in the submission file but are
# not model features.
train_passenger = train_df['PassengerId']
test_passenger = test_df['PassengerId']

X_train = train_df.drop(['Survived', 'PassengerId'], axis = 1)
y_train = train_df['Survived']

# The Title dummies were generated separately for train and test, so the two
# frames are not guaranteed to share the same columns in the same order.
# Reindex the test features to the training layout: columns missing from test
# are added as all-zero, and columns never seen in training are dropped.
X_test = (
    test_df
    .drop(['PassengerId'], axis=1)
    .reindex(columns=X_train.columns, fill_value=0)
)


In [6]:
# Sanity check: feature matrices should agree on column count, and X_train /
# y_train on row count.
shape_report = 'X_train shape: {}\ny_train shape: {}\nX_test shape: {}'
print(shape_report.format(X_train.shape, y_train.shape, X_test.shape))

X_train shape: (891, 14)
y_train shape: (891,)
X_test shape: (418, 14)


In [9]:
# Logistic Regression
# fit() returns the estimator itself, so construction and training can chain.
log_reg = LogisticRegression().fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

# Note: this is scored on the same data the model was fitted on, i.e. it is a
# training accuracy (as a percentage), not a held-out estimate.
acc_log = round(100 * log_reg.score(X_train, y_train), 2)

print('Logistic Regression Accuracy: {}\n'.format(acc_log))

Logistic Regression Accuracy: 81.93





In [7]:
# Random Forest
# A fixed random_state makes the forest (and therefore its predictions and
# score) reproducible from one run to the next.
rand_for = RandomForestClassifier(random_state=42)


rand_for.fit(X_train, y_train)
y_pred = rand_for.predict(X_test)

# Scored on the training data itself (percentage), so this is a training
# accuracy rather than a held-out estimate. Stored under its own name so it
# no longer overwrites the logistic-regression result in `acc_log`.
acc_rand_for = round(rand_for.score(X_train, y_train) * 100, 2)

# Label fixed: this cell previously reported itself as Logistic Regression.
print('Random Forest Accuracy: {}\n'.format(acc_rand_for))

Logistic Regression Accuracy: 91.58





In [8]:
# Build the submission table: one row per test passenger with its predicted
# label. The two pieces are joined side by side and aligned on the row index.
# `y_pred` holds the output of whichever model cell assigned it last.
y_pred_df = pd.DataFrame({'Survived': y_pred})
final_df = pd.concat([test_passenger, y_pred_df], axis='columns')

# Save under the file name matching the model that produced `y_pred`.
# For the Logistic Regression predictions the file name would be
# 'Titanic_Logistic_Regression_Solutions.csv'.
submission_path = 'Titanic_Random_Forest_Solutions.csv'
final_df.to_csv(submission_path, index=False)