In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
import sklearn.metrics

import eli5
from eli5.sklearn import PermutationImportance

In [None]:
# Read the training and test CSV files from the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('./train.csv', 'test.csv'))

# Data Exploration

In [None]:
# Preview the first rows of the training frame.
train_data.head()

Descriptions of the individual columns can be found on the [Kaggle Titanic data page](https://www.kaggle.com/c/titanic/data).

In [None]:
# Mean of Survived for each Pclass value, highest first.
(
    train_data
    .groupby('Pclass')[['Survived']]
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean of Survived for each Sex value, highest first.
(
    train_data
    .groupby('Sex')[['Survived']]
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Average Fare for rows with Survived == 1 versus rows with Survived == 0.
suvived_mean_fare, dead_mean_fare = (
    train_data.loc[train_data['Survived'] == outcome, 'Fare'].mean()
    for outcome in (1, 0)
)

pd.DataFrame({'survived': [1, 0], 'mean fare': [suvived_mean_fare, dead_mean_fare]})

In [None]:
# Average Age for rows with Survived == 1 versus rows with Survived == 0.
suvived_mean_age, dead_mean_age = (
    train_data.loc[train_data['Survived'] == outcome, 'Age'].mean()
    for outcome in (1, 0)
)

pd.DataFrame({'survived': [1, 0], 'mean age': [suvived_mean_age, dead_mean_age]})

In [None]:
# Mean of Survived for each Embarked value, highest first.
(
    train_data
    .groupby('Embarked')[['Survived']]
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean of Survived for each SibSp value, ordered by SibSp (largest first).
(
    train_data
    .groupby('SibSp')[['Survived']]
    .mean()
    .sort_values(by='SibSp', ascending=False)
)

In [None]:
def is_alone(row):
    """Return a copy of ``row`` with an ``isAlone`` flag added.

    Parameters
    ----------
    row : pandas.Series
        A single record exposing ``SibSp`` and ``Parch`` entries.

    Returns
    -------
    pandas.Series
        A new Series holding every entry of ``row`` plus ``isAlone``, which is
        1 when ``SibSp + Parch == 0`` and 0 otherwise.
    """
    # Work on a copy: the Series handed over by DataFrame.apply should not be
    # modified in place, and callers only rely on the returned value.
    flagged = row.copy()
    flagged['isAlone'] = 1 if row.SibSp + row.Parch == 0 else 0
    return flagged

# Mean of Survived for passengers flagged isAlone versus the rest.
# The flag is computed on the fly; train_data itself is not modified here.
(
    train_data
    .apply(is_alone, axis=1)
    .groupby('isAlone')[['Survived']]
    .mean()
    .sort_values(by='isAlone', ascending=False)
)

# Data Wrangling

In [None]:
# Count missing values per column and print only the columns that have any.
missing_val_count_by_column = train_data.isna().sum()
print(missing_val_count_by_column.loc[lambda counts: counts > 0])

In [None]:
# Drop Cabin: there are just too many missing values in this column.
train_data = train_data.drop(columns=['Cabin'])
test_data = test_data.drop(columns=['Cabin'])

In [None]:
# Fill missing Embarked values with 'S' (noted as the most common value).
for frame in (train_data, test_data):
    frame['Embarked'] = frame['Embarked'].fillna('S')

In [None]:
# Add the isAlone column (computed by is_alone from SibSp and Parch) to both frames.
train_data, test_data = (
    frame.apply(is_alone, axis=1) for frame in (train_data, test_data)
)

In [None]:
# Display the training frame after the isAlone column has been added.
train_data

In [None]:
# Derive a Title column from Name in both frames, then collapse the variants.
combine = [train_data, test_data]

# Titles that are all replaced by the single value 'Rare'.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Alternative spellings folded into a common title.
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for dataset in combine:
    # Raw string literal: '\.' is not a valid escape sequence in a plain string
    # and triggers a warning on recent Python versions. The pattern captures
    # the run of letters that follows a space and precedes a period.
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare')
    dataset['Title'] = dataset['Title'].replace(title_aliases)

In [None]:
# Columns fed to the models.
feature_colums = ['Pclass', 'Age', 'Sex', 'isAlone','Embarked', 'Fare', 'Title']

# Target and feature frames.
y = train_data.Survived
# Explicit copies: later cells assign into X and X_test, and writing into a
# selection of another frame raises SettingWithCopyWarning.
X = train_data[feature_colums].copy()
X_test = test_data[feature_colums].copy()

In [None]:
# Using SimpleImputer to set the mean for missing values.
# The fill values are learned from the training frame only and then applied to
# the test frame, so both frames are filled with the same statistic and nothing
# is estimated from the test set.

my_imputer = SimpleImputer()
# .ravel() turns the (n, 1) result into a 1-D array so the assignment is
# positional rather than index-aligned.
X['Age'] = my_imputer.fit_transform(X[['Age']]).ravel()
X_test['Age'] = my_imputer.transform(X_test[['Age']]).ravel()

# Fare is only filled in the test frame; its fill value comes from the
# training Fare column.
fare_imputer = SimpleImputer()
fare_imputer.fit(X[['Fare']])
X_test['Fare'] = fare_imputer.transform(X_test[['Fare']]).ravel()

In [None]:
# Change Age and Fare to an ordinal scale (feature reduction).
combine = [X, X_test]

# Fare -> bands 0..3. The assignments run from the lowest band upwards; each
# later condition requires a value above the previous upper bound, so codes
# that were already written (0..2) are never re-binned.
for dataset in combine:
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

# Age -> bands 0..4, same scheme.
for dataset in combine:
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    # Top band: without this assignment ages above 64 would keep their raw
    # value and sit on a different scale from the other codes.
    dataset.loc[dataset['Age'] > 64, 'Age'] = 4
    # Integer codes, matching how Fare is stored.
    dataset['Age'] = dataset['Age'].astype(int)

In [None]:
# One-hot encode the categorical columns.
OH_columns = ['Sex', 'Embarked']

# The keyword that requests dense output was renamed between scikit-learn
# releases (`sparse` -> `sparse_output`), so neither is passed: the default
# sparse result is converted with .toarray(), which works on every version.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
# Keep the source index so the encoded columns line up row-for-row when they
# are concatenated with the remaining features.
OH_X = pd.DataFrame(OH_encoder.fit_transform(X[OH_columns]).toarray(), index=X.index)
# transform (not fit_transform): reuse the categories learned from the training
# frame so both frames get the same columns in the same order. Categories
# unseen during fit are encoded as all zeros by handle_unknown='ignore'.
OH_X_test = pd.DataFrame(OH_encoder.transform(X_test[OH_columns]).toarray(), index=X_test.index)

In [None]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
# Caution: OH_X_test is read and then overwritten here, so re-running this cell
# without re-running the encoding cell appends the columns a second time.
train_without_categoricals = X.drop(columns=OH_columns)
test_without_categoricals = X_test.drop(columns=OH_columns)
OH_X_train = pd.concat([train_without_categoricals, OH_X], axis='columns')
OH_X_test = pd.concat([test_without_categoricals, OH_X_test], axis='columns')

In [None]:
# Give the positional one-hot columns (0..4) readable names; the same mapping
# is applied to both frames.
one_hot_names = dict(enumerate(['Female', 'Male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']))
OH_X_train = OH_X_train.rename(columns=one_hot_names)
OH_X_test = OH_X_test.rename(columns=one_hot_names)

In [None]:
# Integer code for each title group.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

combine = [OH_X_train, OH_X_test]

# Titles that are not in the mapping come back as NaN from map() and become 0.
for frame in combine:
    frame['Title'] = frame['Title'].map(title_mapping).fillna(0)

In [None]:
# Display the fully encoded test features.
OH_X_test

In [None]:
# Display the fully encoded training features.
OH_X_train

# Feature Analysis

In [None]:
# Hold out a validation split (fixed random_state) and fit a logistic-regression baseline.
train_X, val_X, train_y, val_y = train_test_split(OH_X_train, y, random_state=0)

logreg = LogisticRegression().fit(train_X, train_y)
Y_pred = logreg.predict(val_X)
# Accuracy on the held-out split.
print(accuracy_score(val_y, Y_pred))

In [None]:
# Pair every training feature with its fitted logistic-regression coefficient.
# All columns of train_X are used: the model was fit on exactly these columns,
# so dropping the first one would shift every label by one position and lose
# the last coefficient.
# Note: the 'Correlation' column holds model coefficients, not correlations.
coeff_df = pd.DataFrame({
    'Feature': train_X.columns,
    'Correlation': logreg.coef_[0],
})
coeff_df

# Train the Model

In [None]:
# Gradient-boosted regressor; its output is rounded to 0/1 labels by later cells.
xgb_params = {'n_estimators': 700, 'learning_rate': 0.07}
model = XGBRegressor(**xgb_params)

In [None]:
# Validate the model on a single hold-out split (no cross validation).
train_X, val_X, train_y, val_y = train_test_split(OH_X_train, y, random_state=0)
model.fit(train_X, train_y)

# The regressor's output is not bounded, so clamp it to [0, 1] before rounding;
# otherwise a value below -0.5 or above 1.5 would round to a label other than
# 0 or 1.
val_predictions = np.clip(model.predict(val_X), 0, 1).round().astype(int)
print(accuracy_score(val_y, val_predictions))
mean_absolute_error(val_y, val_predictions)

In [None]:
# 5-fold cross-validated mean absolute error. The scorer returns the negated
# error, so the sign is flipped back before averaging.
neg_mae_per_fold = cross_val_score(
    model, OH_X_train, y, scoring='neg_mean_absolute_error', cv=5
)
scores = -neg_mae_per_fold
scores.mean()

In [None]:
# Predict labels for the test features. The regressor's output is not bounded,
# so it is clamped to [0, 1] before rounding to guarantee every label is 0 or 1.
# Note: `model` is the estimator fitted on the train_X split in the hold-out
# validation cell; cross_val_score does not refit it.
test_prediction = np.clip(model.predict(OH_X_test), 0, 1).round().astype(int)

In [None]:
# Two-column frame: the test PassengerId values alongside the predicted Survived labels.
result = test_data[['PassengerId']].assign(Survived=test_prediction)

In [None]:
# Show how many test rows were assigned each predicted label.
result['Survived'].value_counts()

In [None]:
# Write the predictions to disk without the DataFrame index column.
result.to_csv('results.csv', index=False)

# Model Analysis

In [None]:
# Permutation importance of each feature, measured on the validation split.
perm = PermutationImportance(model, random_state=1)
perm.fit(val_X, val_y)
eli5.show_weights(perm, feature_names=list(val_X.columns))