In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# Any results you write to the current directory are saved as output.

In [None]:
# Read both Kaggle splits through explicitly opened file handles.
with open('/kaggle/input/titanic/train.csv') as train_file, \
        open('/kaggle/input/titanic/test.csv') as test_file:
    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)

# Stack the two splits (row labels are kept as-is) and list the column dtypes in sorted order.
titanic = pd.concat([train, test])
titanic.dtypes.sort_values()

In [None]:
# Number of missing values per column, restricted to the columns that have at least one.
titanic.isna().sum().loc[lambda counts: counts > 0]

In [None]:
# Missing Value Imputation: Cabin
# Rows without a Cabin value receive the placeholder label 'unknown' in both splits.
for passengers in (train, test):
    passengers['Cabin'] = passengers['Cabin'].fillna('unknown')

In [None]:
# Missing Value Imputation: Embarked
# Each split is filled with its own most frequent Embarked value
# (the first entry returned by mode()).
train_top_embarked = train['Embarked'].mode().iloc[0]
test_top_embarked = test['Embarked'].mode().iloc[0]

train['Embarked'] = train['Embarked'].fillna(train_top_embarked)
test['Embarked'] = test['Embarked'].fillna(test_top_embarked)

In [None]:
# Missing Value Imputation: Fare
# Missing fares are replaced by the mean Fare of the same split.
for passengers in (train, test):
    mean_fare = passengers['Fare'].mean()
    passengers['Fare'] = passengers['Fare'].fillna(mean_fare)

In [None]:
# Missing Value Imputation Step1: Age


def _title_from_name(full_name):
    """Return the text between the first comma and the next period of `full_name`, stripped."""
    after_comma = full_name.split(',')[1]
    return after_comma.split('.')[0].strip()


# Add a Title column holding the honorific extracted from every passenger name.
train['Title'] = train['Name'].apply(_title_from_name)
test['Title'] = test['Name'].apply(_title_from_name)

In [None]:
# Missing Value Imputation Step2: Age

# Raw honorifics, listed under the normalized title each one collapses into.
_titles_by_group = {
    "Officer": ("Capt", "Col", "Major", "Dr", "Rev"),
    "Royalty": ("Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"),
    "Mrs":     ("Mme", "Ms", "Mrs"),
    "Miss":    ("Mlle", "Miss"),
    "Mr":      ("Mr",),
    "Master":  ("Master",),
}

# Invert the grouping into the lookup shape used later: raw title -> normalized title.
normalized_titles = {
    raw_title: group
    for group, raw_titles in _titles_by_group.items()
    for raw_title in raw_titles
}

In [None]:
# Missing Value Imputation Step3: Age

# Collapse every raw title into its normalized group.
# A title that is not a key of `normalized_titles` is kept unchanged instead of
# becoming NaN (which is what Series.map(dict) does for unmatched values). This matters
# in two ways:
#   * re-running this cell is harmless: already-normalized values such as 'Officer' or
#     'Royalty' are not keys of the mapping and would otherwise be wiped to NaN;
#   * an unexpected honorific stays visible rather than silently turning into a missing
#     value that drops the passenger from the Title/Sex/Pclass grouping used for Age.
train['Title'] = train['Title'].map(lambda title: normalized_titles.get(title, title))
test['Title'] = test['Title'].map(lambda title: normalized_titles.get(title, title))

In [None]:
# Missing Value Imputation Step4: Age

# Passengers are grouped by Title, Sex and Pclass; a missing Age is replaced by the mean
# Age of the passenger's own group within the same split.
age_group_keys = ['Title', 'Sex', 'Pclass']
train_grouped = train.groupby(age_group_keys)
test_grouped = test.groupby(age_group_keys)

# transform('mean') yields one value per row, indexed like the original frame, so it
# aligns with the Age column directly. (groupby(...).apply(...) can prepend the group
# keys to the result index, which breaks the assignment back into the frame.)
# Using fillna on the existing column also guarantees that known ages are never overwritten.
train['Age'] = train['Age'].fillna(train_grouped['Age'].transform('mean'))
test['Age'] = test['Age'].fillna(test_grouped['Age'].transform('mean'))

# A group with no known Age at all (or a row whose grouping key is missing) has no group
# mean; fall back to the split-wide mean so that no NaN is left in the Age column.
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(test['Age'].mean())

# train.loc[(train['Title']=='Master') & (train['Sex']=='male') & (train['Pclass']==3)]

In [None]:
# Adding a new feature which represents the family size

# Family size per passenger = siblings/spouses + parents/children + 1
# (the + 1 counts the passenger).
for passengers in (train, test):
    passengers['FamilySize'] = passengers['SibSp'] + passengers['Parch'] + 1

In [None]:
# Cabin location might impact the survival rate, so only the first character of the Cabin
# value is kept (for the placeholder 'unknown' this is 'u').


def _first_character(cabin):
    """Return the first character of a cabin string."""
    return cabin[0]


for passengers in (train, test):
    passengers['Cabin'] = passengers['Cabin'].map(_first_character)

In [None]:
# Target vector: the Survived column of the training split.
y = train.Survived

# Feature matrix: the training frame without the id, label, Name and Ticket columns.
X = train.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket'])

# Keep the test ids for the submission file, then remove the non-feature columns from test.
test_PassengerId = test['PassengerId']
test = test.drop(columns=['PassengerId', 'Name', 'Ticket'])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Categorical features to one-hot encode. They are selected by name rather than by
# position so the encoding cannot silently shift to other columns if the column order
# of X changes (these are the columns previously addressed as positions 0, 1, 6, 7, 8).
categorical_features = ['Pclass', 'Sex', 'Cabin', 'Embarked', 'Title']

ct = ColumnTransformer(
    [(
        'encoder',
        # A category that appears only in the test split is encoded as all zeros
        # instead of raising at transform time.
        OneHotEncoder(handle_unknown='ignore'),
        categorical_features,
    )],
    remainder='passthrough',  # all other columns are forwarded unchanged
    # Always return a dense array: estimators such as GaussianNB reject sparse input,
    # and the default behaviour switches to sparse depending on the data density.
    sparse_threshold=0,
)

# The encoder is fitted on the training features only and then applied to the test split.
X_transformed = ct.fit_transform(X)
test_transformed = ct.transform(test)

In [None]:
#  Model pre-selection

# Candidate classifiers, each constructed with its default hyper-parameters.
models = [
    ('LR', LogisticRegression()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('KNN', KNeighborsClassifier()),
    ('RF', RandomForestClassifier()),
]

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=123)

# Print the mean 10-fold cross-validated accuracy of every candidate.
for label, estimator in models:
    fold_accuracies = cross_val_score(estimator, X_transformed, y, scoring='accuracy', cv=10)
    print(label, fold_accuracies.mean())

In [None]:
# Model selection between Random Forest and Logistic Regression

# The grid below searches both the 'l1' and 'l2' penalties. The liblinear solver supports
# both; the default solver in current scikit-learn releases accepts only 'l2', which
# would make every 'l1' candidate fail to fit.
LR = LogisticRegression(solver='liblinear')

# Four inverse-regularization strengths, log-spaced between 1e-4 and 1 (base 10).
# The arguments are passed by keyword on purpose: np.logspace reads positional values
# after `num` as `endpoint` and `base`, so stray extra numbers change the grid itself.
LR_param_grid = dict(
    penalty = ['l1', 'l2'],
    C = np.logspace(-4, 0, num=4, base=10.0))

# Nested cross-validation: the inner GridSearchCV tunes the hyper-parameters on each
# training fold, the outer cross_val_score measures the accuracy of the tuned model.
LR_grid = GridSearchCV(LR, LR_param_grid, cv=10)
LR_scores = cross_val_score(LR_grid, X_transformed, y, scoring='accuracy', cv=10).mean()


# Fixed seed so the forest (and therefore the selected parameters and the final
# predictions) is the same on every Restart & Run All.
RF = RandomForestClassifier(random_state=42)
RF_param_grid = {'n_estimators':[10,100],
                 'max_depth':[3,6],
                 'criterion':['gini','entropy']}

RF_grid = GridSearchCV(RF, RF_param_grid, cv=10)
RF_scores = cross_val_score(RF_grid, X_transformed, y, scoring='accuracy', cv=10).mean()

# Mean outer-fold accuracy of the tuned logistic regression and the tuned random forest.
print(LR_scores, RF_scores)

In [None]:
# Run the random-forest grid search on the full training set and display the winning
# hyper-parameter combination (fit returns the search object itself).
# RF_grid.best_score_
RF_grid.fit(X_transformed, y).best_params_
# RF_grid.best_estimator_.get_params()

In [None]:
# GridSearchCV (with its default refit behaviour) retrains the best parameter combination
# on all of the training data; that refitted estimator is used to label the test split.
prediction = RF_grid.best_estimator_.predict(test_transformed)

In [None]:
# Submission table: the test passenger ids next to their predicted Survived labels.
kaggle = test_PassengerId.to_frame(name='PassengerId').assign(Survived=prediction)

# Write the table to CSV without the index column.
kaggle.to_csv('./titanic_pred.csv', index=False)