In [1]:
# Reload imported modules automatically before each cell runs, so edits to local
# .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation warnings from pandas / scikit-learn — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [244]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import accuracy_score

In [245]:
# Read the labelled training set and split the target column off from the features.
train_df = pd.read_csv("dataset/titanic/train.csv", low_memory=False)
y = train_df.pop('Survived')  # returns the column and removes it from train_df
# The test set is loaded as-is.
test_df = pd.read_csv("dataset/titanic/test.csv", low_memory=False)

In [246]:
# Keep the test-set passenger IDs: drop_cols() removes 'PassengerId' from test_df,
# but the submission frame built at the end of the notebook needs them.
test_passenger_id = test_df['PassengerId']

In [247]:
# Sanity check: (rows, columns) of both frames before feature engineering.
train_df.shape, test_df.shape

((891, 11), (418, 11))

In [248]:
def split_cabin(row, default=''):
    """Return the first run of letters found in a cabin value.

    The value is converted with ``str()`` and searched for the first run of
    word characters that are not digits (letters, and also ``_``), e.g.
    ``'C85' -> 'C'``, ``'F G73' -> 'F'``, ``'ABC123' -> 'ABC'``.

    Parameters
    ----------
    row : object
        A single cabin value; anything accepted by ``str()``. Note that a float
        NaN becomes the text ``'nan'`` and is returned as such.
    default : str, optional
        Value returned when the text contains no such run (for example ``'123'``
        or an empty string). Previously this case raised ``IndexError``.

    Returns
    -------
    str
        The matched prefix, or ``default`` when nothing matches.
    """
    letters = re.search(r"[^\W\d]+", str(row))
    return letters.group(0) if letters else default

In [249]:
def get_title(name):
    """Extract the honorific from a name such as ``'Braund, Mr. Owen Harris'``.

    The name is split on ``', '``; the second field is kept and cut at its
    first ``'.'`` (the whole field is returned if it has no ``'.'``).

    Raises
    ------
    IndexError
        If ``name`` does not contain ``', '``.
    """
    second_field = name.split(', ')[1]
    title, _dot, _rest = second_field.partition('.')
    return title

In [250]:
def transform_cols(df):
    """Engineer the model features on a Titanic passenger frame, in place.

    Reads the columns ``Sex``, ``Embarked``, ``Cabin``, ``Fare``, ``Age``,
    ``Name``, ``SibSp`` and ``Parch``. It overwrites ``Sex``, ``Embarked``,
    ``Cabin``, ``Fare`` and ``Age`` and adds ``cabin_name``, ``fare``, ``age``,
    ``title``, ``Fsize`` and ``fsize``.

    Categorical columns are encoded against fixed level lists rather than the
    levels that happen to occur in ``df``. Separate calls (e.g. one for the
    training frame and one for the test frame) therefore always assign the
    same integer to the same category. Missing or unlisted values get ``-1``.
    Any title outside Master/Miss/Mr/Mrs (after ``Ms`` -> ``Miss``) is
    treated as ``Rare``.

    Missing values are filled by assigning the result back to the column
    instead of calling ``fillna(..., inplace=True)`` on ``df.<column>``; the
    chained in-place form does not update ``df`` when pandas Copy-on-Write is
    active.

    Parameters
    ----------
    df : pandas.DataFrame
        Passenger frame with the columns listed above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # Level lists are sorted, so the codes match what
    # ``astype('category').cat.codes`` yields when every level is present.
    sex_levels = ['female', 'male']
    port_levels = ['C', 'Q', 'S']
    # 'ABC' is the prefix of the 'ABC123' placeholder used for missing cabins.
    deck_levels = ['A', 'ABC', 'B', 'C', 'D', 'E', 'F', 'G', 'T']
    title_levels = ['Master', 'Miss', 'Mr', 'Mrs', 'Rare']

    def _codes(values, levels):
        """Position of each value in ``levels``; -1 for missing/unlisted values."""
        return pd.Categorical(values, categories=levels).codes

    def _bucket(column, rules):
        """(Re)create ``df[column]`` as NaN, then write each label where its mask holds."""
        df[column] = np.nan
        for mask, label in rules:
            df.loc[mask, column] = label

    # Sex
    df['Sex'] = _codes(df['Sex'], sex_levels)

    # Embarked (missing ports stay -1)
    df['Embarked'] = _codes(df['Embarked'], port_levels)

    # Cabin: letter prefix of the cabin, with a placeholder for missing cabins
    df['Cabin'] = df['Cabin'].fillna('ABC123')
    df['cabin_name'] = _codes(df['Cabin'].apply(split_cabin), deck_levels)

    # Fare: three bands; negative fares match no band and stay NaN
    df['Fare'] = df['Fare'].fillna(1)
    fare = df['Fare']
    _bucket('fare', [
        ((fare >= 0) & (fare <= 10), 1),
        ((fare > 10) & (fare <= 100), 2),
        (fare > 100, 3),
    ])

    # Age: fill gaps with the median of the positive ages in this frame, then band
    df['Age'] = df['Age'].fillna(df.loc[df['Age'] > 0, 'Age'].median())
    age = df['Age']
    _bucket('age', [
        (age <= 12, 1),
        ((age > 12) & (age <= 18), 2),
        ((age > 18) & (age <= 34), 3),
        ((age > 34) & (age <= 50), 4),
        (age > 50, 5),
    ])

    # Name: honorific, with 'Ms' folded into 'Miss' and everything uncommon into 'Rare'
    title = df['Name'].apply(get_title).replace(['Ms'], 'Miss')
    title = title.where(title.isin(title_levels), 'Rare')
    df['title'] = _codes(title, title_levels)

    # Family: passenger + siblings/spouses + parents/children, then banded
    df['Fsize'] = df['SibSp'] + df['Parch'] + 1
    family = df['Fsize']
    _bucket('fsize', [
        (family == 1, 1),
        (family == 2, 2),
        ((family >= 3) & (family <= 4), 3),
        (family > 4, 4),
    ])

    return df

In [251]:
def drop_cols(df):
    """Remove the raw columns that the engineered features replace.

    Drops ``Ticket``, ``Cabin``, ``Name``, ``Fare``, ``PassengerId``, ``Age``
    and ``Fsize`` from ``df`` in place, one at a time in that order, and
    returns the same frame. A missing column raises ``KeyError``; columns
    earlier in the list have already been removed by then.
    """
    obsolete = ('Ticket', 'Cabin', 'Name', 'Fare', 'PassengerId', 'Age', 'Fsize')
    for column in obsolete:
        df.drop(column, axis=1, inplace=True)
    return df

In [252]:
# Engineer the features on the training frame, then discard the raw columns.
train_df = drop_cols(transform_cols(train_df))

In [253]:
# Same two steps for the test frame.
test_df = drop_cols(transform_cols(test_df))

In [254]:
# Sanity check: both frames should have the same number of columns after
# transform_cols() / drop_cols().
train_df.shape, test_df.shape

((891, 10), (418, 10))

In [255]:
# Make every encoded value non-negative without merging categories.
# Category codes use -1 for a missing value (e.g. a passenger with no Embarked entry);
# abs() would turn that -1 into 1 and silently relabel those rows as category 1.
# Subtracting each column's minimum, taken over train and test together, keeps every
# category distinct and applies the same offset to both frames.
# NOTE(review): abs() was presumably here because older OneHotEncoder versions reject
# negative integers — confirm.
column_floor = pd.concat([train_df, test_df], axis=0).min()
train_df = train_df - column_floor
test_df = test_df - column_floor

In [256]:
# Remember where the training rows end so the stacked frame can be split again later.
train_objs_num = train_df.shape[0]
# Stack train on top of test and pass the result through get_dummies.
full_data = pd.get_dummies(pd.concat([train_df, test_df], axis=0))

In [257]:
# One-hot encode every column of the stacked train+test frame; fitting on both halves
# at once gives them the same output columns.
# NOTE(review): this overwrites full_data, so re-running the cell feeds the already
# encoded matrix back into a new encoder.
full_data = OneHotEncoder().fit_transform(full_data)

In [258]:
# Undo the stacking: the first train_objs_num rows are the training set, the rest the test set.
train_df, test_df = full_data[:train_objs_num], full_data[train_objs_num:]

In [259]:
# Sanity check: row counts must match the original frames, and both halves must have
# the same number of encoded columns.
train_df.shape, test_df.shape

((891, 50), (418, 50))

In [260]:
# Hold out 20% of the labelled rows for validation; the seed fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df,
    y,
    test_size=0.2,
    random_state=42,
)

In [261]:
# Sanity check: feature and label splits must have matching row counts.
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((712, 50), (179, 50), (712,), (179,))

In [262]:
# Hyper-parameter grid for the search: forest sizes 1..39 and split thresholds 2..9.
parameters = {
    'n_estimators': list(range(1, 40)),
    'min_samples_split': list(range(2, 10)),
}

In [263]:
# Base estimator handed to the grid search; the grid supplies n_estimators and
# min_samples_split.
m = RandomForestClassifier(
    n_jobs=-1,
    oob_score=True,
    random_state=42,
)

In [264]:
# Exhaustive search over `parameters`, using GridSearchCV's default CV and scoring.
clf = GridSearchCV(estimator=m, param_grid=parameters)

In [265]:
# Fit and cross-validate every combination in `parameters` on the training split only.
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=True, random_state=42,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [266]:
# Parameter combination that the search selected.
clf.best_params_

{'min_samples_split': 9, 'n_estimators': 34}

In [267]:
# Build the final model from whatever the grid search selected rather than from numbers
# copied by hand out of a previous output, so this cell cannot go stale when the search
# is re-run. best_params_ holds exactly the keys of the grid: n_estimators and
# min_samples_split.
m = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=42, **clf.best_params_)

In [268]:
# Fit the final forest on the training split; the validation rows stay held out.
m.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=9, min_weight_fraction_leaf=0.0,
            n_estimators=34, n_jobs=-1, oob_score=True, random_state=42,
            verbose=0, warm_start=False)

In [269]:
# Predict the held-out validation rows.
y_pred = m.predict(X_valid)

In [270]:
# Validation accuracy. accuracy_score is documented as (y_true, y_pred); the value is
# the same in either order, but following the documented order keeps this call
# consistent with metrics where the order matters.
accuracy_score(y_valid, y_pred)

0.83240223463687146

In [271]:
# Predict the encoded test rows (test_df is the lower part of the one-hot matrix here).
test_pred = m.predict(test_df)

In [272]:
# Two-column frame: the saved test PassengerIds next to the model's predictions.
submission = pd.DataFrame(
    {"PassengerId": test_passenger_id, "Survived": test_pred}
)

In [273]:
# Write the submission without the index column.
# NOTE(review): the target is an absolute path inside one user's home directory, so this
# cell fails on any other machine — consider a path relative to the notebook or a
# configurable output directory.
submission.to_csv("/home/raavan/Downloads/gender_submission_5.csv", index=False)