In [41]:
# Re-import edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures directly in the cell output.
%matplotlib inline
import warnings
# NOTE(review): this hides every warning for the rest of the session, including
# deprecation and data-quality warnings from pandas / scikit-learn.
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [76]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder

In [111]:
# Read the training split and move the target column out of the feature frame:
# pop() returns 'Survived' as a Series and removes it from train_df in one step.
train_df = pd.read_csv("dataset/titanic/train.csv", low_memory=False)
y = train_df.pop('Survived')

# The test split is loaded unchanged.
test_df = pd.read_csv("dataset/titanic/test.csv", low_memory=False)

In [4]:
# Keep the ids now: drop_cols() removes 'PassengerId' from the frame later,
# and the submission file needs them.
test_passenger_id = test_df['PassengerId']

# Only the last expression of a cell is displayed, so the preview goes last.
train_df.tail()

In [5]:
# (rows, columns) of the train and test frames.
train_df.shape, test_df.shape

((891, 11), (418, 11))

In [6]:
def split_cabin(row, default='ABC'):
    """Return the first run of letters (or underscores) found in a cabin value.

    Examples: ``'C85' -> 'C'``, ``'F G73' -> 'F'``, ``'ABC123' -> 'ABC'``.

    Parameters
    ----------
    row : object
        A single cabin value; it is converted with ``str()`` before matching.
    default : str, optional
        Label returned when ``row`` is ``None``, a float NaN, or contains no
        letters at all. The default ``'ABC'`` is the label produced by the
        ``'ABC123'`` placeholder that ``transform_cols`` uses for missing cabins.

    Returns
    -------
    str
        The matched letters, or ``default`` when there is nothing to match.
    """
    # A float NaN is the only float that is not equal to itself. Without this
    # guard str(nan) would yield the bogus label 'nan' (and None -> 'None').
    if row is None or (isinstance(row, float) and row != row):
        return default

    # [^\W\d] = "word character that is not a digit"; search() stops at the
    # first hit and returns None instead of raising when there is no match.
    match = re.search(r"[^\W\d]+", str(row))
    return match.group(0) if match else default

In [7]:
def transform_cols(df):
    """Encode the categorical columns and add bucketed features to a Titanic frame.

    The frame is modified in place and also returned. It must contain the
    columns ``Sex``, ``Embarked``, ``Cabin``, ``Fare`` and ``Age``.

    What changes:
      * ``Sex`` and ``Embarked`` are replaced by integer category codes.
        Missing ``Embarked`` values are first filled with the most frequent
        value so they do not end up as the ``-1`` "missing" code.
      * ``Cabin`` gaps are filled with the placeholder ``'ABC123'``; the new
        column ``cabin_name`` holds the category code of ``split_cabin(Cabin)``.
      * ``Fare`` gaps are filled with ``1``; the new column ``fare`` is
        1 for [0, 10], 2 for (10, 100] and 3 for > 100 (negative fares stay NaN).
      * ``Age`` gaps are filled with the median of the positive ages; the new
        column ``age`` is 1 (<= 12), 2 (<= 18), 3 (<= 34), 4 (<= 50) or 5 (> 50).

    Note: category codes are derived from the values present in *this* frame,
    so two frames only receive matching codes when they contain the same set
    of values in the encoded columns.

    Every fill is written back with ``df[col] = ...``. Calling
    ``df.col.fillna(..., inplace=True)`` operates on an intermediate Series,
    which pandas 2.x warns about and which no longer updates the frame once
    Copy-on-Write is active.
    """
    # Sex
    df['Sex'] = df['Sex'].astype('category').cat.codes

    # Embarked: impute before encoding, like every other nullable column here.
    embarked = df['Embarked']
    most_common_port = embarked.mode()  # empty only if the whole column is missing
    if not most_common_port.empty:
        embarked = embarked.fillna(most_common_port.iloc[0])
    df['Embarked'] = embarked.astype('category').cat.codes

    # Cabin
    df['Cabin'] = df['Cabin'].fillna('ABC123')
    df['cabin_name'] = df['Cabin'].apply(split_cabin).astype('category').cat.codes

    # Fare
    df['Fare'] = df['Fare'].fillna(1)
    fare = df['Fare']
    df.loc[(fare >= 0) & (fare <= 10), 'fare'] = 1
    df.loc[(fare > 10) & (fare <= 100), 'fare'] = 2
    df.loc[fare > 100, 'fare'] = 3

    # Age: the `> 0` filter also excludes NaN, so the median only sees real ages.
    known_ages = df.loc[df['Age'] > 0, 'Age']
    df['Age'] = df['Age'].fillna(known_ages.median())
    age = df['Age']
    df.loc[age <= 12, 'age'] = 1
    df.loc[(age > 12) & (age <= 18), 'age'] = 2
    df.loc[(age > 18) & (age <= 34), 'age'] = 3
    df.loc[(age > 34) & (age <= 50), 'age'] = 4
    df.loc[age > 50, 'age'] = 5

    return df

In [8]:
def drop_cols(df):
    """Remove the raw columns that are not used as model features.

    Drops ``Ticket``, ``Cabin``, ``Name``, ``Fare``, ``PassengerId`` and
    ``Age`` one after another, in place, and returns the same frame.
    A missing column raises ``KeyError`` at the point it is reached.
    """
    for column in ('Ticket', 'Cabin', 'Name', 'Fare', 'PassengerId', 'Age'):
        df.drop(column, axis=1, inplace=True)
    return df

In [9]:
# Both splits go through the same two steps: derive the encoded/bucketed
# features first, then strip the raw columns they were derived from.
train_df = drop_cols(transform_cols(train_df))
test_df = drop_cols(transform_cols(test_df))

In [84]:
# (rows, columns) of the train and test feature sets.
train_df.shape, test_df.shape

((891, 39), (418, 39))

In [81]:
# Category codes use -1 for missing values; abs() makes every value
# non-negative before encoding. Both splits get the same treatment.
# NOTE(review): abs() maps a -1 "missing" code onto category code 1 - confirm
# that merging the two is acceptable.
train_df = train_df.abs()
test_df = test_df.abs()

# Fit ONE encoder on the training features and reuse it for the test features.
# A second encoder fitted on the test frame builds its columns from the test
# set's own values, so its output does not line up with the columns the model
# was trained on, even when the column counts happen to match.
# handle_unknown='ignore' encodes values never seen in training as all zeros
# instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore')
train_df = encoder.fit_transform(train_df)
test_df = encoder.transform(test_df)

In [85]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df,
    y,
    test_size=0.2,
    random_state=42,
)

In [86]:
# Shapes of the four pieces of the split: features and labels must have matching row counts.
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((712, 39), (179, 39), (712,), (179,))

In [87]:
# Search grid: odd tree counts 1..39 and even split thresholds 2..8.
parameters = {
    'n_estimators': list(range(1, 40, 2)),
    'min_samples_split': list(range(2, 10, 2)),
}

In [88]:
# Base estimator for the grid search: seeded, using all CPU cores, with
# out-of-bag scoring switched on.
m = RandomForestClassifier(
    n_jobs=-1,
    oob_score=True,
    random_state=42,
)

In [89]:
# Exhaustive search over `parameters`, using the library's default
# cross-validation and scoring.
clf = GridSearchCV(estimator=m, param_grid=parameters)

In [90]:
# Run the search on the training portion only; the validation rows are not passed in.
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=True, random_state=42,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39], 'min_samples_split': [2, 4, 6, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [91]:
# Parameter combination that scored best in the grid search.
clf.best_params_

{'min_samples_split': 6, 'n_estimators': 7}

In [106]:
# Build the final forest from the grid-search winner rather than numbers
# copied by hand from an earlier output, so it cannot go stale when the
# search is re-run with different data, grid or library version.
m = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=42, **clf.best_params_)

In [107]:
# Train the final forest on the training portion of the split.
m.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=6, min_weight_fraction_leaf=0.0,
            n_estimators=7, n_jobs=-1, oob_score=True, random_state=42,
            verbose=0, warm_start=False)

In [108]:
# Predictions for the held-out validation rows.
y_pred = m.predict(X_valid)

In [109]:
# accuracy_score is declared as (y_true, y_pred): ground truth goes first.
# Accuracy itself is symmetric, so the number is unchanged, but the call now
# matches the documented order used by every other sklearn metric.
accuracy_score(y_valid, y_pred)

0.78212290502793291

In [103]:
# Predictions for the test features; used as the 'Survived' column of the submission.
test_pred = m.predict(test_df)

In [104]:
# Pair each saved passenger id with its predicted label.
submission = pd.DataFrame(
    {"PassengerId": test_passenger_id, "Survived": test_pred}
)

In [105]:
# Resolve the target against the current user's home directory instead of a
# path that only exists for one account on one machine.
submission.to_csv(Path.home() / "Downloads" / "gender_submission_3.csv", index=False)

In [112]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [113]:
# Frequency of each SibSp value; dropna=False would list missing values as well.
train_df.SibSp.value_counts(dropna=False)

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [114]:
# Frequency of each Parch value; dropna=False would list missing values as well.
train_df.Parch.value_counts(dropna=False)

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64