In [1]:
import pandas as pd
import numpy as np

# Load the labelled training data from a CSV file given by relative path.
train = pd.read_csv('train.csv')
# The frame used for final predictions ("holdout") is read from 'test.csv'.
holdout = pd.read_csv('test.csv')

# Preview the first rows of the holdout frame.
holdout.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [2]:
# %load functions.py
def process_missing(df):
    """Fill the gaps in the ``Fare`` and ``Embarked`` columns.

    Missing fares are replaced by the mean fare of the module-level ``train``
    frame (not of ``df`` itself); missing embarkation ports are replaced by
    ``"S"``. The frame is modified in place and also returned.

    Usage
    ------

    holdout = process_missing(holdout)
    """
    # The fare replacement is read from the global `train`, so `train` must
    # already exist when this function is called.
    replacements = {
        "Fare": train["Fare"].mean(),
        "Embarked": "S",
    }
    for column, value in replacements.items():
        df[column] = df[column].fillna(value)
    return df

def process_age(df):
    """Bucket the ``Age`` column into named age bands.

    Missing ages are first replaced by ``-0.5`` so that they land in the
    ``"Missing"`` band. The result is stored in a new ``Age_categories``
    column; ``Age`` itself keeps the filled values. The frame is modified in
    place and also returned.

    Usage
    ------

    train = process_age(train)
    """
    # (upper edge, label) pairs; every band is open on the left and closed on
    # the right, starting from a lower edge of -1.
    bands = [
        (0, "Missing"),
        (5, "Infant"),
        (12, "Child"),
        (18, "Teenager"),
        (35, "Young Adult"),
        (60, "Adult"),
        (100, "Senior"),
    ]
    edges = [-1] + [upper for upper, _ in bands]
    names = [label for _, label in bands]

    df["Age"] = df["Age"].fillna(-0.5)
    df["Age_categories"] = pd.cut(df["Age"], edges, labels=names)
    return df

def process_fare(df):
    """Bucket the ``Fare`` column into named price bands.

    The result is stored in a new ``Fare_categories`` column. The frame is
    modified in place and also returned.

    Usage
    ------

    train = process_fare(train)
    """
    # (upper edge, label) pairs; every band is open on the left and closed on
    # the right, starting from a lower edge of -1.
    bands = [
        (12, "0-12"),
        (50, "12-50"),
        (100, "50-100"),
        (1000, "100+"),
    ]
    edges = [-1] + [upper for upper, _ in bands]
    names = [label for _, label in bands]

    df["Fare_categories"] = pd.cut(df["Fare"], edges, labels=names)
    return df

def process_cabin(df):
    """Reduce the ``Cabin`` column to a one-letter ``Cabin_type`` column.

    ``Cabin_type`` is the first character of ``Cabin``. Missing cabins and the
    cabin type ``'T'`` are both labelled ``"Unknown"``. ``Cabin_type`` is
    written onto the frame passed in; the returned frame is a new one with
    the original ``Cabin`` column removed.

    Usage
    ------

    train = process_cabin(train)
    """
    first_letter = df["Cabin"].str[0].fillna("Unknown")
    # Fold the 'T' cabin type into the "Unknown" bucket.
    df["Cabin_type"] = first_letter.mask(first_letter == "T", "Unknown")
    return df.drop(columns="Cabin")

def process_titles(df):
    """Extract the honorific from ``Name`` and map it to a coarse category.

    The title is the first run of letters that is preceded by a space and
    followed by a period (e.g. ``"Mr"`` in ``"Kelly, Mr. James"``). It is
    mapped through a fixed lookup table to one of ``Mr``, ``Mrs``, ``Miss``,
    ``Master``, ``Officer`` or ``Royalty`` and stored in a new ``Title``
    column. Titles that are not in the lookup table, and names with no
    match, end up as NaN. The frame is modified in place and also returned.

    Usage
    ------

    train = process_titles(train)
    """
    titles = {
        "Mr" :         "Mr",
        "Mme":         "Mrs",
        "Ms":          "Mrs",
        "Mrs" :        "Mrs",
        "Master" :     "Master",
        "Mlle":        "Miss",
        "Miss" :       "Miss",
        "Capt":        "Officer",
        "Col":         "Officer",
        "Major":       "Officer",
        "Dr":          "Officer",
        "Rev":         "Officer",
        "Jonkheer":    "Royalty",
        "Don":         "Royalty",
        "Sir" :        "Royalty",
        "Countess":    "Royalty",
        "Dona":        "Royalty",
        "Lady" :       "Royalty"
    }
    # Raw string: the pattern contains `\.`, which is an invalid escape
    # sequence in a normal string literal and triggers a warning on modern
    # Python versions. The pattern itself is unchanged.
    extracted_titles = df["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df["Title"] = extracted_titles.map(titles)
    return df

def create_dummies(df,column_name):
    """Append one-hot (dummy) columns for one or more columns.

    ``column_name`` is used both to select from ``df`` and as the prefix for
    the generated columns, so it may be a single column label or a list of
    labels. The original columns are kept; the dummy columns are appended on
    the right and a new frame is returned.

    Usage
    ------

    train = create_dummies(train,"Age")
    """
    one_hot = pd.get_dummies(df[column_name], prefix=column_name)
    return pd.concat([df, one_hot], axis=1)

In [3]:
def prep_data(df):
    """Run the full preparation pipeline on a passenger frame.

    Applies, in order: missing-value filling, age banding, fare banding,
    cabin-type extraction and title extraction, then appends one-hot columns
    for the resulting categorical columns and ``Sex``. Returns the prepared
    frame.
    """
    cleaning_steps = (
        process_missing,
        process_age,
        process_fare,
        process_cabin,
        process_titles,
    )
    for step in cleaning_steps:
        df = step(df)

    encoded_columns = ['Age_categories','Fare_categories','Title','Cabin_type','Sex']
    return create_dummies(df, encoded_columns)

# Run the same preparation pipeline on both frames.
train = prep_data(train)
holdout = prep_data(holdout)
# NOTE(review): process_cabin relabels cabin type 'T' as 'Unknown', so
# prep_data does not appear to create a 'Cabin_type_T' column on either
# frame. This constant column looks like a leftover - confirm before removing.
holdout['Cabin_type_T'] = 0

In [4]:
def isalone(df):
    """Add ``family_size`` and ``isalone`` features to a passenger frame.

    ``isalone`` is 1 where ``SibSp + Parch`` is zero and 0 otherwise.
    ``family_size`` is ``SibSp + Parch`` min-max scaled to the range [0, 1]
    using the minimum and maximum of *this* frame, so two frames with
    different ranges are scaled differently. If every row has the same
    family size, ``family_size`` is set to 0.0 instead of dividing by zero.
    The frame is modified in place and also returned.

    Usage
    ------

    train = isalone(train)
    """
    relatives = df[['SibSp','Parch']].sum(axis=1)

    # Create the columns in the original order: family_size, then isalone.
    df['family_size'] = relatives
    df['isalone'] = (relatives == 0).astype(int)

    lowest = relatives.min()
    spread = relatives.max() - lowest
    if spread == 0:
        # Constant column: min-max scaling would produce 0/0 = NaN.
        scaled = relatives * 0.0
    else:
        scaled = (relatives - lowest) / spread

    # Replace the whole column rather than writing floats into the existing
    # integer column through .loc, which is an upcasting setitem that newer
    # pandas versions deprecate.
    df['family_size'] = scaled
    return df

# Add the family_size / isalone features to both frames.
train = isalone(train)
holdout = isalone(holdout)

In [5]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

def select_features(df,classifier):
    """Pick feature columns with cross-validated recursive feature elimination.

    Only numeric columns without any missing value are considered.
    ``PassengerId`` and ``Survived`` are excluded from the candidates, and
    ``Survived`` is used as the target. ``classifier`` is handed to ``RFECV``
    with ``cv=10``.

    Returns a list with the names of the columns that the fitted selector
    marks as supported.
    """
    numeric_complete = df.select_dtypes([np.number]).dropna(axis=1)
    target = numeric_complete["Survived"]
    candidates = numeric_complete.drop(columns=['PassengerId','Survived'])

    rfecv = RFECV(classifier,cv=10)
    rfecv.fit(candidates,target)

    # support_ is a boolean mask aligned with the candidate columns.
    return [name for name, keep in zip(candidates.columns, rfecv.support_) if keep]

# Feature selection on the training frame with a default-constructed random forest.
# NOTE(review): no random_state is passed, so the selected columns may differ
# between runs - confirm whether reproducibility matters here.
best_features = select_features(train,RandomForestClassifier())




In [6]:
import warnings
# Silence every warning from here on.
# NOTE(review): this presumably also hides warnings raised during the model
# search in this cell (e.g. from the estimators) - confirm that is intended.
warnings.filterwarnings('ignore')

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.svm import SVC

def select_model(df):
    """Grid-search several classifiers on ``df`` and report the best of each.

    For every candidate model a ``GridSearchCV`` with ``cv=10`` is fitted
    against ``df.Survived``. LogisticRegression and RandomForestClassifier
    are trained on the columns returned by ``select_features``; the
    KNeighborsClassifier, SVC and BernoulliNB candidates are trained on every
    numeric column without missing values (minus ``PassengerId`` and
    ``Survived``).

    The model name, best score and best parameters are printed for each
    candidate. Returns the list of candidate dicts, each extended with the
    keys ``best_features``, ``best_params``, ``best_score`` and
    ``best_model``.
    """
    candidates = [
        {
            'name': 'LogisticRegression',
            'estimator': LogisticRegression(),
            'hyperparameters': {
                'solver': ['newton-cg','lbfgs','liblinear'],
            },
        },
        {
            'name': 'KNeighborsClassifier',
            'estimator': KNeighborsClassifier(),
            'hyperparameters': {
                'n_neighbors': range(1,20,2),
                'weights': ['distance','uniform'],
                'algorithm': ['ball_tree','kd_tree','brute'],
                'p': [1,2],
            },
        },
        {
            'name': 'RandomForestClassifier',
            'estimator': RandomForestClassifier(),
            'hyperparameters': {
                'n_estimators': [4,6,9],
                'criterion': ['entropy', 'gini'],
                'max_depth': [2,5,10],
                'max_features': ['log2','sqrt'],
                'min_samples_leaf': [1,5,8],
                'min_samples_split': [2,3,5],
            },
        },
        {
            'name': 'SVC',
            'estimator': SVC(),
            'hyperparameters': {
                'C': [1,10,100,1000,10000],
                'kernel': ['rbf'],
                'gamma': [0.00001,0.0001,0.001,0.01,0.1],
            },
        },
        {
            'name': 'BernoulliNB',
            'estimator': BernoulliNB(),
            'hyperparameters': {
                'alpha': np.linspace(0,0.1,101),
                'binarize': [0.5],
            },
        },
    ]

    # These candidates skip select_features and use all usable numeric columns.
    uses_all_columns = ('KNeighborsClassifier','SVC','BernoulliNB')

    target = df.Survived

    for candidate in candidates:
        model_name = candidate['name']
        estimator = candidate['estimator']

        if model_name in uses_all_columns:
            numeric_complete = df.select_dtypes([np.number]).dropna(axis=1)
            features = numeric_complete.drop(['PassengerId','Survived'],axis=1)
            candidate['best_features'] = list(features.columns)
        else:
            candidate['best_features'] = select_features(df,estimator)
            features = df[candidate['best_features']]

        print(model_name)
        print('-'*len(model_name))

        search = GridSearchCV(estimator,param_grid=candidate['hyperparameters'],cv=10)
        search.fit(features,target)

        candidate['best_params'] = search.best_params_
        candidate['best_score'] = search.best_score_
        candidate['best_model'] = search.best_estimator_

        print("Best Score: {}".format(candidate["best_score"]))
        print("Best Parameters: {}\n".format(candidate["best_params"]))

    return candidates

# Run the model search on the prepared training frame; the per-model results
# are printed below and kept in optimized_models for the submission step.
optimized_models = select_model(train)
    

LogisticRegression
------------------
Best Score: 0.8215488215488216
Best Parameters: {'solver': 'lbfgs'}

KNeighborsClassifier
--------------------
Best Score: 0.7755331088664422
Best Parameters: {'algorithm': 'kd_tree', 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}

RandomForestClassifier
----------------------
Best Score: 0.8428731762065096
Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 6}

SVC
---
Best Score: 0.8294051627384961
Best Parameters: {'C': 10000, 'gamma': 1e-05, 'kernel': 'rbf'}

BernoulliNB
-----------
Best Score: 0.7856341189674523
Best Parameters: {'alpha': 0.0, 'binarize': 0.5}



In [8]:
def save_submission_file(model,best_features,
                         filename='submission.csv',
                         holdout_df=None):
    """Predict on a holdout frame and write the predictions to a CSV file.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    best_features : list of column names passed to ``model.predict``.
    filename : path of the CSV file to write (default ``'submission.csv'``).
    holdout_df : frame to predict on. When omitted, the module-level
        ``holdout`` frame is used, as before.

    The written file has two columns, ``PassengerId`` and ``Survived``, and
    no index column.
    """
    if holdout_df is None:
        # Backward-compatible default: fall back to the notebook's global frame.
        holdout_df = holdout

    holdout_predictions = model.predict(holdout_df[best_features])
    submission = pd.DataFrame({'PassengerId':holdout_df.PassengerId,
                               'Survived':holdout_predictions})
    submission.to_csv(filename,index=False)

# Index 0 is the first candidate in select_model's list, i.e. the
# LogisticRegression entry; change the index to submit a different model.
model = optimized_models[0]['best_model']
# Use the feature columns that this model was trained on.
best_features = optimized_models[0]['best_features']
save_submission_file(model,best_features)



Random Forest gave the best accuracy on Kaggle (kaggle.com): 79.4%.
SVC gave a score of 78.5%.