In [26]:
import pandas as pd
import numpy as np

# Read the training data and the hold-out data from CSV files located in the
# current working directory.
train = pd.read_csv('train.csv')
holdout = pd.read_csv('test.csv')

# Show the first rows of the hold-out frame as this cell's output.
holdout.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [27]:
# %load functions.py
def process_missing(df, fare_fill=None):
    """Fill missing values in the Fare and Embarked columns

    Parameters
    ----------
    df : DataFrame with "Fare" and "Embarked" columns.
    fare_fill : value used for missing fares. When omitted, the mean of the
        notebook-level ``train["Fare"]`` is used, so the training and
        hold-out frames are filled with the same value.

    Returns
    -------
    A new DataFrame; the frame passed in is left untouched.

    Usage
    ------

    holdout = process_missing(holdout)
    """
    if fare_fill is None:
        # Fallback keeps the original behaviour: it relies on the global
        # `train` frame existing when the function is called.
        fare_fill = train["Fare"].mean()
    # Work on a copy so re-running a cell never sees a half-processed input.
    df = df.copy()
    df["Fare"] = df["Fare"].fillna(fare_fill)
    df["Embarked"] = df["Embarked"].fillna("S")
    return df

def process_age(df):
    """Bucket the Age column into labelled age bands

    Missing ages are replaced with -0.5 so that they land in the "Missing"
    band. The frame is modified in place (the "Age" column is overwritten and
    an "Age_categories" column is added) and also returned.

    Usage
    ------

    train = process_age(train)
    """
    # (upper edge, label) of each right-closed band; the first band opens at -1.
    age_bands = [
        (0, "Missing"),
        (5, "Infant"),
        (12, "Child"),
        (18, "Teenager"),
        (35, "Young Adult"),
        (60, "Adult"),
        (100, "Senior"),
    ]
    edges = [-1] + [upper for upper, _ in age_bands]
    names = [label for _, label in age_bands]

    filled_age = df["Age"].fillna(-0.5)
    df["Age"] = filled_age
    df["Age_categories"] = pd.cut(filled_age, bins=edges, labels=names)
    return df

def process_fare(df):
    """Bucket the Fare column into labelled price bands

    Adds a "Fare_categories" column to the frame in place and returns the
    same frame.

    Usage
    ------

    train = process_fare(train)
    """
    # label -> upper edge of its right-closed band; the first band opens at -1.
    fare_bands = {
        "0-12": 12,
        "12-50": 50,
        "50-100": 100,
        "100+": 1000,
    }
    edges = [-1, *fare_bands.values()]
    df["Fare_categories"] = pd.cut(df["Fare"], bins=edges, labels=list(fare_bands))
    return df

def process_cabin(df):
    """Derive a Cabin_type column from the first character of Cabin

    Rows without a cabin value get the type "Unknown". The "Cabin_type"
    column is added to the frame that was passed in; the returned frame is a
    new one with the original "Cabin" column removed.

    Usage
    ------

    train = process_cabin(train)
    """
    first_letter = df["Cabin"].str[0]
    df["Cabin_type"] = first_letter.fillna("Unknown")
    return df.drop(columns=["Cabin"])

def process_titles(df):
    """Extract and categorize the title from the name column

    The title is the first run of letters in "Name" that is preceded by a
    space and followed by a period (e.g. "Mr" in "Kelly, Mr. James"). It is
    mapped onto a smaller set of groups; names with no match, or with a title
    that is not in the mapping, get NaN. The "Title" column is added to the
    frame in place and the same frame is returned.

    Usage
    ------

    train = process_titles(train)
    """
    titles = {
        "Mr" :         "Mr",
        "Mme":         "Mrs",
        "Ms":          "Mrs",
        "Mrs" :        "Mrs",
        "Master" :     "Master",
        "Mlle":        "Miss",
        "Miss" :       "Miss",
        "Capt":        "Officer",
        "Col":         "Officer",
        "Major":       "Officer",
        "Dr":          "Officer",
        "Rev":         "Officer",
        "Jonkheer":    "Royalty",
        "Don":         "Royalty",
        "Sir" :        "Royalty",
        "Countess":    "Royalty",
        "Dona":        "Royalty",
        "Lady" :       "Royalty"
    }
    # Raw string: "\." in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    extracted_titles = df["Name"].str.extract(r' ([A-Za-z]+)\.',expand=False)
    df["Title"] = extracted_titles.map(titles)
    return df

def create_dummies(df,column_name):
    """Create Dummy Columns (One Hot Encoding) from one or more Columns

    Parameters
    ----------
    df : source DataFrame.
    column_name : a single column label, or a list of labels. Every listed
        column is encoded, whatever its dtype, and its label is used as the
        prefix of the generated columns ("<label>_<value>").

    Returns
    -------
    A new DataFrame made of the original columns followed by the dummy
    columns. The dummies are stored as uint8 so that they count as numeric
    columns (newer pandas versions default to bool, which numeric-only
    selections such as ``select_dtypes([np.number])`` leave out).

    Usage
    ------

    train = create_dummies(train,"Age")
    train = create_dummies(train,["Sex","Title"])
    """
    # A tuple is treated as one label, matching how df[...] would read it.
    if pd.api.types.is_list_like(column_name) and not isinstance(column_name, tuple):
        columns = list(column_name)
    else:
        columns = [column_name]
    dummies = pd.get_dummies(df[columns],prefix=columns,columns=columns,dtype=np.uint8)
    return pd.concat([df,dummies],axis=1)

In [28]:
def prep_data(df):
    """Run every preprocessing step on a frame and one-hot encode the results

    The steps run in a fixed order: missing values, age bands, fare bands,
    cabin type, titles, and finally dummy columns for the categorical
    features.

    Usage
    ------

    train = prep_data(train)
    """
    encoded_columns = ['Age_categories','Fare_categories','Title','Cabin_type','Sex']
    pipeline = (process_missing, process_age, process_fare, process_cabin, process_titles)
    for step in pipeline:
        df = step(df)
    return create_dummies(df,encoded_columns)

# Run the same preparation pipeline on both frames and rebind each name to
# the frame that prep_data returns.
train = prep_data(train)
holdout = prep_data(holdout)

In [29]:
def isalone(df):
    """Add family-size features derived from SibSp and Parch

    Two columns are added to the frame in place, and the same frame is
    returned:

    * ``family_size`` - SibSp + Parch, min-max scaled to the range [0, 1]
      using the minimum and maximum found in ``df`` itself.
    * ``isalone`` - 1 when SibSp + Parch is 0, otherwise 0.

    Usage
    ------

    train = isalone(train)
    """
    relatives = df[['SibSp','Parch']].sum(axis=1)
    lowest = relatives.min()
    spread = relatives.max() - lowest
    if spread > 0:
        scaled = (relatives - lowest) / spread
    else:
        # Every row has the same family size: avoid dividing by zero.
        scaled = relatives * 0.0
    # Plain column assignment replaces the column; writing floats through
    # df.loc[:, col] into an integer column is deprecated in recent pandas.
    df['family_size'] = scaled
    df['isalone'] = (relatives == 0).astype('int64')
    return df

# Add the family_size and isalone columns to both frames. Each frame is
# scaled with its own minimum and maximum family size.
train = isalone(train)
holdout = isalone(holdout)

In [35]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

def select_features(df,classifier):
    """Pick feature columns with recursive feature elimination

    Only numeric columns without missing values are considered, and
    "PassengerId" and "Survived" are excluded from the candidates. An RFECV
    selector with 10-fold cross-validation is fitted with ``classifier``
    against the "Survived" column.

    Returns
    -------
    list of the column names the fitted selector marks as supported.

    Usage
    ------

    best_features = select_features(train,RandomForestClassifier())
    """
    numeric_df = df.select_dtypes([np.number]).dropna(axis=1)
    target = numeric_df["Survived"]
    predictors = numeric_df.drop(columns=['PassengerId','Survived'])

    rfecv = RFECV(classifier,cv=10)
    rfecv.fit(predictors,target)

    kept_mask = rfecv.support_
    return list(predictors.columns[kept_mask])

# Select features on the training frame with a default RandomForestClassifier.
# NOTE(review): no random_state is passed to the classifier, so the selected
# columns may differ between runs - confirm whether a fixed seed is wanted.
best_features = select_features(train,RandomForestClassifier())




In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

def select_model(df):
    """Grid-search each active candidate model and report the best settings

    For every entry in the candidate list a 10-fold GridSearchCV is fitted
    against the "Survived" column. Candidates named KNeighborsClassifier, SVC
    or BernoulliNB are trained on every numeric, fully populated column
    except "PassengerId" and "Survived"; any other candidate is trained on
    the columns returned by select_features. The name, best score and best
    parameters of each candidate are printed.

    Returns
    -------
    The list of candidate dicts, each extended with the keys
    'best_features', 'best_params', 'best_score' and 'best_model'.

    Usage
    ------

    optimized_models = select_model(train)
    """
    # Commented-out entries are candidates that are currently switched off.
    candidates = [
#         {'name':'LogisticRegression',
#          'estimator':LogisticRegression(),
#          'hyperparameters':{
#              'solver':['newton-cg','lbfgs','liblinear']
#          }
#         },
#         {'name':'KNeighborsClassifier',
#          'estimator':KNeighborsClassifier(),
#          'hyperparameters':{
#              'n_neighbors': range(1,20,2),
#              'weights': ['distance','uniform'],
#              'algorithm': ['ball_tree','kd_tree','brute'],
#              'p': [1,2]
#          }
#         },
#         {'name':'RandomForestClassifier',
#          'estimator':RandomForestClassifier(),
#          'hyperparameters':{
#              'n_estimators': [4,6,9],
#              'criterion': ['entropy', 'gini'],
#              'max_depth': [2,5,10],
#              'max_features': ['log2','sqrt'],
#              'min_samples_leaf': [1,5,8],
#              'min_samples_split': [2,3,5]
#          }
#         },
        {'name':'SVC',
         'estimator':SVC(),
         'hyperparameters':{
             'C': [0.1,1],
             'kernel': ['rbf'],
             'gamma': [0.001,0.01,0.1,1],
             #'degree': [3,5]
              }
          }#,
#          {'name':'BernoulliNB',
#          'estimator':BernoulliNB(),
#          'hyperparameters':{
#             'alpha': [0,0.5,1],
#             'binarize': 0.5
#              }
#          }
    ]

    # Candidates that skip RFECV feature selection and use all numeric columns.
    without_rfecv = ('KNeighborsClassifier','SVC','BernoulliNB')
    target = df.Survived

    for candidate in candidates:
        model_name = candidate['name']
        if model_name in without_rfecv:
            numeric_df = df.select_dtypes([np.number]).dropna(axis=1)
            features = numeric_df.drop(['PassengerId','Survived'],axis=1)
            candidate['best_features'] = list(features.columns)
        else:
            candidate['best_features'] = select_features(df,candidate['estimator'])
            features = df[candidate['best_features']]

        print(model_name)
        print('-'*len(model_name))

        search = GridSearchCV(candidate['estimator'],
                              param_grid=candidate['hyperparameters'],
                              cv=10)
        search.fit(features,target)

        candidate['best_params'] = search.best_params_
        candidate['best_score'] = search.best_score_
        candidate['best_model'] = search.best_estimator_
        print("Best Score: {}".format(candidate["best_score"]))
        print("Best Parameters: {}\n".format(candidate["best_params"]))
    return candidates

# Tune the active candidate models on the training frame; the result is the
# list of candidate dicts returned by select_model.
optimized_models = select_model(train)
    

SVC
---


In [None]:
def save_submission_file(model,best_features,
                         filename='submission.csv',holdout_df=None):
    """Predict on the hold-out data and write the predictions to a CSV file

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    best_features : list of column names passed to ``model.predict``.
    filename : path of the CSV file to write (no index column is written).
    holdout_df : frame to predict on; it must contain "PassengerId" and the
        columns in ``best_features``. When omitted, the notebook-level
        ``holdout`` frame is used.

    The file has two columns: "PassengerId" and "Survived".

    Usage
    ------

    save_submission_file(model,best_features)
    """
    if holdout_df is None:
        # Fallback keeps the original behaviour of reading the global frame.
        holdout_df = holdout
    holdout_predictions = model.predict(holdout_df[best_features])
    submission = pd.DataFrame({'PassengerId':holdout_df['PassengerId'],
                               'Survived':holdout_predictions})
    submission.to_csv(filename,index=False)

# Submit the candidate with the highest cross-validated score. The list is
# named `optimized_models`, and a fixed position such as [2] is out of range
# whenever fewer than three candidates are active in select_model.
best_candidate = max(optimized_models, key=lambda entry: entry['best_score'])
model = best_candidate['best_model']
best_features = best_candidate['best_features']
save_submission_file(model,best_features)



In [24]:
# Allow up to 100 columns and 100 rows in rendered frames, then show the
# summary statistics of the training frame.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Age_categories_Missing,Age_categories_Infant,Age_categories_Child,Age_categories_Teenager,Age_categories_Young Adult,Age_categories_Adult,Age_categories_Senior,Fare_categories_0-12,Fare_categories_12-50,Fare_categories_50-100,Fare_categories_100+,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Cabin_type_A,Cabin_type_B,Cabin_type_C,Cabin_type_D,Cabin_type_E,Cabin_type_F,Cabin_type_G,Cabin_type_T,Cabin_type_Unknown,Sex_female,Sex_male,family_size,isalone
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,23.699966,0.523008,0.381594,32.204208,0.198653,0.049383,0.028058,0.078563,0.401796,0.218855,0.024691,0.419753,0.400673,0.12009,0.059484,0.044893,0.20651,0.580247,0.142536,0.020202,0.005612,0.016835,0.05275,0.066218,0.037037,0.035915,0.01459,0.004489,0.001122,0.771044,0.352413,0.647587,0.09046,0.602694
std,257.353842,0.486592,0.836071,17.731181,1.102743,0.806057,49.693429,0.39921,0.216787,0.165232,0.269207,0.490536,0.413702,0.15527,0.493796,0.49031,0.325249,0.236661,0.207186,0.405028,0.493796,0.349796,0.14077,0.074743,0.128725,0.223659,0.248802,0.188959,0.186182,0.119973,0.06689,0.033501,0.420397,0.47799,0.47799,0.161346,0.489615
min,1.0,0.0,1.0,-0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,6.0,0.0,0.0,7.9104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,446.0,0.0,3.0,24.0,0.0,0.0,14.4542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.1,1.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
