### Class 3 Lab

Approximate Time:  30-40 minutes

**Your Instructions**:  Now that we've submitted two different baseline versions of our model, see if you can go ahead and find some improvements to be made.  

Some ideas to try out:

 - Adding in new variables, like Age, Fare, Embarked, etc, and see how they do.  
 - Transforming variables like Name, Cabin, and possibly Ticket to make them more manageable and easy to discern
 - Creating variables to test whether or not someone was alone, whether or not they were traveling in a group (can use Ticket for this), etc
 - Random Forests need good values to split on.  Sometimes they perform better if you 'bin' a quantitative column so there are more values on each side of a split.  
 - Try out different versions of alpha, l1 & l2 penalties for LogisticRegression
 - Try different parameters of a Random Forest as well to see if it fits better
 - Make sure to use cross-validation to make comparisons between your validation and test sets

Some things to keep in mind:

 - There are missing values in the Fare, Age, Embarked and Cabin column
 - Remember the rules for transforming categorical data:
   - Ordinal: encode them with increasing numeric values to represent the hierarchy that's already in play
   - Nominal: dummy encode them like we did in the previous exercise
 - See if you can use the transform method to fill in the missing values for Age according to passenger characteristics
   


In [47]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

%matplotlib inline

train = pd.read_csv('../data/titanic/train.csv')
test = pd.read_csv('../data/titanic/test.csv')

In [9]:
def title_getter(df):
    df['title'] = pd.Series([x[0].split(',')[1].lstrip() + '.' for x in df['Name'].str.split('.')])
    return df

In [15]:
def age_simplifier(df):
    df['Age'].fillna(-0.5, inplace = True)
    
    conditions = [
    (df['Age'] < 8),
    (df['Age'] >= 8) & (df['Age'] < 22),
    (df['Age'] >= 22) & (df['Age'] <= 55),
    (df['Age'] > 55)
    ]
    status = ['Child','Adolescent','Adult','Senior']
    df['Demographic'] = np.select(conditions, status, 'undefined')
    return df

In [18]:
def cabin_simplifier(df):
    df['Cabin'].fillna('N',inplace = True)
    df['Cabin'] = df['Cabin'].apply(lambda x: x[0])
    return df

In [48]:
train['Embarked'].fillna('N', inplace = True)
test['Embarked'].fillna('N', inplace = True)

In [49]:
test['Fare'].fillna(test['Fare'].mean(), inplace = True)

In [50]:
title_getter(train)
age_simplifier(train)
cabin_simplifier(train)
title_getter(test)
age_simplifier(test)
cabin_simplifier(test)

In [51]:
train.drop(['Name','Ticket'], axis = 1, inplace = True)

In [52]:
test.drop(['Name','Ticket'], axis = 1, inplace = True)

In [53]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,Demographic
0,1,0,3,male,22.0,1,0,7.25,N,S,Mr.,Adult
1,2,1,1,female,38.0,1,0,71.2833,C,C,Mrs.,Adult
2,3,1,3,female,26.0,0,0,7.925,N,S,Miss.,Adult
3,4,1,1,female,35.0,1,0,53.1,C,S,Mrs.,Adult
4,5,0,3,male,35.0,0,0,8.05,N,S,Mr.,Adult


In [54]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,Demographic
0,892,3,male,34.5,0,0,7.8292,N,Q,Mr.,Adult
1,893,3,female,47.0,1,0,7.0,N,S,Mrs.,Adult
2,894,2,male,62.0,0,0,9.6875,N,Q,Mr.,Senior
3,895,3,male,27.0,0,0,8.6625,N,S,Mr.,Adult
4,896,3,female,22.0,1,1,12.2875,N,S,Mrs.,Adult


In [55]:
test.columns.tolist()

['PassengerId',
 'Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Cabin',
 'Embarked',
 'title',
 'Demographic']

In [56]:
feat = ['Sex', 'Cabin', 'Embarked', 'title','Demographic']

In [59]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [61]:
for f in feat:
    train[f] = le.fit_transform(train[f])
    test[f] = le.fit_transform(test[f])

In [62]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,Demographic
0,1,0,3,1,22.0,1,0,7.25,7,3,11,1
1,2,1,1,0,38.0,1,0,71.2833,2,0,12,1
2,3,1,3,0,26.0,0,0,7.925,7,3,8,1
3,4,1,1,0,35.0,1,0,53.1,2,3,12,1
4,5,0,3,1,35.0,0,0,8.05,7,3,11,1


In [63]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,Demographic
0,892,3,1,34.5,0,0,7.8292,7,1,5,1
1,893,3,0,47.0,1,0,7.0,7,2,6,1
2,894,2,1,62.0,0,0,9.6875,7,1,5,3
3,895,3,1,27.0,0,0,8.6625,7,2,5,1
4,896,3,0,22.0,1,1,12.2875,7,2,6,1


In [70]:
rfc = RandomForestClassifier()
rfc.fit(train.iloc[:,2:], train['Survived'])



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [73]:
rfc_preds_clean = rfc.predict(test.iloc[:,1:])

In [74]:
rfc_preds_clean

array([0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,

In [76]:
rfc_predictions = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': rfc_preds_clean
})
rfc_predictions.to_csv('../data/titanic/rfc_predictions1.csv',
                       index = False)

In [None]:
logreg = LogisticRegression()

In [79]:
param_grid = {
    'penalty': ['l1','l2'],
    'C': np.logspace(-4,4,9)
}

In [83]:
grid = GridSearchCV(estimator = logreg, param_grid = param_grid, cv = 10)

In [85]:
grid.fit(train.iloc[:,2:], train['Survived'])









GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03,
       1.e+04])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [88]:
grid.best_params_

{'C': 0.1, 'penalty': 'l2'}

In [89]:
logreg.set_params(C = 0.1, penalty = 'l2')

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [93]:
grid_results = pd.DataFrame(grid.cv_results_);



In [95]:
grid_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.003192,0.001113,0.001444,0.001188,0.0001,l1,"{'C': 0.0001, 'penalty': 'l1'}",0.611111,0.611111,0.617978,...,0.61596,0.61596,0.61596,0.61596,0.61596,0.61596,0.61596,0.616438,0.616162,0.000317
1,0.002274,0.000309,0.000589,0.00011,0.0001,l2,"{'C': 0.0001, 'penalty': 'l2'}",0.622222,0.611111,0.674157,...,0.663342,0.657107,0.673317,0.670823,0.664589,0.662095,0.665835,0.665006,0.667792,0.006786
2,0.002023,8.6e-05,0.000553,6.2e-05,0.001,l1,"{'C': 0.001, 'penalty': 'l1'}",0.522222,0.622222,0.561798,...,0.583541,0.562344,0.592269,0.594763,0.588529,0.571072,0.571072,0.585305,0.582742,0.010485
3,0.002195,9.7e-05,0.00053,1.4e-05,0.001,l2,"{'C': 0.001, 'penalty': 'l2'}",0.6,0.611111,0.707865,...,0.685786,0.678304,0.688279,0.689526,0.687032,0.682045,0.684539,0.686177,0.686621,0.004211
4,0.002837,0.000311,0.000584,0.000136,0.01,l1,"{'C': 0.01, 'penalty': 'l1'}",0.588889,0.577778,0.696629,...,0.678304,0.669576,0.673317,0.677057,0.668329,0.673317,0.680798,0.672478,0.675522,0.004733


In [90]:
logreg.fit(train.iloc[:,2:], train['Survived'])



LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [92]:
logreg_preds = logreg.predict(test.iloc[:,1:])

In [96]:
logreg_predictions = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': logreg_preds
})
logreg_predictions.to_csv('../data/titanic/logreg_predictions1.csv',
                       index = False)

In [97]:
grid_results.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_C', 'param_penalty', 'params', 'split0_test_score',
       'split1_test_score', 'split2_test_score', 'split3_test_score',
       'split4_test_score', 'split5_test_score', 'split6_test_score',
       'split7_test_score', 'split8_test_score', 'split9_test_score',
       'mean_test_score', 'std_test_score', 'rank_test_score',
       'split0_train_score', 'split1_train_score', 'split2_train_score',
       'split3_train_score', 'split4_train_score', 'split5_train_score',
       'split6_train_score', 'split7_train_score', 'split8_train_score',
       'split9_train_score', 'mean_train_score', 'std_train_score'],
      dtype='object')

In [98]:
cols = ['param_C', 
        'param_penalty', 
        'mean_test_score', 
        'std_test_score', 
        'rank_test_score', 
        'mean_train_score', 
        'std_train_score']

In [99]:
grid_results = grid_results.loc[:, cols]

In [101]:
grid_results.sort_values('rank_test_score')

Unnamed: 0,param_C,param_penalty,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
7,0.1,l2,0.794613,0.029488,1,0.802719,0.004249
6,0.1,l1,0.792368,0.025906,2,0.792869,0.003117
9,1.0,l2,0.790123,0.026694,3,0.800101,0.002932
8,1.0,l1,0.789001,0.022805,4,0.79823,0.00584
11,10.0,l2,0.789001,0.025326,4,0.798105,0.004478
15,1000.0,l2,0.786756,0.026893,6,0.798853,0.004573
14,1000.0,l1,0.786756,0.026893,6,0.798852,0.00458
13,100.0,l2,0.786756,0.026893,6,0.798603,0.004612
12,100.0,l1,0.786756,0.026893,6,0.798852,0.00458
10,10.0,l1,0.786756,0.026893,6,0.798728,0.00429


In [102]:
grid.best_params_

{'C': 0.1, 'penalty': 'l2'}

In [104]:
rfc.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [109]:
rfc_param_grid = {
    'max_features': [1,2,0.5,'sqrt','log2'],
    'min_samples_leaf': [1,2,3,5,10],
    'n_estimators': [1,5,10,25,100,1000]
}

In [110]:
rfc_grid = GridSearchCV(estimator = rfc, param_grid = rfc_param_grid, cv = 10)

In [111]:
rfc_grid.fit(train.iloc[:,2:], train['Survived'])



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_features': [1, 2, 0.5, 'sqrt', 'log2'], 'min_samples_leaf': [1, 2, 3, 5, 10], 'n_estimators': [1, 5, 10, 25, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [112]:
rfc_grid.best_params_

{'max_features': 0.5, 'min_samples_leaf': 2, 'n_estimators': 100}

In [116]:
rfc.set_params(max_features = 0.5,
               min_samples_leaf = 2,
               n_estimators = 100)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [117]:
rfc.fit(train.iloc[:,2:], train['Survived'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [119]:
rfc_cv_preds = rfc.predict(test.iloc[:,1:])

In [120]:
rfc_cv_predictions = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': rfc_cv_preds
})
rfc_cv_predictions.to_csv('../data/titanic/rfc_cv_predictions0.csv',
                       index = False)