In [1]:
import sklearn.linear_model as linear_model
import sklearn
import seaborn as sns
import pandas as pd
import numpy as np

# Data Loading

Here I am using a built-in dataset to make a quick example. In practice I would probably want to download the data from some external source.

In [2]:
# Load seaborn's "titanic" example dataset into a DataFrame
data = sns.load_dataset("titanic")
# Preview the first rows to see which columns are available
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# Data Processing

I need to extract some binary features

In [3]:
# One-hot encode the non-numeric columns (sex, embarked, class, who, deck,
# embark_town, alive) into 0/1 indicator columns; the numeric and boolean
# columns pass through unchanged
data2 = pd.get_dummies(data)
# Preview the encoded frame
data2.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,sex_female,sex_male,...,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes
0,0,3,22.0,1,0,7.25,True,False,0,1,...,0,0,0,0,0,0,0,1,1,0
1,1,1,38.0,1,0,71.2833,False,False,1,0,...,1,0,0,0,0,1,0,0,0,1
2,1,3,26.0,0,0,7.925,False,True,1,0,...,0,0,0,0,0,0,0,1,0,1
3,1,1,35.0,1,0,53.1,False,False,1,0,...,1,0,0,0,0,0,0,1,0,1
4,0,3,35.0,0,0,8.05,True,True,0,1,...,0,0,0,0,0,0,0,1,1,0


# Make the training matrices

In [4]:
# Discard every row that still contains a missing value
data3 = data2.dropna()
# Feature matrix: all columns other than the label column, as a float array
X = data3.drop(labels=["survived"], axis=1).values.astype(float)
# Label vector
Y = data3["survived"].values

# Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the seed is fixed at 42
X_tr, X_te, Y_tr, Y_te = train_test_split(
    X, Y,
    test_size=0.1,
    random_state=42,
)

# Model Development

First cut at model development

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Baseline hyper-parameters for the first attempt
n_estimators = 10
min_samples_split = 2

# Seed the forest so the reported accuracies are reproducible on a fresh
# Restart & Run All, consistent with the seeded forests used later on
model = RandomForestClassifier(n_estimators=n_estimators,
                               min_samples_split=min_samples_split,
                               random_state=42)

model.fit(X_tr, Y_tr)

# Compare fit quality on the data the model saw versus the held-out rows
print("Train Accuracy:", model.score(X_tr, Y_tr))
print("Test Accuracy:", model.score(X_te, Y_te))

Train Accuracy: 1.0
Test Accuracy: 1.0


**Error!!!** 

The accuracy is too high!  We must have a feature that contains the label

In [7]:
# List every column of the processed frame to look for one that duplicates the label
data3.columns

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'adult_male',
       'alone', 'sex_female', 'sex_male', 'embarked_C', 'embarked_Q',
       'embarked_S', 'class_First', 'class_Second', 'class_Third', 'who_child',
       'who_man', 'who_woman', 'deck_A', 'deck_B', 'deck_C', 'deck_D',
       'deck_E', 'deck_F', 'deck_G', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton', 'alive_no',
       'alive_yes'],
      dtype='object')

Notice that the **alive_no** and **alive_yes** columns appear to contain the same information as **survived**. These columns need to be dropped from the features.

# Re-make the training matrices

In [8]:
# Columns that carry the outcome: the label itself plus its one-hot duplicates
label_columns = ["survived", "alive_no", "alive_yes"]
# Feature matrix without any of the label-bearing columns, as a float array
X = data3.drop(labels=label_columns, axis=1).values.astype(float)
# Label vector
Y = data3["survived"].values

# Train Test Split (Again)

In [9]:
from sklearn.model_selection import train_test_split

# Redo the 90/10 split on the rebuilt matrices, seed fixed at 42
X_tr, X_te, Y_tr, Y_te = train_test_split(
    X, Y,
    test_size=0.1,
    random_state=42,
)

# Model Development (Again)

Second cut at model development, this time without the columns that leak the label

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Same baseline hyper-parameters as the first attempt
n_estimators = 10
min_samples_split = 2

# Seeded random forest trained on the rebuilt training matrices
model = RandomForestClassifier(
    random_state=42,
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
)
model.fit(X_tr, Y_tr)

# Score on the training rows and on the held-out rows
train_acc = model.score(X_tr, Y_tr)
print("Train Accuracy:", train_acc)
test_acc = model.score(X_te, Y_te)
print("Test Accuracy:", test_acc)

Train Accuracy: 0.968847352025
Test Accuracy: 0.736111111111


# Model selection through search

In [11]:
from sklearn.model_selection import KFold


# Candidate values swept for each hyper-parameter
n_estimators_values = [5, 10, 15, 20, 25, 30]
min_samples_split_values = [2, 4, 6, 8, 10]

# Shuffled 5-fold splitter with a fixed seed
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Maps (n_estimators, min_samples_split) -> mean validation accuracy
accuracy = {}

# Every combination, ordered with n_estimators varying slowest
param_grid = [(n, m) for n in n_estimators_values
              for m in min_samples_split_values]

for (n_estimators, min_samples_split) in param_grid:
    # Validation accuracy of each fold for this configuration
    kfold_accuracy = []

    # Cross-validate on the training portion only; the test set stays untouched
    for (tr_ind, val_ind) in kfold.split(X_tr):
        model = RandomForestClassifier(
            random_state=42,
            n_estimators=n_estimators,
            min_samples_split=min_samples_split,
        )
        # fit() returns the estimator, so scoring can be chained onto it
        acc = model.fit(X_tr[tr_ind, :], Y_tr[tr_ind]).score(
            X_tr[val_ind, :], Y_tr[val_ind])
        kfold_accuracy.append(acc)

    # Record the mean accuracy across folds for this parameter configuration
    avg_acc = np.mean(kfold_accuracy)
    accuracy[(n_estimators, min_samples_split)] = avg_acc


## Determine the configuration with highest accuracy

In [12]:
# Materialise configurations and their scores in matching order
configs = list(accuracy.keys())
scores = list(accuracy.values())

# Position of the highest mean accuracy (first one wins on ties)
ind = np.argmax(scores)

# Unpack the winning hyper-parameter pair
(best_n_estimators, best_min_samples_split) = configs[ind]
print("best_n_estimators", best_n_estimators)
print("best_min_samples_split", best_min_samples_split)

best_n_estimators 10
best_min_samples_split 10


In [13]:
# Refit on the full training set using the configuration chosen by cross-validation
best_model = RandomForestClassifier(
    random_state=42,
    n_estimators=best_n_estimators,
    min_samples_split=best_min_samples_split,
)
best_model.fit(X_tr, Y_tr)

# Report accuracy on the training rows and on the held-out test rows
best_train_acc = best_model.score(X_tr, Y_tr)
print("Best Train Accuracy:", best_train_acc)
best_test_acc = best_model.score(X_te, Y_te)
print("Best Test Accuracy:", best_test_acc)

Best Train Accuracy: 0.884735202492
Best Test Accuracy: 0.763888888889
