In [53]:
import pandas as pd
import numpy as np

# Read the training CSV from the working directory into `data` and print a
# summary of its columns, dtypes and non-null counts.
data = pd.read_csv('train.csv')
data.info()
# data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


In [54]:
########################## DATA CLEANING ########################

# CABIN
# fill in NaN for cabin
# if there is a pattern in the cabin name, then create a new col and/or dummies

# SEX
# convert male/female to boolean, (sex_bool, True/False, 1/0)
# also try sex amped

# Parch = if parent/child.
# create a 1/0 column

# SibSp
# the binary column 0/1 seems to hurt k nearest neighbors score

# PCLASS
# convert Pclass to dummies

# EMBARKED
# see the numbers of uniques with Embarked
# Embarked:
    # convert to port names
    # fillna as no port
    # then convert Embarked to dummy values

# AGE
# describe() to see the avg age in certain groups. 
    # combos of sex, Pclass, Fare
    # pick the one with the lowest stdev
# then replace age NAs with the avg from the matching subgroup
# if stdev of age is too high, then remove unknown ages

# AGE --> child, adult, elder --> dummies
# convert into a smaller amount of division, into fewer bins


In [55]:
# Label passengers with no recorded cabin as 'no_cabin'.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the attribute-accessed column, which does not
# reliably update the parent DataFrame in newer pandas versions.
data['Cabin'] = data['Cabin'].fillna('no_cabin')
data['Cabin'].unique()

array(['no_cabin', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35', 'C87',
       'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19', 'B49', 'D',
       'C22 C26', 'C106', 'C65', 'E36', 'C54', 'B57 B59 B63 B66', 'C7',
       'E34', 'C32', 'B18', 'C124', 'C91', 'E40', 'T', 'C128', 'D37',
       'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44', 'A34', 'C104', 'C111',
       'C92', 'E38', 'D21', 'E12', 'E63', 'A14', 'B37', 'C30', 'D20',
       'B79', 'E25', 'D46', 'B73', 'C95', 'B38', 'B39', 'B22', 'C86',
       'C70', 'A16', 'C101', 'C68', 'A10', 'E68', 'B41', 'A20', 'D19',
       'D50', 'D9', 'A23', 'B50', 'A26', 'D48', 'E58', 'C126', 'B71',
       'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63', 'C62 C64',

In [56]:
# First character of every Cabin value. This indexes each value as a string,
# so it relies on missing cabins having been filled with 'no_cabin' already.
cabin_starts = [cabin[0] for cabin in data['Cabin']]

data['cabin_start'] = cabin_starts

# The 'no_cabin' filler contributes the letter 'n'; expand it back to the full
# label. Assign the result rather than replacing in place through the column
# accessor, so the frame is always updated.
data['cabin_start'] = data['cabin_start'].replace('n', "no_cabin")
data['cabin_start'].unique()

array(['no_cabin', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [57]:
# One indicator column per distinct cabin_start value.
cabin_start_dummies = pd.get_dummies(data['cabin_start'], prefix='cabin_start')

# This cell overwrites `data` with a merge of itself, so running it twice would
# otherwise produce suffixed duplicates (cabin_start_A_x / cabin_start_A_y).
# Drop any dummy columns left from a previous run before merging; on a first
# run nothing is dropped.
data = data[[col for col in data.columns
             if col not in cabin_start_dummies.columns]]
data = pd.merge(data, cabin_start_dummies, left_index=True, right_index=True)


In [58]:
# Distinct values present in the Parch column.
data['Parch'].unique()


array([0, 1, 2, 5, 3, 4, 6])

In [59]:
# 1 when Parch is non-zero, 0 otherwise.
# A threshold gives the same result as replacing the enumerated values
# 2, 5, 3, 4, 6 with 1 (0 and 1 already being 0/1), but also covers counts
# that do not occur in this file.
data['parch_binary'] = (data['Parch'] > 0).astype(int)


In [60]:
# Distinct values present in the SibSp column.
data['SibSp'].unique()

array([1, 0, 3, 4, 2, 5, 8])

In [61]:
# 0 for SibSp of 0 or 1, 1 for anything larger. This reproduces the original
# value mapping (1 -> 0, 0 -> 0, and 3, 4, 2, 5, 8 -> 1) as a threshold so that
# values not present in this file are handled too.
# NOTE(review): a SibSp of 1 maps to 0 here, as it did in the original mapping.
# Confirm that is intended for a "binary" flag.
data['SibSp_binary'] = (data['SibSp'] > 1).astype(int)

# SibSp scaled by 0.5.
data['SibSp_amped'] = data['SibSp'] * 0.5

In [62]:
# Distinct values present in the Sex column.
data['Sex'].unique()

array(['male', 'female'], dtype=object)

In [63]:
# Encode Sex numerically: 'male' -> 1, 'female' -> 0.
# An explicit mapping assigned back to the frame replaces the in-place
# replace() on the attribute-accessed column, so the result always lands in
# `data`. Assumes Sex holds only 'male' and 'female'; any other value would
# become NaN.
data['Sex_bool'] = data['Sex'].map({'male': 1, 'female': 0})

# Sex_bool scaled by 6.
data['sex_amped'] = data['Sex_bool'] * 6
# data.info()

In [64]:
# data.Pclass.unique()
class_dummies = pd.get_dummies(data.Pclass, prefix='Class')
# class_dummies.head()
data1 = pd.merge(data, class_dummies, left_index=True, right_index=True)
# data1.info()

In [65]:
#  NO!  this hurt the k nearest neighbors result

# try amping the p_class impact
# data1['Class_1'] = [x*2 for x in data1.Class_1]
# data1['Class_2'] = [x*2 for x in data1.Class_2]
# data1['Class_3'] = [x*2 for x in data1.Class_3]


In [66]:
# Fill missing Embarked values with 'unknown', then expand the one-letter
# codes to full names. The result is assigned back to the column instead of
# chaining inplace=True calls on the attribute-accessed column, which does not
# reliably update the parent DataFrame in newer pandas versions.
data1['Embarked'] = (
    data1['Embarked']
    .fillna('unknown')
    .replace(['S', 'C', 'Q'], ['Southampton', 'Cherbourg', 'Queenstown'])
)
data1['Embarked'].unique()

array(['Southampton', 'Cherbourg', 'Queenstown', 'unknown'], dtype=object)

In [67]:
# One indicator column per distinct Embarked value, named Port_<value>.
port_dummies = pd.get_dummies(data1['Embarked'], prefix='Port')

# Attach the indicator columns via the shared index under a new frame name.
data2 = data1.merge(port_dummies, left_index=True, right_index=True)

In [68]:
# FIGURE OUT WHAT REPLACEMENT AGES TO USE

# Keep only the rows whose Age is present.
df_have_ages = data2[data2['Age'].notnull()]

# Spread of Age within each (Pclass, Sex) subgroup.
df_have_ages.groupby(['Pclass', 'Sex'])['Age'].std()

# CONCLUSION -- the stdev of age is too large in every subgroup, so just
# exclude records with no age.

Pclass  Sex   
1       female    13.612052
        male      15.139570
2       female    12.872702
        male      14.793894
3       female    12.729964
        male      12.159514
Name: Age, dtype: float64

In [69]:
data3 = df_have_ages.copy()


def _age_group(age):
    """Return the age-bucket label for a single age value.

    Buckets (boundaries exactly as the classification code applies them):
        baby   age <= 2
        child  2 < age < 13
        teen   13 <= age < 18
        adult  18 <= age < 55
        elder  age >= 55

    Raises:
        ValueError: if the value fits no bucket (e.g. NaN). Failing here keeps
            the labels aligned with the rows instead of silently skipping one.
    """
    if age <= 2:
        return "baby"
    if age < 13:
        return "child"
    if age < 18:
        return "teen"
    if age < 55:
        return "adult"
    if age >= 55:
        return "elder"
    raise ValueError("cannot assign an age group to age %r" % (age,))


# One label per row of data3, in row order.
age_groups = [_age_group(age) for age in data3['Age']]

# print len(age_groups)
data3['age_group'] = age_groups

# convert to dummies
age_dummies = pd.get_dummies(data3['age_group'], prefix='age')
data3 = pd.merge(data3, age_dummies, left_index=True, right_index=True)

data3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 40 columns):
PassengerId             714 non-null int64
Survived                714 non-null int64
Pclass                  714 non-null int64
Name                    714 non-null object
Sex                     714 non-null object
Age                     714 non-null float64
SibSp                   714 non-null int64
Parch                   714 non-null int64
Ticket                  714 non-null object
Fare                    714 non-null float64
Cabin                   714 non-null object
Embarked                714 non-null object
cabin_start             714 non-null object
cabin_start_A           714 non-null float64
cabin_start_B           714 non-null float64
cabin_start_C           714 non-null float64
cabin_start_D           714 non-null float64
cabin_start_E           714 non-null float64
cabin_start_F           714 non-null float64
cabin_start_G           714 non-null float64
cabin_start

In [70]:
################### k_folds and cross-validation ###################

In [71]:
# our splitting of data, and cross validation
from sklearn.cross_validation import KFold

In [72]:
# cross validation function with average score
def cross_validate(features, target, classifier, k_fold) :
    """Score a classifier with shuffled k-fold cross-validation.

    Args:
        features: indexable collection of feature rows; must accept the index
            arrays produced by KFold (e.g. a numpy array).
        target: labels aligned with `features`, indexable the same way.
        classifier: callable invoked as classifier(train_features, train_target)
            that returns an object exposing .score(features, target), for
            example the bound `fit` method of an estimator.
        k_fold: number of folds.

    Returns:
        The sum of the per-fold scores divided by `k_fold`.
    """
    # Shuffled split with a fixed random_state, so repeated calls use the
    # same folds.
    folds = KFold(len(features), n_folds=k_fold,
                  shuffle=True, random_state=0)

    fold_scores = []
    for train_idx, test_idx in folds:
        # Fit on the training part of the fold, score on the held-out part.
        fitted = classifier(features[train_idx], target[train_idx])
        fold_scores.append(fitted.score(features[test_idx], target[test_idx]))

    return sum(fold_scores) / k_fold

In [73]:
################### k nearest neighbor ###################

In [74]:
# the model
from sklearn.neighbors import KNeighborsClassifier

In [75]:
# which features to use for this model

# TODO: data3.SibSp

# 0.76193270735524254
# features = zip(data3.SibSp, data3.parch_binary, data3.cabin_start_A, data3.cabin_start_B, data3.cabin_start_C, data3.cabin_start_D, data3.cabin_start_E, data3.cabin_start_F, data3.cabin_start_G, data3.cabin_start_T, data3.Sex_bool, data3.Class_1, data3.Class_2, data3.Class_3, data3.Port_Cherbourg, data3.Port_Queenstown, data3.Port_Southampton)

# RESULT = ages as binary/dummies is important
# 0.78159233176838805   added binary ages (dummies)
# features = zip(data3.age_adult, data3.age_baby, data3.age_child, data3.age_elder, data3.age_teen, data3.SibSp, data3.parch_binary, data3.cabin_start_A, data3.cabin_start_B, data3.cabin_start_C, data3.cabin_start_D, data3.cabin_start_E, data3.cabin_start_F, data3.cabin_start_G, data3.cabin_start_T, data3.Sex_bool, data3.Class_1, data3.Class_2, data3.Class_3, data3.Port_Cherbourg, data3.Port_Queenstown, data3.Port_Southampton)

# RESULT = removing ports was a good thing
# 0.80397104851330192
# features = zip(data3.age_adult, data3.age_baby, data3.age_child, data3.age_elder, data3.age_teen, data3.SibSp, data3.parch_binary, data3.cabin_start_A, data3.cabin_start_B, data3.cabin_start_C, data3.cabin_start_D, data3.cabin_start_E, data3.cabin_start_F, data3.cabin_start_G, data3.cabin_start_T, data3.Sex_bool, data3.Class_1, data3.Class_2, data3.Class_3)

# RESULT = removing first letter cabins
# 0.81107198748043818
# features = zip(data3.age_adult, data3.age_baby, data3.age_child, data3.age_elder, data3.age_teen, data3.SibSp, data3.parch_binary, data3.Sex_bool, data3.Class_1, data3.Class_2, data3.Class_3)

# RESULT = amplifying the gender impact
# 0.81381064162754302 *3
# 0.81666666666666665  *6  *12
# features = zip(data3.age_adult, data3.age_baby, data3.age_child, data3.age_elder, data3.age_teen, data3.SibSp, data3.parch_binary, data3.sex_amped, data3.Class_1, data3.Class_2, data3.Class_3)

# RESULT = added back in cabin letter
# 0.8180164319248826
# Columns of data3 that make up the feature matrix, in matrix column order.
feature_columns = [
    'age_adult', 'age_baby', 'age_child', 'age_elder', 'age_teen',
    'SibSp', 'parch_binary', 'sex_amped',
    'Class_1', 'Class_2', 'Class_3',
    'cabin_start_A', 'cabin_start_B', 'cabin_start_C', 'cabin_start_D',
    'cabin_start_E', 'cabin_start_F', 'cabin_start_G', 'cabin_start_T',
]


# RESULT = SibSp_binary  dropped the score.  but, removing SibSp entirely will drop the score further
# 0.79972613458528952 --> 0.78153364632237876 
# features = zip(data3.age_adult, data3.age_baby, data3.age_child, data3.age_elder, data3.age_teen, data3.SibSp_binary, data3.parch_binary, data3.cabin_start_A, data3.cabin_start_B, data3.cabin_start_C, data3.cabin_start_D, data3.cabin_start_E, data3.cabin_start_F, data3.cabin_start_G, data3.cabin_start_T, data3.Sex_bool, data3.Class_1, data3.Class_2, data3.Class_3)


# One row per passenger, one column per entry of feature_columns.
# Selecting the columns from the frame replaces np.array(zip(...)): it needs
# only one assignment (the zip line was duplicated) and it still yields a 2-D
# array on Python 3, where zip() returns an iterator that np.array cannot
# expand. The cast to float keeps a single numeric dtype whatever dtype the
# dummy columns have.
features = data3[feature_columns].values.astype(float)

# survived or not survived
target = data3.Survived.values

In [76]:
# run model
max_score = 0
for neighbors in range(3,20):
    new_score = cross_validate(features, target, KNeighborsClassifier(neighbors, weights='uniform').fit, 10)
    max_score = max(max_score, new_score)
    
    # track the best number of neighbors
    
max_score

0.8180164319248826

In [77]:
################### random forest ###################

In [78]:
# model
from sklearn.ensemble import RandomForestClassifier

In [79]:
# selecting features for model

In [80]:
# run model
# `model` is the bound fit method of a seeded random forest. cross_validate
# calls it once per fold and scores whatever it returns.
model = RandomForestClassifier(random_state=0).fit

# 10-fold cross-validated score on the same features/target as the KNN run.
cross_validate(features, target, classifier=model, k_fold=10)

0.79704616588419408