## Instructions

https://docs.google.com/document/d/1pEVR4fFqT8nb0NMPu_vplNSAjVYBgWUKGVJzyhHZaS4/edit <br/>
#### Feature Description
http://lib.stat.cmu.edu/S/Harrell/data/descriptions/titanic3info.txt<br/>

## Useful Examples

https://datastudentblog.wordpress.com/2014/03/31/titanic-how-we-got-to-12-on-the-kaggle-com-leaderboard/<br/>
http://nbviewer.ipython.org/github/agconti/kaggle-titanic/blob/master/Titanic.ipynb <br/>
http://www.markhneedham.com/blog/2013/11/09/python-making-scikit-learn-and-pandas-play-nice/ <br/>
http://nbviewer.ipython.org/github/datapress/kaggle-titanic/blob/master/notebooks/Section%202-2%20-%20SVM%20with%20Parameter%20Tuning.ipynb<br/>
http://fastml.com/converting-categorical-data-into-numbers-with-pandas-and-scikit-learn/<br/>
#### Cross Validation
https://randomforests.wordpress.com/2014/02/02/basics-of-k-fold-cross-validation-and-gridsearchcv-in-scikit-learn/

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from math import sqrt
from sklearn.preprocessing import Imputer
from sklearn.feature_extraction import DictVectorizer
from sklearn import svm, cross_validation, grid_search
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier 
import matplotlib as plt
import string
from scipy.stats import mode, zscore
%matplotlib inline

## Hack: Yes or No?

In [2]:
# hack = True
hack = False

## Read in Data

In [3]:
home_dir = ""
if hack:
    home_dir = "hack/"

training_data = pd.read_csv(home_dir + "train.csv")
training_data.columns = map(string.lower, training_data.columns)
training_data["set_type"] = "training"

testing_data = pd.read_csv(home_dir + "test.csv")
testing_data.columns = map(string.lower, testing_data.columns)
testing_data["set_type"] = "testing"

if hack:
    training_data = training_data.rename(columns = {'passengerid':'passenger_id'})
    testing_data = testing_data.rename(columns = {'passengerid':'passenger_id'}) 

all_data = pd.concat([training_data, testing_data])

In [4]:
all_data.head()

Unnamed: 0,age,cabin,embarked,fare,home.dest,name,parch,passenger_id,pclass,set_type,sex,sibsp,survived,ticket
0,39.0,C85,C,71.2833,"New York, NY","Cumings, Mr. John Bradley",0,0,1,training,male,1,0,PC 17599
1,39.0,,S,211.3375,,"Kreuchen, Miss. Emilie",0,1,1,training,female,0,1,24160
2,36.0,,S,7.4958,"Bulgaria Chicago, IL","Coleff, Mr. Peju",0,2,3,training,male,0,0,349210
3,,,S,8.05,,"Spector, Mr. Woolf",0,3,3,training,male,0,0,A.5. 3236
4,20.0,,C,15.7417,,"Nakid, Mr. Sahid",1,4,3,training,male,1,1,2653


In [5]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 523
Data columns (total 14 columns):
age             1046 non-null float64
cabin           295 non-null object
embarked        1307 non-null object
fare            1308 non-null float64
home.dest       745 non-null object
name            1309 non-null object
parch           1309 non-null int64
passenger_id    1309 non-null int64
pclass          1309 non-null int64
set_type        1309 non-null object
sex             1309 non-null object
sibsp           1309 non-null int64
survived        785 non-null float64
ticket          1309 non-null object
dtypes: float64(3), int64(4), object(7)
memory usage: 153.4+ KB


In [6]:
all_data.describe()

Unnamed: 0,age,fare,parch,passenger_id,pclass,sibsp,survived
count,1046.0,1308.0,1309.0,1309.0,1309.0,1309.0,785.0
mean,29.881138,33.295479,0.385027,654.0,2.294882,0.498854,0.375796
std,14.413493,51.758668,0.86556,378.020061,0.837836,1.041658,0.484637
min,0.17,0.0,0.0,0.0,1.0,0.0,0.0
25%,21.0,7.8958,0.0,327.0,2.0,0.0,0.0
50%,28.0,14.4542,0.0,654.0,3.0,0.0,0.0
75%,39.0,31.275,0.0,981.0,3.0,1.0,1.0
max,80.0,512.3292,9.0,1308.0,3.0,8.0,1.0


#### We have no null values

In [7]:
all_data[all_data["set_type"] == "training"]["survived"].sum()

295.0

## Handling Missing Values

In [347]:
# Might be too extreme
# Remove features that have lots of missing values
# all_data = all_data.drop(['ticket','cabin'], axis=1)
# Remove NaN values
# all_data = all_data.dropna()

In [348]:
# age_mean = all_data['age'].mean()
# all_data['age'] = all_data['age'].fillna(age_mean)
# print all_data[all_data['age'].isnull()]

# mode_embarked = mode(all_data['embarked'])[0][0]
# all_data['embarked'] = all_data['embarked'].fillna(mode_embarked)
for c in range(1,4):
    # Fare
    mask = (all_data['fare'].isnull()) & (all_data['pclass'] == c)
    all_data['fare'][mask] = all_data['fare'][all_data['pclass'] == c].mean()
    
    # Age
    mask = (all_data['age'].isnull()) & (all_data['pclass'] == c)
    all_data['age'][mask] = all_data['age'][all_data['pclass'] == c].mean()
    # Embarked
    mask = (all_data['embarked'].isnull()) & (all_data['pclass'] == c)
    all_data['embarked'][mask] = mode(all_data['embarked'][all_data['pclass'] == c])[0][0]

In [349]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 523
Data columns (total 14 columns):
age             1309 non-null float64
cabin           295 non-null object
embarked        1309 non-null object
fare            1309 non-null float64
home.dest       745 non-null object
name            1309 non-null object
parch           1309 non-null int64
passenger_id    1309 non-null int64
pclass          1309 non-null int64
set_type        1309 non-null object
sex             1309 non-null object
sibsp           1309 non-null int64
survived        785 non-null float64
ticket          1309 non-null object
dtypes: float64(3), int64(4), object(7)
memory usage: 153.4+ KB


## Feature Construction

#### Nationality

In [350]:
us = ['WA', 'WI', 'WV', 'FL', 'WY', 'NH', 'NJ', 'NM', 'NA', 'NC', 'ND', 'NE', 'NY', 'RI', 'NV', 'GU', 'CO', 'CA', 'GA', 'CT', 'OK', 'OH', 'KS', 'SC', 'KY', 'OR', 'SD', 'DE', 'DC', 'HI', 'PR', 'TX', 'LA', 'TN', 'PA', 'VA', 'VI', 'AK', 'AL', 'AS', 'AR', 'VT', 'IL', 'IN', 'IA', 'AZ', 'ID', 'ME', 'MD', 'MA', 'UT', 'MO', 'MN', 'MI', 'MT', 'MP', 'MS']
ca = ['ON', 'AB', 'NL', 'BC', 'NB', 'MB', 'YT', 'SK', 'QC', 'PE', 'NS', 'NT', 'NU']

In [351]:
def is_state(test_list):
    if type(test_list) == list:
        if np.array([t.strip() in us for t in test_list.split(",")]).any():
            return "US"
        if np.array([t.strip() in ca for t in test_list.split(",")]).any():
            return "CA"
    else:
        return "Unknown"
if not hack:
    all_data["ending_in_na"] = all_data["home.dest"].apply(is_state)

#### Binary Gender Class

In [352]:
all_data["female_high_class"] = (all_data['sex'] == "female") & (all_data['pclass'] == 1)

#### Title

In [353]:
all_data["title"] = ""
all_data["title"][all_data["name"].str.contains('Sir[\. ]|Capt[\. ]|Col[\. ]|Don[\. ]|Dr[\. ]|Jonkheer[\. ]|Major[\. ]|Rev[\. ]|Sir[\. ]')] = "noble"
all_data["title"][all_data["name"].str.contains('Mme[\. ]|Mlle[\. ]')] = "mlle"
all_data["title"][all_data["name"].str.contains('Dona[\. ]|Countess[\. ]|Lady[\. ]')] = "lady"

#### Has Title

In [354]:
all_data["has_title"] = all_data["title"] != ""

#### Family Size

In [355]:
all_data["family_size"] = all_data.sibsp + all_data.parch + 1

#### Travelling Alone?

In [356]:
all_data["travelling_alone"] = all_data["family_size"] <= 1

#### Place on Ship

In [357]:
all_data["side"] = "unknown"
all_data["cabin_last_digit"] = all_data.cabin.apply(lambda x: int(0 if str(x)[-1:] in string.letters else str(x)[-1:]))
all_data["side"][all_data["cabin_last_digit"] % 2 == 0] = "port"
all_data["side"][all_data["cabin_last_digit"] % 2 != 0] = "starboard"
all_data["side"][all_data["cabin_last_digit"] == 0] = "unknown"
all_data.drop("cabin_last_digit", axis=1, inplace=True)

#### Classifying Fares

In [358]:
all_data["fare_bin"] = "<10"
all_data["fare_bin"][all_data["fare"] >= 10] = "10-20"
all_data["fare_bin"][all_data["fare"] >= 20] = "20-30"
all_data["fare_bin"][all_data["fare"] >= 30] = "30+"

#### Passenger Name Length

In [359]:
all_data["name_length"] = all_data["name"].apply(len)

#### Adding Variables Names

In [360]:
all_data["has_cabin_info"] = all_data["cabin"].apply(lambda x: 1 if x > 0 else 0)

#### zscore age

In [361]:
mean_age = all_data["age"].mean()
stdev_age = np.std(all_data["age"])
all_data["z_age"] = map(lambda x: (x - mean_age) / stdev_age, all_data["age"])
all_data["z_age"].count()

1309

#### age bin

In [362]:
all_data["age_bin"] = "<18"
all_data["age_bin"][all_data["age"] >= 16] = "18-50"
all_data["age_bin"][all_data["age"] >= 50] = "50+"

In [363]:
all_data

Unnamed: 0,age,cabin,embarked,fare,home.dest,name,parch,passenger_id,pclass,set_type,...,title,has_title,family_size,travelling_alone,side,fare_bin,name_length,has_cabin_info,z_age,age_bin
0,39.000000,C85,C,71.2833,"New York, NY","Cumings, Mr. John Bradley",0,0,1,training,...,,False,2,False,starboard,30+,25,1,0.735495,18-50
1,39.000000,,S,211.3375,,"Kreuchen, Miss. Emilie",0,1,1,training,...,,False,1,True,unknown,30+,22,0,0.735495,18-50
2,36.000000,,S,7.4958,"Bulgaria Chicago, IL","Coleff, Mr. Peju",0,2,3,training,...,,False,1,True,unknown,<10,16,0,0.506886,18-50
3,24.816367,,S,8.0500,,"Spector, Mr. Woolf",0,3,3,training,...,,False,1,True,unknown,<10,18,0,-0.345341,18-50
4,20.000000,,C,15.7417,,"Nakid, Mr. Sahid",1,4,3,training,...,,False,3,False,unknown,10-20,16,0,-0.712363,18-50
5,21.000000,B57 B59 B63 B66,C,262.3750,"Haverford, PA / Cooperstown, NY","Ryerson, Miss. Susan Parker ""Suzette""",2,5,1,training,...,,False,5,False,port,30+,37,1,-0.636160,18-50
6,16.000000,,S,7.6500,"Norway Los Angeles, CA","Abelseth, Miss. Karen Marie",0,6,3,training,...,,False,1,True,unknown,<10,27,0,-1.017175,18-50
7,24.816367,,Q,15.5000,,"Murphy, Miss. Katherine ""Kate""",0,7,3,training,...,,False,2,False,unknown,10-20,30,0,-0.345341,18-50
8,42.000000,,S,32.5000,"Greenport, NY","Drew, Mr. James Vivian",1,8,2,training,...,,False,3,False,unknown,30+,22,0,0.964104,18-50
9,31.000000,E39 E41,C,134.5000,,"Wilson, Miss. Helen Alice",0,9,1,training,...,,False,1,True,starboard,30+,25,1,0.125871,18-50


## Transform Variables

In [365]:
all_data.dtypes

age                  float64
cabin                 object
embarked              object
fare                 float64
home.dest             object
name                  object
parch                  int64
passenger_id           int64
pclass                 int64
set_type              object
sex                   object
sibsp                  int64
survived             float64
ticket                object
ending_in_na          object
female_high_class       bool
title                 object
has_title               bool
family_size            int64
travelling_alone        bool
side                  object
fare_bin              object
name_length            int64
has_cabin_info         int64
z_age                float64
age_bin               object
dtype: object

In [366]:
training_data = all_data[all_data["set_type"] == "training"]
testing_data = all_data[all_data["set_type"] == "testing"]
Y = training_data["survived"]

## Set Features

In [377]:
numeric_features = [col for col in all_data.columns if all_data[col].dtype in [float, int] and col != "survived"]
categorical_features = [col for col in all_data.columns if all_data[col].dtype in [object]]
numeric_features = ["pclass", "family_size", "female_high_class", "travelling_alone"]
categorical_features = ["sex", "fare_bin", "embarked", "age_bin"]

X_train_num = training_data[numeric_features].as_matrix()
X_test_num = testing_data[numeric_features].as_matrix()

training_data[categorical_features].fillna('NA', inplace=True)
testing_data[categorical_features].fillna('NA', inplace=True)

v = DictVectorizer(sparse = False).fit(all_data[categorical_features].T.to_dict().values())
X_train_cat = v.transform(training_data[categorical_features].T.to_dict().values())
X_test_cat = v.transform(testing_data[categorical_features].T.to_dict().values())

# Combine Numeric and Categorical

X_train = np.hstack(( X_train_num, X_train_cat ))
X_test = np.hstack(( X_test_num, X_test_cat ))

print X_train.shape, X_train_num.shape, X_train_cat.shape
print X_test.shape, X_test_num.shape, X_test_cat.shape

(785, 16) (785, 4) (785, 12)
(524, 16) (524, 4) (524, 12)


### SVM Model

#### Train the model

In [378]:
# kf_total = cross_validation.KFold(len(X_train), n_folds=5, shuffle=True, random_state=4)
clf = svm.SVC()
# clf = svm.LinearSVC()
# clf = svm.SVC(kernel='poly', gamma=3) 
parameter_grid = {
    'kernel' : ['rbf'],
    'gamma': [0.0,0.1,0.01],
    'C': [1., 10., 100.]
}
clfgs = grid_search.GridSearchCV(estimator=clf, param_grid=parameter_grid, n_jobs=1, cv=5) # , verbose=3
clfgs.fit(X_train, Y)
print clfgs.best_estimator_
print clfgs.best_score_

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.1,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
0.829299363057


#### Predict on the training set

In [379]:
train_predictions = clfgs.predict(X_train)
train_predictions = zip(training_data["passenger_id"], train_predictions.astype(int))
output_columns = "passenger_id", "survived"
if hack:
    output_columns = "PassengerId", "Survived"
train_predictions = pd.DataFrame(train_predictions, columns=output_columns)
print train_predictions.sum()
print len(train_predictions)

passenger_id    307720
survived           256
dtype: int64
785


In [380]:
train_predictions.to_csv(home_dir + "train_rbfsvm_c10_g01.csv", index=False)

#### Predict on the test set

In [381]:
test_predictions = clfgs.predict(X_test)
final_predictions = zip(testing_data["passenger_id"],test_predictions.astype(int))
output_columns = "passenger_id", "survived"
if hack:
    output_columns = "PassengerId", "Survived"
final_predictions = pd.DataFrame(final_predictions, columns=output_columns)
print final_predictions.sum()
print len(final_predictions)

passenger_id    548366
survived           207
dtype: int64
524


In [382]:
final_predictions.to_csv(home_dir + "9_rbfsvm_c100_g01.csv", index=False)

# Everything Below is Experimentation

## Random Forests

#### http://scikit-learn.org/stable/modules/ensemble.html#parameters

In [383]:
# Create the random forest object which will include all the parameters
# for the fit
n_features = len(categorical_features) + len(numeric_features)
forest = RandomForestClassifier(n_estimators = 1000, max_features=int(round(sqrt(n_features))), max_depth=None, min_samples_split=1)

scores = sk.cross_validation.cross_val_score(forest, X_train, Y)
print scores.mean()

0.793569628367


In [384]:
# Fit the training data to the Survived labels and create the decision trees
forest = forest.fit(X_train,Y)

# Take the same decision trees and run it on the test data
test_predictions = forest.predict(X_test)

In [385]:
final_predictions = zip(testing_data["passenger_id"],test_predictions.astype(int))
output_columns = "passenger_id", "survived"
if hack:
    output_columns = "PassengerId", "Survived"
final_predictions = pd.DataFrame(final_predictions, columns=output_columns)
print final_predictions.sum()
print len(final_predictions)

passenger_id    548366
survived           193
dtype: int64
524


In [386]:
final_predictions.to_csv(home_dir + "7_random_forest.csv", index=False)

In [387]:
train_predictions = forest.predict(X_train)
train_predictions = zip(training_data["passenger_id"], train_predictions.astype(int))
output_columns = "passenger_id", "survived"
if hack:
    output_columns = "PassengerId", "Survived"
train_predictions = pd.DataFrame(train_predictions, columns=output_columns)
print train_predictions.sum()
print len(train_predictions)

passenger_id    307720
survived           249
dtype: int64
785


In [388]:
train_predictions.to_csv(home_dir + "train_random_forest.csv", index=False)

## Adaboost

In [404]:
clf = AdaBoostClassifier(n_estimators=100)

# Fit the training data to the Survived labels and create the decision trees
clf = clf.fit(X_train,Y)

# Take the same decision trees and run it on the test data
test_predictions = clf.predict(X_test)

scores = sk.cross_validation.cross_val_score(clf, X_train, Y)
scores.mean() 

0.80376731786198163

In [405]:
final_predictions = zip(testing_data["passenger_id"], test_predictions.astype(int))
output_columns = "passenger_id", "survived"
if hack:
    output_columns = "PassengerId", "Survived"
final_predictions = pd.DataFrame(final_predictions, columns=output_columns)
print final_predictions.sum()
print len(final_predictions)

passenger_id    548366
survived           210
dtype: int64
524


In [409]:
final_predictions.to_csv(home_dir + "1_adaboost.csv", index=False)

In [410]:
train_predictions = clf.predict(X_train)
train_predictions = zip(training_data["passenger_id"], train_predictions.astype(int))
output_columns = "passenger_id", "survived"
if hack:
    output_columns = "PassengerId", "Survived"
train_predictions = pd.DataFrame(train_predictions, columns=output_columns)
print train_predictions.sum()
print len(train_predictions)

passenger_id    307720
survived           251
dtype: int64
785


In [411]:
train_predictions.to_csv(home_dir + "train_1_adaboost.csv", index=False)

## Decision Trees

In [412]:
dt = DecisionTreeClassifier()
dt = dt.fit(X_train, Y)
test_predictions = dt.predict(X_test)

In [413]:
final_predictions = zip(testing_data["passenger_id"],test_predictions.astype(int))
output_columns = "passenger_id", "survived"
if hack:
    output_columns = "PassengerId", "Survived"
final_predictions = pd.DataFrame(final_predictions, columns=output_columns)
print final_predictions.sum()
print len(final_predictions)

passenger_id    548366
survived           193
dtype: int64
524


In [414]:
final_predictions.to_csv(home_dir + "2_decision_tree.csv", index=False)

In [415]:
train_predictions = dt.predict(X_train)
train_predictions = zip(training_data["passenger_id"], train_predictions.astype(int))
output_columns = "passenger_id", "survived"
if hack:
    output_columns = "PassengerId", "Survived"
train_predictions = pd.DataFrame(train_predictions, columns=output_columns)
print train_predictions.sum()
print len(train_predictions)

passenger_id    307720
survived           243
dtype: int64
785


In [416]:
train_predictions.to_csv(home_dir + "train_2_decisiontree.csv", index=False)

## Archive

In [None]:
cross_validation.cross_val_score(clfgs, categorical_training_data, training_data["survived"], cv=kf_total, n_jobs = 1)