In [70]:
import csv as csv
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier 

In [71]:
# training data
train_df = pd.read_csv('data/train.csv',header=0)

In [72]:
train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [73]:
# Store survived feature in a new variable and remove it from the data
outcomes = train_df['Survived']
train_df = train_df.drop('Survived',axis=1)

# with the separation and pandas these are still tied together, so train_df.loc[i] has the survival outcome of outcome[i]

In [74]:
train_df.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [75]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


In [76]:
train_df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,,0.0,0.0,7.9104
50%,446.0,3.0,,0.0,0.0,14.4542
75%,668.5,3.0,,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [77]:
# want to convert strings to integer classifiers
#age:
train_df['Gender'] = train_df['Sex'].map({'female':0,'male':1}).astype(int)

#embarked from 'C','Q','S', making as numbers isnt ideal because it implies some importance or order
#missings values in embarked...want to fill them in with similar other embarkees 
if len(train_df.Embarked[train_df.Embarked.isnull()]) > 0:
    train_df.Embarked[train_df.Embarked.isnull()] = train_df.Embarked.dropna().mode().values

Ports = list(enumerate(np.unique(train_df['Embarked'])))
Ports_dict = {name: i for i, name in Ports}
train_df.Embarked = train_df.Embarked.map(lambda x: Ports_dict[x]).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [78]:
train_df.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Gender
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,2,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,0,0


In [79]:
# fill in missing values for age
median_ages = np.zeros((2,3))
for i in range(0,2):
    for j in range(0,3):
        median_ages[i,j] = train_df[(train_df['Pclass'] == j+1) & (train_df['Gender'] == i)]['Age'].dropna().median()
median_ages

array([[ 35. ,  28. ,  21.5],
       [ 40. ,  30. ,  25. ]])

In [80]:
# fill in the median ages for age
for i in range(0,2):
    for j in range(0,3):
        train_df.loc[(train_df.Age.isnull()) & (train_df.Gender == i) & (train_df.Pclass == j+1),'Age'] = median_ages[i,j]

In [81]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null int64
Gender         891 non-null int64
dtypes: float64(2), int64(6), object(4)
memory usage: 83.6+ KB


In [82]:
# drop the columns Im not going to be using for the classification
train_df_clean = train_df.drop(['Name','PassengerId','Sex','Ticket','Cabin'],axis=1)

<h3>Add new variables</h3>

In [99]:
train_df_clean.head(5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Gender,FamilySize
0,3,22.0,1,0,7.25,2,1,1
1,1,38.0,1,0,71.2833,0,0,1
2,3,26.0,0,0,7.925,2,0,0
3,1,35.0,1,0,53.1,2,0,1
4,3,35.0,0,0,8.05,2,1,0


In [98]:
train_df_clean['FamilySize'] = train_df_clean['Parch'] + train_df_clean['SibSp']

In [83]:
train_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null int64
Gender      891 non-null int64
dtypes: float64(2), int64(5)
memory usage: 48.8 KB


<h1>Test Data</h1>

In [102]:
# now need to make the test data match the new training data
test_df = pd.read_csv('data/test.csv',header=0)
test_df['Gender'] = test_df['Sex'].map({'female':0,'male':1}).astype(int)

#embarkation point:
if len(test_df.Embarked[test_df.Embarked.isnull()]) > 0:
    test_df.Embarked[test_df.Embarked.isnull()] = test_df.Embarked.dropna().mode().values

Ports = list(enumerate(np.unique(test_df['Embarked'])))
Ports_dict = {name: i for i, name in Ports}
test_df.Embarked = test_df.Embarked.map(lambda x: Ports_dict[x]).astype(int)

#Median ages
median_ages = np.zeros((2,3))
for i in range(0,2):
    for j in range(0,3):
        median_ages[i,j] = test_df[(test_df['Pclass'] == j+1) & (test_df['Gender'] == i)]['Age'].dropna().median()
median_ages
# fill in the median ages for age
for i in range(0,2):
    for j in range(0,3):
        test_df.loc[(test_df.Age.isnull()) & (test_df.Gender == i) & (test_df.Pclass == j+1),'Age'] = median_ages[i,j]

# missing fares
if len(test_df.Fare[ test_df.Fare.isnull() ]) > 0:
    median_fare = np.zeros(3)
    for f in range(0,3):                                              # loop 0 to 2
        median_fare[f] = test_df[ test_df.Pclass == f+1 ]['Fare'].dropna().median()
    for f in range(0,3):                                              # loop 0 to 2
        test_df.loc[ (test_df.Fare.isnull()) & (test_df.Pclass == f+1 ), 'Fare'] = median_fare[f]
        
        
# adding FamilySize
test_df['FamilySize'] = test_df['Parch'] + test_df['SibSp']

In [103]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 13 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null int64
Gender         418 non-null int64
FamilySize     418 non-null int64
dtypes: float64(2), int64(7), object(4)
memory usage: 42.5+ KB


In [104]:
# collect the test data's PassengerIds before dropping
ids = test_df['PassengerId'].values
# Remove the unneeded columns
test_df_clean = test_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1) 

<h1>Training and prediction functions</h1>

In [106]:
def train_classifier(clf, X_train, y_train):
    clf.fit(X_train,y_train)
    print 'Trained Model'
    return clf
    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
#     start = time()
    y_pred = clf.predict(features)
#     end = time()
    
    # Print and return results
#     print "Made predictions in {:.4f} seconds.".format(end - start)
    return f1_score(target.values, y_pred, pos_label=1)

<h1>Linear Regression</h1>

In [117]:
train_df_clean.head(2)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Gender,FamilySize
0,3,22.0,1,0,7.25,2,1,1
1,1,38.0,1,0,71.2833,0,0,1


In [118]:
# Import the linear regression class
from sklearn.linear_model import LinearRegression
# Sklearn also has a helper that makes it easy to do cross validation
from sklearn.cross_validation import KFold

# The columns we'll use to predict the target
predictors = ["Pclass", "Gender", "Age", "SibSp", "Parch", "Fare", "Embarked","FamilySize"]

# Initialize our algorithm class
alg = LinearRegression()
# Generate cross validation folds for the titanic dataset.  It return the row indices corresponding to train and test.
# We set random_state to ensure we get the same splits every time we run this.
kf = KFold(train_df_clean.shape[0], n_folds=10, random_state=1)

predictions = []
for train, test in kf:
    # The predictors we're using the train the algorithm.  Note how we only take the rows in the train folds.
    train_predictors = (train_df_clean[predictors].iloc[train,:])
    # The target we're using to train the algorithm.
    train_target = outcomes[train]
    # Training the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(train_df_clean[predictors].iloc[test,:])
    predictions.append(test_predictions)

In [122]:
#bring the predictions into one array
predictions = np.concatenate(predictions, axis=0)

In [123]:
len(predictions)

891

<h1>Logistic Regression with Cross fold validation</h1>

In [129]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegressionCV
# Initialize our algorithm
alg = LogisticRegressionCV(random_state=1,cv=10)
# Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
alg = alg.fit(train_df_clean,outcomes)

In [130]:
print "Tuned model has a training F1 score of {:.4f}.".format(predict_labels(alg, train_df_clean, outcomes))

Tuned model has a training F1 score of 0.7404.


<h1>SVM</h1>

In [93]:
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
#     start = time()
    y_pred = clf.predict(features)
#     end = time()
    
    # Print and return results
#     print "Made predictions in {:.4f} seconds.".format(end - start)
    return f1_score(target.values, y_pred, pos_label=1)

In [113]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn import svm

parameters = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]

#initialize classifier
clf = svm.SVC()

# make f1 scoring function using make_scorer
f1_scorer = make_scorer(f1_score,pos_label=1)

#grid search to get best parameters
grid_obj = GridSearchCV(clf, param_grid=parameters,scoring=f1_scorer)

#fit grid search object to training data to find optimal parameters
grid_obj = grid_obj.fit(train_df_clean,outcomes)

#get estimator
clf = grid_obj.best_estimator_

In [115]:
print "Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, train_df_clean, outcomes))

Tuned model has a training F1 score of 0.7888.


In [54]:
# start to train!
# convert back to numpy array
train_data = train_df_clean.values
test_data = test_df_clean.values

In [97]:
train_df_clean.head(4)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Gender
0,3,22.0,1,0,7.25,2,1
1,1,38.0,1,0,71.2833,0,0
2,3,26.0,0,0,7.925,2,0
3,1,35.0,1,0,53.1,2,0


In [60]:
# train_data
test_data

array([[  3.    ,  34.5   ,   0.    , ...,   7.8292,   1.    ,   1.    ],
       [  3.    ,  47.    ,   1.    , ...,   7.    ,   2.    ,   0.    ],
       [  2.    ,  62.    ,   0.    , ...,   9.6875,   1.    ,   1.    ],
       ..., 
       [  3.    ,  38.5   ,   0.    , ...,   7.25  ,   2.    ,   1.    ],
       [  3.    ,  24.    ,   0.    , ...,   8.05  ,   2.    ,   1.    ],
       [  3.    ,  24.    ,   1.    , ...,  22.3583,   0.    ,   1.    ]])

In [116]:
# print 'Training...'
# forest = RandomForestClassifier(n_estimators=100)
# forest = forest.fit( train_data[0::,1::], train_data[0::,0] )

print 'Predicting...'
output = clf.predict(test_df_clean)

predictions_file = open("models/svm.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(ids, output))
predictions_file.close()
print 'Done.'

Predicting...
Done.


<h1>KNN</h1>

In [107]:
from sklearn.neighbors import KNeighborsClassifier
clf_KNN = KNeighborsClassifier(n_neighbors=3) 
clf_KNN = train_classifier(clf_KNN,train_df_clean,outcomes)
print "Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf_KNN, train_df_clean, outcomes))

Trained Model
Tuned model has a training F1 score of 0.8006.


In [109]:
print 'Predicting...'
output = clf_KNN.predict(test_df_clean)

predictions_file = open("models/knn.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(ids, output))
predictions_file.close()
print 'Done.'

Predicting...
Done.


<h1>Naive Bayes</h1>

In [110]:
from sklearn.naive_bayes import GaussianNB

clf_NB = GaussianNB()
clf_NB = train_classifier(clf_NB,train_df_clean,outcomes)
print "Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf_NB, train_df_clean, outcomes))

Trained Model
Tuned model has a training F1 score of 0.7251.


In [111]:
print 'Predicting...'
output = clf_NB.predict(test_df_clean)

predictions_file = open("models/gnb.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(ids, output))
predictions_file.close()
print 'Done.'

Predicting...
Done.


In [58]:
def accuracy_score(truth, pred):
    """ Returns accuracy score for input truth and predictions. """
    
    # Ensure that the number of predictions matches number of outcomes
    if len(truth) == len(pred): 
        
        # Calculate and return the accuracy as a percent
        return "Predictions have an accuracy of {:.2f}%.".format((truth == pred).mean()*100)
    
    else:
        return "Number of predictions does not match number of outcomes!"

In [None]:
print accuracy_score(test_data[0:,0])