In [142]:
# following the kaggle competition tutorial from https://www.dataquest.io/mission/74/getting-started-with-kaggle/
import matplotlib
import pandas as pd
import os.path
import numpy as np

In [143]:
# Locate the Kaggle Titanic training set on disk and read it into a DataFrame.
fileName = "train.csv"
pathName = "C:/GIT/kaggle/titanic"
filePath = os.path.join(pathName, fileName)
titanic = pd.read_csv(filePath)

In [144]:
# Preview the first ten rows to see which columns the dataset provides.
print(titanic.head(10))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name     Sex  Age  SibSp  \
0                            Braund, Mr. Owen Harris    male   22      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1   
2                             Heikkinen, Miss. Laina  female   26      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   35      1   
4                           Allen, Mr. William Henry    male   35      0   
5                                   Moran, Mr. James    male  NaN      0   
6                            McCarthy, Mr. Timothy J    male   54      

In [145]:
# Summary statistics per column; a "count" lower than the others flags missing values.
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [146]:
# Inspect the Age column to see its missing (NaN) entries.
titanic["Age"]


0     22
1     38
2     26
3     35
4     35
5    NaN
6     54
7      2
8     27
9     14
10     4
11    58
12    20
13    39
14    14
...
876    20
877    19
878   NaN
879    56
880    25
881    33
882    22
883    28
884    25
885    39
886    27
887    19
888   NaN
889    26
890    32
Name: Age, Length: 891, dtype: float64

In [147]:
# Age has missing entries (NaN); impute them with the median of the known ages.
median_age = titanic["Age"].median()
titanic["Age"] = titanic["Age"].fillna(median_age)

In [148]:
# Spot-check the first ten ages to confirm the NaN entries were filled in.
titanic["Age"].head(10)

0    22
1    38
2    26
3    35
4    35
5    28
6    54
7     2
8    27
9    14
Name: Age, dtype: float64

In [149]:
# Replace the Sex labels with integer codes so the column can be used as a model
# feature: male -> 0, female -> 1.
for label, code in (("male", 0), ("female", 1)):
    titanic.loc[titanic["Sex"] == label, "Sex"] = code

In [150]:
# Sanity check: after the encoding, only the codes 0 and 1 should remain in the column.
print(titanic["Sex"].unique())

[0 1]


In [151]:
# Look at the first five encoded values.
titanic["Sex"].head(5)


0    0
1    1
2    1
3    1
4    0
Name: Sex, dtype: object

In [152]:
# List the distinct Embarked values to see which labels (and any NaN) need handling.
print(titanic["Embarked"].unique())

['S' 'C' 'Q' nan]


In [153]:
# Treat a missing port as "S", then replace the port labels with integer codes:
# S -> 0, C -> 1, Q -> 2.
titanic["Embarked"] = titanic["Embarked"].fillna("S")
for port, code in (("S", 0), ("C", 1), ("Q", 2)):
    titanic.loc[titanic["Embarked"] == port, "Embarked"] = code

In [154]:
# Re-run the summary after cleaning; the Age count should now match the other columns.
titanic.describe()


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [155]:
# Import the linear regression class
from sklearn.linear_model import LinearRegression
# Sklearn also has a helper that makes it easy to do cross validation
# NOTE(review): the `sklearn.cross_validation` module only exists in older
# scikit-learn releases; newer releases provide KFold from `sklearn.model_selection`
# with a different constructor signature -- confirm the installed version before
# re-running this notebook.
from sklearn.cross_validation import KFold

# The columns we'll use to predict the target (the "Survived" column)
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

In [156]:
# Feature columns the model is trained on.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Linear regression model that will be re-fitted on each cross-validation fold.
alg = LinearRegression()

# Three cross-validation folds over every row of the training frame.  Iterating over
# `kf` yields (train, test) pairs of row positions.
# random_state=1 is passed to keep the splits reproducible between runs.
kf = KFold(len(titanic), n_folds=3, random_state=1)

In [157]:
# Collect one array of held-out predictions per cross-validation fold.
predictions = []

# Select the feature matrix and the target once; every fold indexes into the same data.
features = titanic[predictors]
target = titanic["Survived"]

for train, test in kf:
    # Fit only on the rows that belong to this fold's training split ...
    alg.fit(features.iloc[train], target.iloc[train])
    # ... then predict the rows that were held out for this fold.
    predictions.append(alg.predict(features.iloc[test]))


In [158]:
# Each fold produced its own numpy array of predictions.  Join them along axis 0
# (their only axis) into a single array.
# numpy was already imported as np in the first cell.
predictions = np.concatenate(predictions, axis=0)

# Linear regression emits continuous scores; threshold them at 0.5 so the only
# possible outcomes are 1 and 0.
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0

# An earlier hand-written loop used `for i in predictions` and then `predictions[i]`.
# That iterates over the predicted *values* (floats 0.0 / 1.0), not over row
# positions, so it never compared matching rows.  The vectorised comparison below
# replaces it.

# Accuracy = fraction of rows whose prediction equals the true label.
# The previous expression summed the values of the predictions selected by the match
# mask, so under boolean masking only correct "1" predictions were counted and correct
# "0" predictions contributed nothing.  Averaging the boolean match vector counts both.
# NOTE(review): the positional comparison assumes the folds cover the rows in their
# original order (no shuffling) -- confirm if the fold setup changes.
accuracy = np.mean(predictions == titanic["Survived"].values)
print(accuracy)
    

0.7833894500561167




In [159]:
# See whether logistic regression scores better than the linear-regression attempt.
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation

alg = LogisticRegression(random_state=1)

# cross_val_score runs the fold loop itself (cv=3), which is much simpler than the
# manual loop used for linear regression.
features = titanic[predictors]
target = titanic["Survived"]
scores = cross_validation.cross_val_score(alg, features, target, cv=3)

# Report the mean of the per-fold scores.
print(scores.mean())

0.787878787879


In [160]:
# Load the Kaggle test set; it goes through the same preparation as the training data
# so that a submission for the competition can be generated.
fileName = "test.csv"
pathName = "C:/GIT/kaggle/titanic"
filePath = os.path.join(pathName, fileName)
titanic_test = pd.read_csv(filePath)

In [161]:
# Prepare the test set with the same steps that were applied to the training set, so
# the fitted model receives features in the encoding it was trained on.

# Missing numeric values are filled with medians taken from the TRAINING set, since
# that is the data the model was trained on.  Fare previously used the test set's own
# median, which was inconsistent with the Age imputation; both now use training data.
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].median())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic["Fare"].median())

# Replace string columns with the same integer codes used for the training data.

# Sex column: male -> 0, female -> 1
for label, code in (("male", 0), ("female", 1)):
    titanic_test.loc[titanic_test["Sex"] == label, "Sex"] = code

# Embarked column: treat a missing port as "S", then S -> 0, C -> 1, Q -> 2
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")
for port, code in (("S", 0), ("C", 1), ("Q", 2)):
    titanic_test.loc[titanic_test["Embarked"] == port, "Embarked"] = code





In [165]:
# Fresh logistic-regression model for the final submission.
alg = LogisticRegression(random_state=1)

features = titanic[predictors]
target = titanic["Survived"]

# Fit on the complete training set.
alg.fit(features, target)

# Cross-validated score over 3 folds, printed as the mean of the per-fold scores.
scores = cross_validation.cross_val_score(alg, features, target, cv=3)
print(scores.mean())

# Predict the outcome for every row of the prepared test set.
predictions = alg.predict(titanic_test[predictors])

# Build a frame holding only the two columns Kaggle wants: the passenger id and the
# predicted label.
submission = pd.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions
    })

# Write the submission file for upload to Kaggle (without the DataFrame index).
submission.to_csv("C:/GIT/kaggle/titanic/kaggle.csv", index=False)

0.787878787879
