In [1]:
# Useful imports
import csv as csv
import pandas as pd
import numpy as np

# Import the linear regression class
from sklearn.linear_model import LinearRegression

# Sklearn also has a helper that makes it easy to do cross validation
from sklearn.cross_validation import KFold

from sklearn import cross_validation

from sklearn.linear_model import LogisticRegression

In [2]:
# Load the Titanic training data (path is relative to the notebook's working directory)
titanic = pd.read_csv('./datasets/train.csv')

# Quick look at the data: the first five rows, a separator, then the describe() summary.
# The parenthesised single-argument form of print produces the same output on Python 2
# and is also valid on Python 3, where the bare print statement is a SyntaxError.
print(titanic.head(5))
print('------------------')
print(titanic.describe())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
--



In [3]:
# Missing ages are imputed with the median of the ages that are present
age_median = titanic['Age'].median()
titanic['Age'] = titanic['Age'].fillna(age_median)

In [4]:
# Show the distinct labels present in the column before encoding it
# (print(...) with one argument behaves the same on Python 2 and Python 3)
print(titanic['Sex'].unique())

# Encode the labels as numbers: every occurrence of 'male' becomes 0 and 'female' becomes 1.
# Rows are selected by label, so re-running the cell leaves already-encoded rows untouched.
for sex_label, sex_code in (('male', 0), ('female', 1)):
    titanic.loc[titanic['Sex'] == sex_label, 'Sex'] = sex_code

['male' 'female']


In [5]:
# Show the distinct values present in the column (including missing ones) before cleaning
# (print(...) with one argument behaves the same on Python 2 and Python 3)
print(titanic['Embarked'].unique())

# Fill empty values with 'S', the most common value according to the original author
titanic['Embarked'] = titanic['Embarked'].fillna('S')

# Replace each label with its numeric code: S -> 0, C -> 1, Q -> 2
for port_label, port_code in (('S', 0), ('C', 1), ('Q', 2)):
    titanic.loc[titanic['Embarked'] == port_label, 'Embarked'] = port_code

['S' 'C' 'Q' nan]


In [6]:
# Try linear regression model

# The columns we'll use to predict the target ('Survived')
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Initialize our algorithm class
alg = LinearRegression()

# Generate 3 cross validation folds for the titanic dataset. Iterating over kf yields
# (train, test) pairs of row positions.
# No random_state is passed: KFold only uses it when shuffle=True, so it had no effect here.
# Shuffling is deliberately left off, because the per-fold predictions are concatenated in
# fold order afterwards and compared position by position with titanic['Survived'], which
# only lines up when the test folds follow the original row order.
kf = KFold(titanic.shape[0], n_folds=3)

predictions = []

for train, test in kf:
    # Fit on the rows of the training folds only
    train_predictors = titanic[predictors].iloc[train, :]
    train_target = titanic['Survived'].iloc[train]
    alg.fit(train_predictors, train_target)

    # Predict the held-out fold and keep the result; one array per fold
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)

In [7]:
# The predictions are in three separate numpy arrays (one per fold). Concatenate them into one.
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)

# Map predictions to outcomes (only possible outcomes are 1 and 0)
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0

# Accuracy is the fraction of rows whose predicted outcome equals the actual one.
# The comparison uses the raw label values, so it is purely positional, and the mean of the
# resulting boolean array counts every match: correct 0s as well as correct 1s.
# Summing predictions[matches] instead adds up the predicted labels of the matching rows,
# so correctly predicted 0s contribute nothing to the total.
accuracy = np.mean(predictions == titanic['Survived'].values)
print(accuracy)

0.783389450056




In [8]:
# Try logistic regression model

# Initialize our algorithm
alg = LogisticRegression(random_state=1)

# Let cross_val_score handle the 3-fold split, fitting and scoring in one call;
# it returns one score per fold (much simpler than the manual loop used before)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)

# Average the per-fold scores into a single accuracy estimate.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(scores.mean())

0.787878787879


In [9]:
# Load the test dataset that predictions will be generated for
titanic_test = pd.read_csv('./datasets/test.csv')

# Clean the test data with the same steps that were applied to the training data

# Missing ages get the median age of the *training* set, i.e. the same value used there
train_age_median = titanic['Age'].median()
titanic_test['Age'] = titanic_test['Age'].fillna(train_age_median)

# Same numeric codes as in the training data: male -> 0, female -> 1
for sex_label, sex_code in (('male', 0), ('female', 1)):
    titanic_test.loc[titanic_test['Sex'] == sex_label, 'Sex'] = sex_code

# Missing ports become 'S', then S -> 0, C -> 1, Q -> 2
titanic_test['Embarked'] = titanic_test['Embarked'].fillna('S')
for port_label, port_code in (('S', 0), ('C', 1), ('Q', 2)):
    titanic_test.loc[titanic_test['Embarked'] == port_label, 'Embarked'] = port_code

# Missing fares are filled with the median fare of the test set itself
# (the original author notes that the training set has no missing fares)
test_fare_median = titanic_test['Fare'].median()
titanic_test['Fare'] = titanic_test['Fare'].fillna(test_fare_median)

In [10]:
# Use logistic regression

# Create the classifier and train it on all of the training data
# (fit returns the fitted estimator, so the two steps can be chained)
alg = LogisticRegression(random_state=1).fit(titanic[predictors], titanic['Survived'])

# Predict the outcome for every row of the cleaned test set
predictions = alg.predict(titanic_test[predictors])

# Build a frame holding only the columns Kaggle wants from the dataset
submission_columns = {
    'PassengerId': titanic_test['PassengerId'],
    'Survived': predictions,
}
submission = pd.DataFrame(submission_columns)

# Save to file, leaving out the index column
submission.to_csv('./results/logisticregressionmodel.csv', index=False)