In [1]:
import numpy as np
import pandas as pd

from IPython.display import display
%matplotlib inline

# Load the dataset
in_file = 'titanic_data.csv'
full_data = pd.read_csv(in_file)

# Print the first few entries of the RMS Titanic data
display(full_data.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [2]:
outcomes = full_data['Survived']
data = full_data.drop('Survived', axis = 1)

# Show the new dataset with 'Survived' removed
display(data.head())

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [3]:
def accuracy_score(truth, pred):
    """ Returns accuracy score for input truth and predictions. """
    
    # Ensure that the number of predictions matches number of outcomes
    if len(truth) == len(pred): 
        # Calculate and return the accuracy as a percent
        return "Predictions have an accuracy of {:.2f}%.".format((truth == pred).mean()*100)
    else:
        return "Number of predictions does not match number of outcomes!"
    
# Test the 'accuracy_score' function
predictions = pd.Series(np.ones(5, dtype = int))
print accuracy_score(outcomes[:5], predictions)

Predictions have an accuracy of 60.00%.


In [5]:
def predictions_0(data):
    """ Model with no features. Always predicts a passenger did not survive. """

    predictions = []
    for _, passenger in data.iterrows():
        
        # Predict the survival of 'passenger'
        predictions.append(0)
    
    # Return our predictions
    return pd.Series(predictions)

# Make the predictions
predictions = predictions_0(data)
print accuracy_score(outcomes, predictions)

Predictions have an accuracy of 59.38%.


In [6]:
def predictions_1(data):
    """ Model with one feature: 
            - Predict a passenger survived if they are female. """
    
    predictions = []
    for _, passenger in data.iterrows():
        
        # Remove the 'pass' statement below 
        # and write your prediction conditions here
        if passenger['Sex'] == "male":
            predictions.append(0)
        else:
            predictions.append(1)
    
    # Return our predictions
    return pd.Series(predictions)

# Make the predictions
predictions = predictions_1(data)
print accuracy_score(outcomes, predictions)

Predictions have an accuracy of 78.01%.


In [7]:
def predictions_2(data):
    """ Model with two features: 
            - Predict a passenger survived if they are female.
            - Predict a passenger survived if they are male and younger than 10. """
    
    predictions = []
    for _, passenger in data.iterrows(): 
        # Inferring from the visualization plot between age and number of persons for male and female
        if passenger['Sex'] == "male":
            if passenger['Age'] < 10:
                predictions.append(1)
            else:
                predictions.append(0)
        else:
            predictions.append(1)
    
    # Return our predictions
    return pd.Series(predictions)

# Make the predictions
predictions = predictions_2(data)
print accuracy_score(outcomes, predictions)

Predictions have an accuracy of 78.85%.


In [8]:
def predictions_3(data):
    """ Model with multiple features. Makes a prediction with an accuracy of at least 80%. """
    
    predictions = []
    for _, passenger in data.iterrows():
        # Implementation of decision rules for Pclass and age inferred from visualization plots
        if passenger['Sex'] == "male":
            if (passenger['Age'] < 38) and (passenger['Pclass']==1) :
                predictions.append(1)
            elif (passenger['Age'] < 18) and (passenger['Pclass']==2):
                predictions.append(1)
            else:
                predictions.append(0)    
        else:
            if passenger['Pclass']==1 or passenger['Pclass']==2:
                predictions.append(1)
            elif passenger['Pclass']==1 and passenger['Age']<15:
                predictions.append(0)
            elif passenger['Pclass']==3 and passenger['Age'] < 2:
                predictions.append(1)
            else:
                predictions.append(0)
    # Return our predictions
    return pd.Series(predictions)

# Make the predictions
predictions = predictions_3(data)
print accuracy_score(outcomes, predictions)

Predictions have an accuracy of 81.23%.


In [10]:
def predictions_4(data):
    
    predictions = []
    for _,passenger in data.iterrows():
        # Additional rules for Fare inferred from visualization
        if passenger['Sex'] == "male":
            if passenger['Fare'] > 153:
                predictions.append(1)
            elif (passenger['Age'] < 38) and (passenger['Pclass']==1) :
                predictions.append(1)
            elif (passenger['Age'] < 18) and (passenger['Pclass']==2):
                predictions.append(1)
            else:
                predictions.append(0)    
        else:
            if passenger['Fare'] > 151:
                predictions.append(1)
            elif passenger['Pclass']==1 or passenger['Pclass']==2:
                predictions.append(1)
            elif passenger['Pclass']==1 and passenger['Age']<15:
                predictions.append(0)
            elif passenger['Pclass']==3 and passenger['Age'] < 2:
                predictions.append(1)
            else:
                predictions.append(0)
    # Return our predictions
    return pd.Series(predictions)

# Make the predictions
predictions = predictions_3(data)
print accuracy_score(outcomes, predictions)

Predictions have an accuracy of 81.23%.
