In [None]:
# Useful imports
import numpy as np
import pandas as pd

In [None]:
# Load the training set from CSV; the first row of the file supplies the column names
df = pd.read_csv(
    './datasets/train.csv',
    header=0,
)

# Print the column list together with dtypes and non-null counts
df.info()

In [None]:
# Preview the first rows only; displaying the whole frame bloats the cell output
df.head()

In [None]:
# Get statistics of survived people based on gender.
# Every passenger whose 'Sex' is not exactly 'female' (including a missing value)
# is counted in the "men" group, as the != comparison does.
is_female = df['Sex'] == 'female'

# Builtin float is used because the np.float alias was removed in NumPy 1.24.
# Keeping the sums as floats also keeps the divisions below true divisions on
# both Python 2 and Python 3.
survived = df['Survived'].astype(float)

number_passengers = len(df['PassengerId'])
number_survived = survived.sum()
proportion_survivors = number_survived / number_passengers

# Number of survivors in each group
women_only_stats = survived[is_female].sum()
men_only_stats = survived[~is_female].sum()

# Head count of each group
women_onboard = int(is_female.sum())
men_onboard = int((~is_female).sum())

proportion_women_survived = women_only_stats / women_onboard
proportion_men_survived = men_only_stats / men_onboard

# Single-argument print(...) produces the same output on Python 2 and Python 3
print('Passengers: %d' % number_passengers)
print('Survived: %d' % number_survived)
print('Proportion: %f' % proportion_survivors)
print('---')
print('Women onboard: %d' % women_onboard)
print('Men onboard: %d' % men_onboard)
print('Women survived: %d' % women_only_stats)
print('Men survived: %d' % men_only_stats)
print('Women proportion: %f' % proportion_women_survived)
print('Men proportion: %f' % proportion_men_survived)

In [None]:
# Load the test set from CSV; the first row of the file supplies the column names
df_test = pd.read_csv(
    './datasets/test.csv',
    header=0,
)

# Print the column list together with dtypes and non-null counts
df_test.info()

In [None]:
# Baseline gender model: every woman is predicted to survive, every man is not.
# The frame is assembled in one step; `columns` pins the column order of the CSV.
df_model0 = pd.DataFrame(
    {
        'PassengerId': df_test['PassengerId'],
        'Survived': df_test['Sex'].map({'female': 1, 'male': 0}).astype(int),
    },
    columns=['PassengerId', 'Survived'],
)

# Persist the predictions without the index column
df_model0.to_csv('./results/genderbasedmodel-pandas.csv', index=False)

In [None]:
# Prepare to train model based on gender, class and fare
fare_ceiling = 40
fare_bracket_size = 10

# Collapse every fare at or above the ceiling to just below it, so that it falls
# into the top bracket; smaller and missing fares are left untouched.
df['LimitedFare'] = df['Fare']
df.loc[df['LimitedFare'] >= fare_ceiling, 'LimitedFare'] = fare_ceiling - 1.0

# Floor division keeps the bracket count an int on Python 3 as well as Python 2;
# a float here would be rejected by np.zeros as a dimension and by range().
number_of_price_brackets = fare_ceiling // fare_bracket_size

number_of_classes = len(np.unique(df['Pclass']))

# Axes: [gender, class - 1, fare bracket]; the gender axis has size 2
survival_table = np.zeros((2, number_of_classes, number_of_price_brackets))

In [None]:
# Train (fill the survival_table): each cell [gender, class - 1, fare bracket]
# receives the mean of 'Survived' over the matching training passengers.
# Gender index 0 holds the 'female' rows, index 1 everything else.
is_female = df['Sex'] == 'female'

# range() works on both Python 2 and Python 3 (xrange is Python 2 only)
for i in range(number_of_classes):
    # assumes classes are labelled 1..number_of_classes -- TODO confirm against the data
    in_class = df['Pclass'] == i + 1

    for j in range(number_of_price_brackets):
        # Bracket j covers fares in [j * size, (j + 1) * size)
        in_cell = (
            in_class
            & (df['LimitedFare'] >= j * fare_bracket_size)
            & (df['LimitedFare'] < (j + 1) * fare_bracket_size)
        )

        women_only_stats = df.loc[in_cell & is_female, 'Survived']
        men_only_stats = df.loc[in_cell & ~is_female, 'Survived']

        # Builtin float replaces the np.float alias that was removed in NumPy 1.24
        survival_table[0, i, j] = women_only_stats.astype(float).mean()
        survival_table[1, i, j] = men_only_stats.astype(float).mean()

# A cell with no matching passengers has a NaN mean; treat it as 0
survival_table[np.isnan(survival_table)] = 0

# Set probabilities of survival to 0 or 1
survival_table[survival_table >= 0.5] = 1
survival_table[survival_table < 0.5] = 0

In [None]:
# Set gender as 0 or 1 according to sex (used as the first index into survival_table)
df_test['Gender'] = df_test['Sex'].map({'female': 0, 'male': 1})

# Set fare bin for each test record (from 0 to number_of_price_brackets-1)
fare_missing = df_test['Fare'].isnull()

# No fare recorded: derive the bin from the passenger class instead (bin = 3 - Pclass).
# NOTE(review): the literal 3 presumably stands for the number of classes -- confirm.
df_test.loc[fare_missing, 'FareBin'] = 3 - df_test.loc[fare_missing, 'Pclass']

# Fares at or above the ceiling go to the top bin. Comparisons with a missing
# fare are False, so no explicit notnull() guard is needed.
df_test.loc[df_test['Fare'] >= fare_ceiling, 'FareBin'] = number_of_price_brackets - 1

# range() works on both Python 2 and Python 3 (xrange is Python 2 only)
for i in range(number_of_price_brackets):
    # Bin i covers fares in [i * size, (i + 1) * size)
    in_bracket = (
        (df_test['Fare'] >= i * fare_bracket_size)
        & (df_test['Fare'] < (i + 1) * fare_bracket_size)
    )
    df_test.loc[in_bracket, 'FareBin'] = i

df_test['FareBin'] = df_test['FareBin'].astype(int)

df_test.info()

# Use gender, class and fare bin to predict who will survive.
# The table is indexed with plain integer arrays rather than pandas Series so
# that the lookup is ordinary NumPy element-wise fancy indexing.
predictions = survival_table[
    df_test['Gender'].values,
    df_test['Pclass'].values - 1,
    df_test['FareBin'].values,
]

df_model1 = pd.DataFrame(
    {
        'PassengerId': df_test['PassengerId'],
        'Survived': predictions.astype(int),
    },
    columns=['PassengerId', 'Survived'],
)

# Write results to file
df_model1.to_csv('./results/genderclassmodel-pandas.csv', index=False)