In [188]:
import numpy as np

In [189]:
import pandas as pd

In [190]:
from pomegranate import *

In [191]:
def p_survived(list_x):
    """Return the add-one smoothed survival rate for every column of ``list_x``.

    ``list_x`` is a DataFrame with a ``Survived`` column. For each column the
    result is (non-null survivor rows + 1) / (non-null rows + 2), so an empty
    frame yields 0.5 rather than a division by zero. Callers pick the entry
    for the column they care about, e.g. ``p_survived(frame)['Sex']``.
    """
    survivors = list_x.loc[list_x['Survived'] == 1]
    smoothed_survivors = survivors.count() + 1
    smoothed_total = list_x.count() + 2
    return smoothed_survivors / smoothed_total

In [192]:
# Prior over the outcome: 'survive' and 'perish' are given equal probability.
passenger = DiscreteDistribution( { 'survive': 0.5, 'perish': 0.5 } )

In [193]:
# Training rows used to estimate every survival rate below; the first CSV line is the header.
train_data = pd.read_csv('../data/titanic/train.csv', header=0)

In [194]:
# Training rows whose Sex is 'male'.
gen_male = train_data.loc[train_data['Sex'] == 'male']

In [195]:
# Training rows whose Sex is 'female'.
gen_female = train_data.loc[train_data['Sex'] == 'female']

In [196]:
# Smoothed survival rate among male rows; p_survived returns one value per column
# and the 'Sex' entry counts the rows where Sex is non-null.
p_gen_male = p_survived(gen_male)['Sex']

In [197]:
# Smoothed survival rate among female rows (the 'Sex' entry of p_survived's per-column result).
p_gen_female = p_survived(gen_female)['Sex']

In [198]:
# Display how many training rows fall in each gender subset.
len(gen_male), len(gen_female)

(449, 251)

In [199]:
# Gender table conditioned on the passenger outcome. Rows are emitted in the
# order: survive/male, survive/female, perish/male, perish/female.
# NOTE(review): the values are P(survive | sex) and 1 - P(survive | sex), so
# rows sharing the same parent value ('survive' or 'perish') do not sum to 1.
# Confirm against pomegranate's expected P(child | parent) layout.
gender_survival = (('male', p_gen_male), ('female', p_gen_female))
gender = ConditionalProbabilityTable(
    [['survive', sex, p_sex] for sex, p_sex in gender_survival] +
    [['perish', sex, 1 - p_sex] for sex, p_sex in gender_survival],
    [passenger])


In [200]:
# Training rows travelling in first class (Pclass == 1).
tclass_1 = train_data.loc[train_data['Pclass'] == 1]

In [201]:
# Training rows travelling in second class (Pclass == 2).
tclass_2 = train_data.loc[train_data['Pclass'] == 2]

In [202]:
# Training rows travelling in third class (Pclass == 3).
tclass_3 = train_data.loc[train_data['Pclass'] == 3]

In [203]:
# Smoothed survival rate for first class (the 'Pclass' entry of p_survived's per-column result).
p_tclass_1 = p_survived(tclass_1)['Pclass']

In [204]:
# Smoothed survival rate for second class.
p_tclass_2 = p_survived(tclass_2)['Pclass']

In [205]:
# Smoothed survival rate for third class.
p_tclass_3 = p_survived(tclass_3)['Pclass']

In [206]:
# Show the smoothed survival rates for first, second and third class.
# A single parenthesised argument prints the same text on Python 2 and is
# also valid on Python 3, where the bare print statement is a SyntaxError.
print("%s %s %s" % (p_tclass_1, p_tclass_2, p_tclass_3))

0.602339181287 0.489795918367 0.255154639175


In [207]:
# The median class justifies the fallback for missing/unknown Pclass values:
# find_class() maps anything that is not 1, 2 or 3 to 'third'.
train_data.Pclass.median()

3.0

In [208]:
# Ticket-class table conditioned on the passenger outcome. Rows are emitted as
# survive/first, survive/second, survive/third, then the matching perish rows.
# NOTE(review): the values are P(survive | class) and its complement, so rows
# sharing one parent value do not sum to 1. Confirm this is what pomegranate
# expects for a table whose parent is `passenger`.
class_survival = (
    ('first', p_tclass_1),
    ('second', p_tclass_2),
    ('third', p_tclass_3),
)
tclass = ConditionalProbabilityTable(
    [['survive', label, p_label] for label, p_label in class_survival] +
    [['perish', label, 1 - p_label] for label, p_label in class_survival],
    [passenger])


In [209]:
# Training rows with Age below 20 (rows with a missing Age never match).
age_1 = train_data.loc[train_data['Age'] < 20]

In [210]:
# Training rows with 20 <= Age < 40.
age_2 = train_data.loc[(train_data['Age'] >= 20) & ~(train_data['Age'] >= 40) & train_data['Age'].notnull()]

In [211]:
# Training rows with 40 <= Age < 60.
age_3 = train_data.loc[(train_data['Age'] < 60) & (train_data['Age'] >= 40)]

In [212]:
# Training rows with 60 <= Age < 80.
age_4 = train_data.loc[(train_data['Age'] < 80) & (train_data['Age'] >= 60)]

In [213]:
# Training rows with Age of 80 or more.
age_5 = train_data.loc[train_data['Age'] >= 80]

In [214]:
# Smoothed survival rate for the under-20 bucket (the 'Age' entry of p_survived's per-column result).
p_age_1 = p_survived(age_1)['Age']

In [215]:
# Smoothed survival rate for the 20-39 bucket.
p_age_2 = p_survived(age_2)['Age']

In [216]:
# Smoothed survival rate for the 40-59 bucket.
p_age_3 = p_survived(age_3)['Age']

In [217]:
# Smoothed survival rate for the 60-79 bucket.
p_age_4 = p_survived(age_4)['Age']

In [218]:
# Smoothed survival rate for the 80+ bucket; p_survived yields 0.5 if the bucket is empty.
p_age_5 = p_survived(age_5)['Age']

In [219]:
# The median age justifies the fallback bucket for missing ages:
# find_age() returns 'age_2' when no range comparison matches (e.g. NaN).
train_data.Age.median()

28.0

In [220]:
# Age-bucket table conditioned on the passenger outcome. Rows are emitted as
# survive/age_1 .. survive/age_5, followed by perish/age_1 .. perish/age_5.
# NOTE(review): the values are P(survive | bucket) and its complement, so rows
# sharing one parent value do not sum to 1. Confirm this is what pomegranate
# expects for a table whose parent is `passenger`.
age_survival = (
    ('age_1', p_age_1),
    ('age_2', p_age_2),
    ('age_3', p_age_3),
    ('age_4', p_age_4),
    ('age_5', p_age_5),
)
age = ConditionalProbabilityTable(
    [['survive', bucket, p_bucket] for bucket, p_bucket in age_survival] +
    [['perish', bucket, 1 - p_bucket] for bucket, p_bucket in age_survival],
    [passenger])

In [221]:
# Node wrapping the survival prior distribution.
s1 = State( passenger, name = "passenger" )

In [222]:
# Node for the gender table; its name matches the 'gender' key used in the observation dicts.
s2 = State( gender, name = "gender" )

In [223]:
# Node for the ticket-class table; its name matches the 'class' key used in the observation dicts.
s3 = State( tclass, name = "class" )

In [224]:
# Node for the age-bucket table; its name matches the 'age' key used in the observation dicts.
s4 = State( age, name = "age" )

In [225]:
# Empty Bayesian network; nodes and edges are attached in the following cells.
network = BayesianNetwork( "Titanic Disaster" )

In [226]:
# Register the four nodes. s1 (passenger) is listed first, and the prediction
# loops read the outcome belief from beliefs[0] -- presumably this ordering is
# what makes index 0 the passenger node; verify against pomegranate.
network.add_nodes( [ s1, s2, s3, s4 ] )

In [227]:
# Link passenger -> gender (passenger is the declared parent of the gender table).
network.add_edge( s1, s2 )

In [228]:
# Link passenger -> class (passenger is the declared parent of the class table).
network.add_edge( s1, s3 )

In [229]:
# Link passenger -> age (passenger is the declared parent of the age table).
network.add_edge( s1, s4 )

In [230]:
# Finalise the network once every node and edge has been added; all inference calls come after this.
network.bake()

In [245]:
# Labelled rows used to score the model: predictions are compared with this frame's Survived column below.
# NOTE(review): presumably a held-out split of train.csv with the same columns -- confirm.
train_test_data = pd.read_csv('../data/titanic/train_verify.csv', header=0)

In [246]:
# Predict survival (1) or not (0) for every verification row by conditioning
# the network on the passenger's gender, age bucket and ticket class.
#
# Columns are read by name instead of by position in `.values`, so the loop no
# longer depends on the column order of the CSV.
# NOTE: find_age / find_class must already exist in the kernel; their defining
# cells appear further down in this notebook, so run those first.
train_test_res = []
for sex, age_value, pclass in zip(train_test_data['Sex'],
                                  train_test_data['Age'],
                                  train_test_data['Pclass']):
    observations = {'gender': sex, 'age': find_age(age_value), 'class': find_class(pclass)}
    beliefs = network.forward_backward(observations)
    # beliefs[0] is read as the belief for the 'passenger' node, and
    # parameters[0] as its {'survive': p, 'perish': q} mapping.
    res = beliefs[0].parameters[0]
    train_test_res.append(1 if res['survive'] > res['perish'] else 0)

{'gender': 'female', 'age': 'age_1', 'class': 'first'}
{'gender': 'male', 'age': 'age_2', 'class': 'first'}
{'gender': 'female', 'age': 'age_1', 'class': 'third'}
{'gender': 'male', 'age': 'age_2', 'class': 'third'}
{'gender': 'male', 'age': 'age_2', 'class': 'third'}
{'gender': 'male', 'age': 'age_2', 'class': 'second'}
{'gender': 'female', 'age': 'age_3', 'class': 'second'}
{'gender': 'male', 'age': 'age_3', 'class': 'first'}
{'gender': 'female', 'age': 'age_2', 'class': 'first'}
{'gender': 'male', 'age': 'age_2', 'class': 'third'}
{'gender': 'female', 'age': 'age_2', 'class': 'first'}
{'gender': 'male', 'age': 'age_2', 'class': 'first'}
{'gender': 'male', 'age': 'age_3', 'class': 'first'}
{'gender': 'male', 'age': 'age_2', 'class': 'third'}
{'gender': 'male', 'age': 'age_3', 'class': 'second'}
{'gender': 'male', 'age': 'age_1', 'class': 'third'}
{'gender': 'female', 'age': 'age_2', 'class': 'first'}
{'gender': 'female', 'age': 'age_2', 'class': 'second'}
{'gender': 'male', 'age': 'a

In [247]:
# Pair each true label with its prediction. The pairs are materialised as a
# list so they can be iterated more than once; on Python 3 a bare zip() is a
# one-shot iterator and re-running the counting cell would see no pairs.
prediction = list(zip(train_test_data.Survived, train_test_res))

In [248]:
# Number of (actual, predicted) pairs whose two values agree.
match_count = sum(1 for actual, predicted in prediction if actual == predicted)

In [250]:
# Fraction of verification rows whose prediction matches the true label.
# A single parenthesised argument prints the same text on Python 2 and is
# also valid on Python 3, where the bare print statement is a SyntaxError.
accuracy = match_count / float(len(train_test_data.PassengerId))
print("Ratio of correctly predicted rows to total rows for 30% training data " + str(accuracy))

Ratio of correctly predicted rows to total rows for 30% training data 0.811518324607


In [244]:
# Rows to predict for the output file; the first CSV line is the header.
test_data = pd.read_csv('../data/titanic/test.csv', header=0)

In [237]:
def find_age(age):
    """Map a numeric age to its bucket label.

    Buckets: 'age_1' for < 20, 'age_2' for [20, 40), 'age_3' for [40, 60),
    'age_4' for [60, 80), 'age_5' for >= 80. A value that satisfies none of
    the comparisons (NaN in particular) falls back to 'age_2'.
    """
    if age < 20:
        return 'age_1'
    # Half-open ranges checked in ascending order.
    for lower, upper, label in ((20, 40, 'age_2'), (40, 60, 'age_3'), (60, 80, 'age_4')):
        if age >= lower and age < upper:
            return label
    return 'age_5' if age >= 80 else 'age_2'

In [238]:
def find_class(pclass):
    """Map a numeric ticket class (1, 2 or 3) to 'first', 'second' or 'third'.

    Any other value, including NaN, is treated as 'third'.
    """
    for code, label in ((1, 'first'), (2, 'second'), (3, 'third')):
        if pclass == code:
            return label
    # Fallback for missing or unexpected values.
    return 'third'

In [239]:
# Predict survival (1) or not (0) for every test row by conditioning the
# network on the passenger's gender, age bucket and ticket class.
#
# Columns are read by name instead of by position in `.values`; the test file
# has a different column layout from the training file, which is why the
# positional indices previously had to differ between the two loops.
test_res = []
for sex, age_value, pclass in zip(test_data['Sex'],
                                  test_data['Age'],
                                  test_data['Pclass']):
    observations = {'gender': sex, 'age': find_age(age_value), 'class': find_class(pclass)}
    beliefs = network.forward_backward(observations)
    # beliefs[0] is read as the belief for the 'passenger' node, and
    # parameters[0] as its {'survive': p, 'perish': q} mapping.
    res = beliefs[0].parameters[0]
    test_res.append(1 if res['survive'] > res['perish'] else 0)


In [242]:
# Assemble the output frame: one row per test passenger with the predicted label.
# `index` holds the output column order. It previously contained the typo
# 'PassgengerId' and was never used; it now drives `columns` so the names are
# spelled in exactly one place.
index = ['PassengerId', 'Survived']
d = {'PassengerId': test_data.PassengerId, 'Survived': test_res}
df = pd.DataFrame(data=d, columns=index)

In [243]:
# Write the predictions to disk; index=False keeps the DataFrame's row index out of the file.
df.to_csv('../data/titanic/output_final.csv', index=False)