# Analysis of Titanic dataset prior to D3 Visualisation 

In [1]:
import pandas as pd
import csv

## Load data from CSV

In [57]:
titanic_df=pd.read_csv('titanic_data.csv')

## Data Cleaning 
- We realise that there are redundant columns in the dataframe which may not be useful for analysis
- Store a copy of the data frame before cleaning it. The columns which we feel not necessary now might be required later.
- The Ticket column here is redundant, as it doesn't really serve the need for any analysis
- The Cabin column also seems redundant

In [58]:
titanic_backup = titanic_df.copy()             
del titanic_df['Ticket']
del titanic_df['Cabin']

def state(st):
    if st==1:
        return "Survived"
    else:
        return "Dead"
titanic_df['State']=titanic_df['Survived'].apply(state)  

- The survival rate clearly tells that much more then 50% of the passengers (i.e, 61.6%) died during the tragedy. 

In [19]:
# A function to group the data in the data frame upon the specified column
def group_my_data(group_by,data_frame):
    total_count_for_group = data_frame.groupby(group_by).count()['PassengerId']
    total_survival = data_frame.groupby(group_by).sum()['Survived']
    return (total_survival,total_count_for_group)

## 1. Survival Rate
- Survival rate is total number of survivals/total number of passengers expressed in percentage.

In [79]:
sur,dead=group_my_data('State',titanic_df)
a=dead[0]
b=dead[1]
dead[0]=a*100/(a+b)
dead[1]=b*100/(a+b)
survival=pd.concat([dead],axis=1,keys=['Percentage'])
survival

Unnamed: 0_level_0,Percentage
State,Unnamed: 1_level_1
Dead,61
Survived,38


In [80]:
survival.to_csv('survival.csv', sep=',')

## 2 : Effect of Class

In [12]:
class_sur, class_tot = group_my_data('Pclass',titanic_df)
survival_wrt_class=(class_sur/class_tot)*100
class_dead=class_tot-class_sur
survival_wrt_class=pd.concat([class_sur,class_dead,class_tot,survival_wrt_class],axis=1,keys=['Survival Count','Death Count','Total Count','Percentage'])
survival_wrt_class

Unnamed: 0_level_0,Survival Count,Death Count,Total Count,Percentage
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,136,80,216,62.962963
2,87,97,184,47.282609
3,119,372,491,24.236253


In [13]:
survival_wrt_class.to_csv('survival_wrt_class.csv', sep=',')

- The results clearly specify that most of the 1st class passengers were saved.
- That wasn't the case with 2nd and 3rd class. Less than 50% of them were saved.

## 3 : Effect of Sex

In [14]:
sex_sur, sex_tot = group_my_data('Sex',titanic_df)
survival_wrt_sex=(sex_sur/sex_tot)*100
sex_dead=sex_tot-sex_sur
survival_wrt_sex=pd.concat([sex_sur,sex_dead,sex_tot,survival_wrt_sex],axis=1,keys=['Survival Count','Death Count','Total Count','Percentage'])
survival_wrt_sex

Unnamed: 0_level_0,Survival Count,Death Count,Total Count,Percentage
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,233,81,314,74.203822
male,109,468,577,18.890815


In [15]:
survival_wrt_sex.to_csv('survival_wrt_sex.csv', sep=',')

- The results imply that majority of the women were saved (around 3/4th of the women passengers were saved)
- Only 18.9% of the male passengers were saved

## 4 : Combined effect of Sex and Class on survival

In [16]:
total_sex_class_count = titanic_df.groupby(['Sex','Pclass']).count()['PassengerId']
sex_class_survival = titanic_df.groupby(['Sex','Pclass']).sum()['Survived']
sex_class_survival_rate = (sex_class_survival/total_sex_class_count)*100
sex_class_dead=total_sex_class_count-sex_class_survival
sex_class_survival_rate=pd.concat([sex_class_survival,sex_class_dead,total_sex_class_count,sex_class_survival_rate],axis=1,keys=['Survival Count','Death Count','Total Count','Percentage'])
sex_class_survival_rate

Unnamed: 0_level_0,Unnamed: 1_level_0,Survival Count,Death Count,Total Count,Percentage
Sex,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
female,1,91,3,94,96.808511
female,2,70,6,76,92.105263
female,3,72,72,144,50.0
male,1,45,77,122,36.885246
male,2,17,91,108,15.740741
male,3,47,300,347,13.544669


In [17]:
sex_class_survival_rate.to_csv('survival_wrt_sex_class.csv', sep=',')

- 6 pie charts explain the survival scenarios of male and female from different classes of the society. 
- Female from 1st, 2nd and 3rd class were saved on a greater ratio with respect to males from the corresponding class. 
- But as the class went from 1st to 3rd the survival rate of male/female in that class also came down

# 5. Survival based on Age Group

In [18]:
tit_temp=titanic_df.copy()

In [19]:
def age_group(age):
    if age<=15:
        return '0-15'
    elif age<=30:
        return '16-30'
    elif age<=45:
        return '31-45'
    elif age<=60:
        return '46-60'
    elif age>60:
        return '60+'
    else:
        return 'NA'

In [20]:
tit_temp['Age']=tit_temp['Age'].apply(age_group)
tit_temp=tit_temp.rename(columns={'Age':'Age_Group'})
age_sur,age_tot=group_my_data('Age_Group',tit_temp)
survival_wrt_age=(age_sur/age_tot)*100
age_dead=age_tot-age_sur
survival_wrt_age=pd.concat([age_sur,age_dead,age_tot,survival_wrt_age],axis=1,keys=['Survival Count','Death Count','Total Count','Percentage'])
survival_wrt_age

Unnamed: 0_level_0,Survival Count,Death Count,Total Count,Percentage
Age_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0-15,49,34,83,59.036145
16-30,117,209,326,35.889571
31-45,86,116,202,42.574257
46-60,33,48,81,40.740741
60+,5,17,22,22.727273
,52,125,177,29.378531


In [21]:
survival_wrt_age.to_csv('survival_wrt_age.csv', sep=',')

## Question 6 : Women and Children Survival

In [22]:
tit_temp['Women_Child_OR_Male']= 'Male'

In [23]:
for i in range(0,len(tit_temp)):
    if tit_temp.loc[i,'Sex']=='female' or tit_temp.loc[i,'Age_Group']=='Child':
        tit_temp.loc[i,'Women_Child_OR_Male']='Women_Child'

In [24]:
chi_fem_sur, chi_fem_tot=group_my_data('Women_Child_OR_Male',tit_temp)
survival_female_child=(chi_fem_sur/chi_fem_tot)*100
chi_fem_dead=chi_fem_tot-chi_fem_sur
survival_female_child=pd.concat([chi_fem_sur,chi_fem_dead,chi_fem_tot,survival_female_child],axis=1,keys=['Survival Count','Death Count','Total Count','Percentage'])

In [25]:
survival_female_child

Unnamed: 0_level_0,Survival Count,Death Count,Total Count,Percentage
Women_Child_OR_Male,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Male,109,468,577,18.890815
Women_Child,233,81,314,74.203822


In [26]:
survival_female_child.to_csv('survival_female_child.csv', sep=',')

- It is important that during any accident Women and Child first protocol is followed. 
- It was followed to a very great extent in titanic as 68.8% of the women/children were saved during the accident 
- This value is very high when comapred to only 16.6% of males (who aren't children)

## 7. Effect of Family Size

In [34]:
tit_temp['Family']=tit_temp['SibSp']+tit_temp['Parch']

In [35]:
family_sur,family_tot = group_my_data('Family',tit_temp)
survival_wrt_family_size=(family_sur/family_tot)*100
family_dead=family_tot-family_sur
survival_wrt_family_size=pd.concat([family_sur,family_dead,family_tot,survival_wrt_family_size],axis=1,keys=['Survival Count','Death Count','Total Count','Percentage'])

In [36]:
survival_wrt_family_size

Unnamed: 0_level_0,Survival Count,Death Count,Total Count,Percentage
Family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,163,374,537,30.353818
1,89,72,161,55.279503
2,59,43,102,57.843137
3,21,8,29,72.413793
4,3,12,15,20.0
5,3,19,22,13.636364
6,4,8,12,33.333333
7,0,6,6,0.0
10,0,7,7,0.0


In [37]:
survival_wrt_family_size.to_csv('survival_wrt_family_size.csv', sep=',')