In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [41]:
# Load the passenger table; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='titanic.csv')

In [42]:
# Preview the first five rows to confirm the file loaded with the expected columns.
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### Check the number of null values in each column


In [43]:
# Count missing entries per column (isnull is an alias of isna; axis=0 sums down each column).
df.isnull().sum(axis=0)

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

### Drop the deck column because most of its values (688 of 891) are NaN


In [44]:
# Remove `deck`: 688 of its values are missing, too many to impute sensibly.
# Rebinding `df` with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run; the in-place form raises KeyError on a second execution
# because the column is already gone.
df = df.drop(columns='deck', errors='ignore')

In [45]:
# Confirm that `deck` no longer appears among the columns.
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


# perform binning on fare column

In [47]:
# Bucket fares into five labelled ranges.
# pd.cut builds right-closed intervals, so with edges 0..500 a fare of exactly 0
# falls outside (0, 50] and any fare above 500 falls outside the last bin; both
# would silently become NaN. include_lowest=True closes the first interval at 0
# and the open-ended top edge keeps every large fare in 'very high'.
bins = [0, 50, 100, 200, 300, np.inf]
# Only bin while `fare` is still numeric: this cell overwrites the column with
# labels, and running pd.cut again on the labelled column would raise.
if pd.api.types.is_numeric_dtype(df['fare']):
    df['fare'] = pd.cut(
        df['fare'],
        bins=bins,
        labels=['very low', 'low', 'medium', 'high', 'very high'],
        include_lowest=True,
    )

In [48]:
# Check that `fare` now holds the bin labels instead of raw amounts.
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,very low,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,low,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,very low,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,low,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,very low,S,Third,man,True,Southampton,no,True


# fill the nan value of the age column with the mean age 

In [132]:
# Average of the known ages; reused by the following cell to fill the gaps.
mean = df['age'].mean()
mean

29.69911764705882

In [135]:
# Replace every missing age with the mean computed in the previous cell.
df['age'] = df['age'].fillna(mean)

# Create a new column named age_group with three groups (young, adult, old)

- young: age <= 20
- adult: 20 < age <= 50
- old: age > 50

In [136]:
# Age bands are right-closed: (0, 20] -> young, (20, 50] -> adult, (50, 100] -> old.
# NOTE(review): ages imputed with the mean (about 29.7) all land in 'adult',
# which inflates that group in every age_group statistic below.
bins = [0, 20, 50, 100]
labels = ['young', 'adult', 'old']
df['age_group'] = pd.cut(x=df['age'], bins=bins, labels=labels)

In [138]:
# Number of passengers that fall into each age band.
df.groupby('age_group')['age_group'].count()

age_group
young    179
adult    648
old       64
Name: age_group, dtype: int64

## use apply function to adult_male column to change it to numerical column

In [159]:
# Encode the flag as an integer: 1 where the value equals True, otherwise 0.
df['adult_male'] = df['adult_male'].apply(lambda is_adult_male: int(is_adult_male == True))


## Convert the alive column to numerical: 1 refers to yes and 0 refers to no

In [151]:
def convert(x):
    """Encode an ``alive`` label as an integer.

    Returns 0 when ``x`` equals the string 'no' and 1 for every other value.
    """
    return 0 if x == 'no' else 1


In [152]:
# Run every `alive` label through convert() so the column holds 0/1.
df['alive'] = df['alive'].apply(convert)

## Use the map function to convert the alone column to numerical values

- True = 1
- False = 0

In [154]:
# Translate the boolean flag through an explicit lookup table (False -> 0, True -> 1).
alone_codes = {False: 0, True: 1}
df['alone'] = df['alone'].map(alone_codes)

# Which age group is most likely to survive?

In [147]:
# Survival rate per age group. `survived` is coded 0/1, so the group mean is
# the share of survivors; this replaces dividing a filtered count by a total
# count, which needed two groupby passes over the frame.
# observed=False is spelled out so every age band stays in the result and the
# outcome does not depend on the pandas default for categorical groupers.
series = df.groupby('age_group', observed=False)['survived'].mean()
series

age_group
young    0.458101
adult    0.367284
old      0.343750
Name: survived, dtype: float64

In [148]:
# Label of the age group with the highest survival rate.
series.idxmax(axis=0)

'young'

## Which age_group contains the most females?

In [150]:
# Age group with the largest number of female passengers.
# idxmax returns the label of the maximum directly, so no full sort is needed,
# and it matches the idxmin idiom used for the "fewest females per class" question.
female_passengers = df[df['sex'] == 'female']
female_passengers.groupby('age_group')['age_group'].count().idxmax()

'adult'

# which class has most number of male ?

In [118]:
# Passenger class with the largest number of male passengers.
# idxmax returns the label of the maximum directly, so no full sort is needed,
# and it matches the idxmin idiom used for the "fewest females per class" question.
male_passengers = df[df['sex'] == 'male']
male_passengers.groupby('class')['class'].count().idxmax()

'Third'

# Which class has the fewest female passengers?

In [123]:
# Count female passengers per class, then pick the class with the smallest count.
women = df.loc[df['sex'] == 'female']
women_per_class = women.groupby('class')['class'].count()
women_per_class.idxmin()

'Second'

# Which class of passengers is most likely to survive?

In [109]:
# Survival rate per class = survivors in the class / all passengers in the class.
survived = df['survived'].eq(1)
total_people_in_each_class = df.groupby('class')['class'].count()
survived_in_each_class = df.loc[survived].groupby('class')['class'].count()
survival_rate_by_class = survived_in_each_class / total_people_in_each_class
# The class label whose survival rate is highest.
class_with_most_survival_rate = survival_rate_by_class.idxmax()
class_with_most_survival_rate

'First'

# what is the percentage of male and female

In [71]:
# Share of each sex among all passengers, as a percentage rounded to two decimals.
passengers_per_sex = df.groupby('sex')['sex'].count()
np.round(passengers_per_sex * 100 / len(df), 2)

sex
female    35.24
male      64.76
Name: sex, dtype: float64

## what is survival rate for male?

In [77]:
# Percentage of male passengers who survived, rounded to two decimals.
is_male = df['sex'] == 'male'
male_survivors = int((is_male & (df['survived'] == 1)).sum())
male_total = int(is_male.sum())
np.round((male_survivors * 100) / male_total, 2)

18.89

## what is survival rate for female?

In [80]:
# Percentage of female passengers who survived, rounded to two decimals.
is_female = df['sex'] == 'female'
female_survivors = int((is_female & (df['survived'] == 1)).sum())
female_total = int(is_female.sum())
np.round((female_survivors * 100) / female_total, 2)

74.2

# display overall survival and death rate for each gender 

In [87]:
# Within each sex, the percentage who died (0) and survived (1).
# The per-(sex, survived) counts are divided by the per-sex totals; pandas
# aligns the two Series on the shared `sex` index level.
count_by_sex_and_outcome = df.groupby(['sex', 'survived'])['sex'].count()
count_by_sex = df.groupby('sex')['sex'].count()
np.round(count_by_sex_and_outcome * 100 / count_by_sex, 2)

sex     survived
female  0           25.80
        1           74.20
male    0           81.11
        1           18.89
Name: sex, dtype: float64

In [94]:
# Raw passenger counts for every (class, survived) combination.
df.groupby(by=['class', 'survived'])['survived'].count()

class   survived
First   0            80
        1           136
Second  0            97
        1            87
Third   0           372
        1           119
Name: survived, dtype: int64

# How many male and female passengers are there in each pclass?

In [61]:
# Passenger counts for every (sex, pclass) combination.
df.groupby(by=['sex', 'pclass'])['sex'].count()

sex     pclass
female  1          94
        2          76
        3         144
male    1         122
        2         108
        3         347
Name: sex, dtype: int64