## [ 타이타닉 생존자 보고서 분석 ]
- 등급, 연령, 나이에 따른 생존 확률 분석

In [135]:
# 파일 준비
import pandas as pd
file = '../DATA/titanic.csv'
titanicDF = pd.read_csv(file)
titanicDF

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


<hr>

##### 전처리

In [136]:
# 기본 정보 확인
titanicDF.info()   # survived -> bool, sex -> category

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          714 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     889 non-null    object 
 8   class        891 non-null    object 
 9   who          891 non-null    object 
 10  adult_male   891 non-null    bool   
 11  deck         203 non-null    object 
 12  embark_town  889 non-null    object 
 13  alive        891 non-null    object 
 14  alone        891 non-null    bool   
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB


In [137]:
# 데이터 타입 변경
# 1) 생존자 -> bool
titanicDF.survived = titanicDF.survived.astype(bool)
# 2) 성별 -> category
titanicDF.sex = titanicDF.sex.astype('category')
titanicDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    bool    
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    category
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    object  
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    object  
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(3), category(1), float64(2), int64(3), object(6)
memory usage: 80.3+ KB


In [138]:
# 중복값 제외 : survived, pclass, sex, age, sibsp, parch, fare, embark, adult_male(성인 남성)
titanicDF = pd.read_csv(file, usecols=["survived", "pclass", "sex", "age", "sibsp", "parch", "fare", "embarked", "adult_male"])
titanicDF

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,adult_male
0,0,3,male,22.0,1,0,7.2500,S,True
1,1,1,female,38.0,1,0,71.2833,C,False
2,1,3,female,26.0,0,0,7.9250,S,False
3,1,1,female,35.0,1,0,53.1000,S,False
4,0,3,male,35.0,0,0,8.0500,S,True
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,True
887,1,1,female,19.0,0,0,30.0000,S,False
888,0,3,female,,1,2,23.4500,S,False
889,1,1,male,26.0,0,0,30.0000,C,True


In [139]:
# 결측치 판단 / 제거
titanicDF.isnull().sum()  # 'deck' 항목의 null이 많음

# - 결측치가 절반 이상이면 고려하지 않음
titanicDF.dropna(thresh=(len(titanicDF.index)/2), axis='columns')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,adult_male
0,0,3,male,22.0,1,0,7.2500,S,True
1,1,1,female,38.0,1,0,71.2833,C,False
2,1,3,female,26.0,0,0,7.9250,S,False
3,1,1,female,35.0,1,0,53.1000,S,False
4,0,3,male,35.0,0,0,8.0500,S,True
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,True
887,1,1,female,19.0,0,0,30.0000,S,False
888,0,3,female,,1,2,23.4500,S,False
889,1,1,male,26.0,0,0,30.0000,C,True


<hr>

데이터 분석

In [140]:
# 특수 열 분석 : adult_male (성인 남성)
# -----------------------------------------------------------------
# 전체 생존률
surTotalSum = ((titanicDF.survived) & (titanicDF.shape[0])).sum()
totalSum = titanicDF.shape[0]
surRateTotal = round(surTotalSum / totalSum * 100)
# -----------------------------------------------------------------
# 전체 성인 남성의 수
adMaleSum = titanicDF.adult_male.sum()  # =537, 전체 891명 중 절반 이상
# 생존한 성인 남성의 수
surMaleSum = ((titanicDF.adult_male) & (titanicDF.survived)).sum() # =88
# 해당 비율
surRateAM = round(surMaleSum / adMaleSum * 100)
# -----------------------------------------------------------------

In [141]:
# -----------------------------------------------------------------
# 남자 아이의 수
#  - 성인의 기준 판단
titanicDF[ titanicDF.adult_male ].sort_values(by='age', na_position='last').head()  # 나이 최소값 : 16
titanicDF[(titanicDF.age <= 16) & (titanicDF.adult_male) ]  # 확인 결과 16.0 이상부터 성인
# 남자 아이 생존 비율
childMaleSum = ((titanicDF.age < 16) & (titanicDF.sex == 'male' )).sum()
surChildMaleSum = ((titanicDF.age < 16) & (titanicDF.sex == 'male' ) & titanicDF.survived).sum()
surRateChild = round(surChildMaleSum / childMaleSum *100)
# -----------------------------------------------------------------

In [142]:
# -----------------------------------------------------------------
# 여성의 생존 비율 구하기
FemaleSum = (titanicDF.sex == 'female').sum()
surFemaleSum = ((titanicDF.survived) & (titanicDF.sex == 'female')).sum()
surRateF = round(surFemaleSum / FemaleSum *100)
# ------------------------------------------------------------------

In [145]:
# 성별 별 생존률
surSexDF = pd.DataFrame(data=(
    [adMaleSum, childMaleSum, FemaleSum, totalSum], [surMaleSum, surChildMaleSum, surFemaleSum, surTotalSum], [surRateAM, surRateChild, surRateF, surRateTotal]), columns=['Adult Male', 'Child Male', 'Female', 'Total'], index=['전체 인원', '생존 인원', '생존 비율(%)']
    )
surSexDF

Unnamed: 0,Adult Male,Child Male,Female,Total
전체 인원,537,40,314,891
생존 인원,88,21,233,342
생존 비율(%),16,52,74,38


In [144]:
# 이렇게 mean() 값을 쓸 수도 있었습니다...
(titanicDF.survived).mean()

0.3838383838383838