# `titanic.ipynb`
## 생존자

In [None]:
# -q (quiet) 옵션은 설치 로그를 안보여줌(조용히함)
%pip install -q seaborn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
# 타이타닉 데이터셋 불러오기
# (pandas에 기본 내장된 예제 데이터셋)
import seaborn as sns
import numpy as np
import pandas as pd

# titanic 데이터 DataFrame
titanic = sns.load_dataset('titanic')

# 처음 5개 행 보기
print("타이타닉 데이터 미리보기:")
print(titanic.head())

# 데이터셋 정보 확인
print("\n데이터셋 정보:")
print(titanic.info())

# 데이터 요약 통계
print("\n데이터 요약 통계:")
print(titanic.describe())

타이타닉 데이터 미리보기:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

데이터셋 정보:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0 

In [None]:
# col별 결측치 합계
print(titanic.isna().sum())

# 승객 등급 등장 빈도 (1, 2, 3 등석)
print(titanic['pclass'].value_counts().sort_index())

# 성별 빈도 (남/녀)
print(titanic['sex'].value_counts())

# 생존 여부 빈도 (생존자/사망자)
print(titanic['survived'].value_counts())


survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
pclass
1    216
2    184
3    491
Name: count, dtype: int64
sex
male      577
female    314
Name: count, dtype: int64
survived
0    549
1    342
Name: count, dtype: int64


In [None]:
# 성별에 따른 생존율
titanic.groupby('sex')['survived'].mean()
# 더 복잡한 통계 가능
titanic.groupby('sex').agg({'survived': ['mean']})

# 승객 등급에 따른 생존율
titanic.groupby('pclass')['survived'].mean()


sex
female    74.203822
male      18.890815
Name: survived, dtype: float64
        survived
            mean
sex             
female  0.742038
male    0.188908
pclass
1    62.962963
2    47.282609
3    24.236253
Name: survived, dtype: float64


In [None]:
# 성별 & 승객 등급에 따른 생존
titanic.groupby(['sex', 'pclass'])['survived'].mean()
# 피벗테이블
titanic.pivot_table(
    values='survived',
    index='sex',
    columns='pclass',
    aggfunc='mean'
)

# 나이 그룹별 생존율
titanic['age_group'] = pd.cut(
    titanic['age'],
    bins=[0, 12, 18, 35, 60, 100, 200],  # 초과 ~ 이하
    labels=['아동', '청소년', '청년', '중장년', '노년', '불사']
)

titanic.head()

#  obeserved 옵션 == 카테고리는 있는데, 해당되는 데이터가 없을 때 표시한다 / 안한다. 
titanic.groupby('age_group', observed=False)['survived'].mean()

# 성별 + 나이그룹으로 생존율 확인
titanic.groupby(['sex', 'age_group'])['survived'].mean()
# 피벗테이블
titanic.pivot_table(
    values='survived',
    index='sex',
    columns='age_group',
    aggfunc='mean'
)

sex     pclass
female  1         96.808511
        2         92.105263
        3         50.000000
male    1         36.885246
        2         15.740741
        3         13.544669
Name: survived, dtype: float64
age
0.42     100.0
0.67     100.0
0.75     100.0
0.83     100.0
0.92     100.0
         ...  
70.00      0.0
70.50      0.0
71.00      0.0
74.00      0.0
80.00    100.0
Name: survived, Length: 88, dtype: float64


In [25]:
titanic = sns.load_dataset('titanic')

# 결측치 확인
print(titanic.isna().sum())

missing = titanic.isna().sum()

# 결측 있는 항목만 확인
missing[missing > 0]

# 결측 비율
missing_p = titanic.isna().mean() * 100
missing_p[missing_p > 0]

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


age            19.865320
embarked        0.224467
deck           77.216611
embark_town     0.224467
dtype: float64

In [None]:
# 결측치 채우기
# age: 중요한 정보 : 평균/중앙값 대체
# embarked: 가장 많은 사람들이 탄 곳으로 대체
# deck: 결측치가 지배적이라서 데이터에서 의미를 찾아내기 힘듦 -> 삭제

# 필요하다면, 카피떠서 진행
# 카피코드:  titanic_processed = titanic.copy()
titanic_processed = titanic.copy()

# 비어있던 행 마스킹
mask = titanic['age'].isna()

# 결측치 채우기 (남녀상관없이 전체평균으로 채움)
mean_age = titanic['age'].mean()
titanic_processed['age'] = titanic['age'].fillna(mean_age)

titanic_processed

# 결측치 있는지 재확인
titanic_processed.isna().sum()

# 평균으로 채워진 값들만 확인
titanic_processed.loc[mask]


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
5,0,3,male,29.699118,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
17,1,2,male,29.699118,0,0,13.0000,S,Second,man,True,,Southampton,yes,True
19,1,3,female,29.699118,0,0,7.2250,C,Third,woman,False,,Cherbourg,yes,True
26,0,3,male,29.699118,0,0,7.2250,C,Third,man,True,,Cherbourg,no,True
28,1,3,female,29.699118,0,0,7.8792,Q,Third,woman,False,,Queenstown,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0,3,male,29.699118,0,0,7.2292,C,Third,man,True,,Cherbourg,no,True
863,0,3,female,29.699118,8,2,69.5500,S,Third,woman,False,,Southampton,no,False
868,0,3,male,29.699118,0,0,9.5000,S,Third,man,True,,Southampton,no,True
878,0,3,male,29.699118,0,0,7.8958,S,Third,man,True,,Southampton,no,True


In [33]:
# 전체평균으로 대체하는게 옳을까..?
# 성별/객실별로 평균을 추정하는건?

# 모든 사람들을 성별/객실 그룹의 평균 나이로 바꾼 Series
mean_ages = titanic.groupby(['sex', 'pclass'])['age'].transform('mean')

# titanic 의 age col 중 빈 값만, mean_ages로 채움.
titanic_processed['age'] = titanic['age'].fillna(mean_ages)

# 원래 비어있던 애들만 확인 
titanic_processed.loc[ titanic['age'].isna() ]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
5,0,3,male,26.507589,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
17,1,2,male,30.740707,0,0,13.0000,S,Second,man,True,,Southampton,yes,True
19,1,3,female,21.750000,0,0,7.2250,C,Third,woman,False,,Cherbourg,yes,True
26,0,3,male,26.507589,0,0,7.2250,C,Third,man,True,,Cherbourg,no,True
28,1,3,female,21.750000,0,0,7.8792,Q,Third,woman,False,,Queenstown,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0,3,male,26.507589,0,0,7.2292,C,Third,man,True,,Cherbourg,no,True
863,0,3,female,21.750000,8,2,69.5500,S,Third,woman,False,,Southampton,no,False
868,0,3,male,26.507589,0,0,9.5000,S,Third,man,True,,Southampton,no,True
878,0,3,male,26.507589,0,0,7.8958,S,Third,man,True,,Southampton,no,True


In [None]:
# embarked (탑승 도시) -> 최빈값(가장 많이 탄 곳)

# 비어있는 항구 개수
titanic['embarked'].isna().sum()

# 최빈값 (Series - 벡터)
mode_embarked = titanic['embarked'].mode()[0]  # Series 에서 1개 뽑기
# 빈 값은 최빈값으로 교체
titanic_processed['embarked'] = titanic['embarked'].fillna(mode_embarked)

# 처리 완료 확인 (빈값 0개)
titanic_processed['embarked'].isna().sum()

np.int64(2)

In [None]:
# deck 은 77%가 비어있음 -> 삭제
titanic['deck'].isna().mean() * 100  # 빈값 비율

# 'deck' 컬럼은 삭제  - inplace 는 실행하면, 2번째 실행 불가능
titanic_processed.drop('deck', axis=1, inplace=True)

KeyError: "['deck'] not found in axis"