# 타이타닉 생존자 예측

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the Titanic passenger table bundled with seaborn's example datasets
# and preview its first three rows.
df = sns.load_dataset(name='titanic')
df.head(n=3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [3]:
# Columns to exclude: class, who, adult_male, embark_town, alive, alone.
# NOTE(review): presumably dropped because they restate other columns -- in the
# preview above, class mirrors pclass, embark_town mirrors embarked and alive
# mirrors survived; confirm the same holds for who / adult_male / alone.

### 1. 데이터 전처리

- Feature selection

In [4]:
# Keep only the target (survived) and the columns used as candidate features.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells modify it directly instead of triggering pandas'
# SettingWithCopyWarning on a frame derived from the raw data.
selected_columns = ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch',
                    'fare', 'embarked', 'deck']
df = df[selected_columns].copy()
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,


- 결측치 처리

In [6]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
deck        688
dtype: int64

In [8]:
# Replace missing ages with the column mean.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on `df.age` operates on an intermediate object,
# which pandas warns about and which leaves `df` unchanged under Copy-on-Write.
# NOTE(review): the mean is computed over every row, including rows that later
# land in the test split; for a strict evaluation, derive it from the training
# rows only.
df['age'] = df['age'].fillna(df['age'].mean())
df['age'].isna().sum()

0

In [10]:
# embarked will be filled with its most frequent value, so inspect the
# frequency of each port code first.
df['embarked'].value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [12]:
# Fill missing embarkation ports with the most frequent port, computed from
# the data rather than hard-coded ('S' in the counts shown above).
# As with age, the result is assigned back to the column because an in-place
# fillna on `df.embarked` acts on an intermediate object.
most_common_port = df['embarked'].mode().iloc[0]
df['embarked'] = df['embarked'].fillna(most_common_port)
df['embarked'].isna().sum()

0

In [13]:
# Remove the deck column (688 of 891 values are missing per the count above).
# Rebinding `df` instead of dropping in place, together with errors='ignore',
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the column is already gone.
df = df.drop(columns=['deck'], errors='ignore')

In [14]:
# Total number of missing cells left in the frame (per-column counts summed).
remaining_missing = df.isna().sum().sum()
remaining_missing

0

- 카테고리 값인 sex, embarked 컬럼은 숫자로 변환

In [15]:
# Use LabelEncoder to convert categorical values into integer codes.
from sklearn.preprocessing import LabelEncoder # scikit-learn provides estimators and transformers; LabelEncoder is a transformer
# Encoder instance used by the encoding cell that follows.
le = LabelEncoder()

In [16]:
# LabelEncoder assigns integer codes starting at 0 in sorted (alphabetical)
# order of the labels, e.g. sex: female -> 0, male -> 1.
# sex gets its own encoder: refitting the shared `le` on embarked afterwards
# would overwrite the sex classes and make that mapping unrecoverable
# (no inverse_transform for sex). `le` still ends up fitted on embarked,
# exactly as before.
sex_encoder = LabelEncoder()
df['sex'] = sex_encoder.fit_transform(df['sex'])
df['embarked'] = le.fit_transform(df['embarked'])
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2


### 2. Train / Test dataset으로 분리

In [17]:
# Build the feature matrix X and target vector y as NumPy arrays.
# Features are selected by name (everything except the target) rather than by
# position, so the result does not depend on 'survived' being the first column.
# .to_numpy() is the recommended accessor for converting pandas data to arrays.
feature_frame = df.drop(columns=['survived'])
X = feature_frame.to_numpy()
y = df['survived'].to_numpy()
X.shape, y.shape

((891, 7), (891,))

In [18]:
# Class distribution of the target (number of rows per survived value).
df['survived'].value_counts()

0    549
1    342
Name: survived, dtype: int64

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=y requests a split that keeps
# the class proportions of y, and the fixed random_state makes it repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=2021)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((712, 7), (179, 7), (712,), (179,))

In [20]:
# Distinct labels in the training target and how often each occurs.
train_labels, train_label_counts = np.unique(y_train, return_counts=True)
train_labels, train_label_counts

(array([0, 1], dtype=int64), array([439, 273], dtype=int64))

In [22]:
# Stratification sanity check: if the training split keeps the overall class
# ratio, then pos_all / neg_all ~= pos_train / neg_train, i.e. the two
# cross-products printed below should be nearly equal.
# The counts are derived from y and y_train instead of being typed in by hand,
# so the check stays correct if the data, test_size or random_state changes.
pos_all, neg_all = (y == 1).sum(), (y == 0).sum()
pos_train, neg_train = (y_train == 1).sum(), (y_train == 0).sum()
print(pos_all * neg_train, neg_all * pos_train)

150138 149877


### 3. RandomForest 모델로 학습

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Create the classifier with a fixed seed and list its hyperparameters; only
# random_state is set here, and the listing shows e.g. 'n_estimators': 100.
forest_settings = {'random_state': 2021}
rfc = RandomForestClassifier(**forest_settings)
rfc.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2021,
 'verbose': 0,
 'warm_start': False}

In [25]:
# Train the random forest on the training split.
rfc.fit(X=X_train, y=y_train)

RandomForestClassifier(random_state=2021)

### 4. 모델 예측 및 평가


In [26]:
# Evaluate the fitted forest on the held-out test split (score() reports mean accuracy).
rfc.score(X=X_test, y=y_test)

0.8100558659217877