### 타이타닉 생존자 예측

In [52]:
import numpy as np
import pandas as pd
import seaborn as sns

In [53]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


##### 1. 데이터 전처리

- Feature selection: 중복 필드 제외

In [54]:
# 중복제거 
# class       pclass
# embark_town     embarked
# alive       survived
# alone       sibsp
# adult_male

In [55]:
df = df[['survived','pclass','sex','age','sibsp','parch','fare','embarked','who','deck']]
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,deck
886,0,2,male,27.0,0,0,13.0,S,man,
887,1,1,female,19.0,0,0,30.0,S,woman,B
888,0,3,female,,1,2,23.45,S,woman,
889,1,1,male,26.0,0,0,30.0,C,man,C
890,0,3,male,32.0,0,0,7.75,Q,man,


- 결측치 처리

In [56]:
# 결측치 확인
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
who           0
deck        688
dtype: int64

In [57]:
# age는 평균으로 대체
# 단, who column이 결측치가 없으므로 어른은 어른나이 평균, 아동은 아동나이 평균으로 대체
adult = df[df.who.isin(['man','woman'])]  # who에 man,이랑 woman있는것만
child = df[df.who == 'child']
adult.shape, child.shape

((808, 10), (83, 10))

In [58]:
# 아동은 결측치가 없음
adult.age.isna().sum(), child.age.isna().sum()

(177, 0)

In [59]:
# 어른들의 나이로만 age의 결측치를 평균으로 대체 
df.age.fillna(adult.age.mean().round(1), inplace=True)
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,deck
886,0,2,male,27.0,0,0,13.0,S,man,
887,1,1,female,19.0,0,0,30.0,S,woman,B
888,0,3,female,32.8,1,2,23.45,S,woman,
889,1,1,male,26.0,0,0,30.0,C,man,C
890,0,3,male,32.0,0,0,7.75,Q,man,


In [60]:
# 아래와 같이 할 수도 있음
df[df.who.isin(['man','woman'])].age.fillna(df[df.who.isin(['man','woman'])].age.mean().round(1), inplace=True)
df[df.who == 'child'].age.fillna(df[df.who == 'child'].age.mean().round(1), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df.who.isin(['man','woman'])].age.fillna(df[df.who.isin(['man','woman'])].age.mean().round(1), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df.who == 'child'].age.fillna(df[df.who == 'child'].age.mean().round(1), inplace=True)


In [61]:
# embarked 결측치를 최빈값으로 대체
df.embarked.value_counts()
# S가 빈도수가 높아 결측치를 S로대체예정

embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [62]:
df.embarked.fillna('S', inplace=True)
df.embarked.isna().sum()

0

In [63]:
# deck 컬럼 삭제(결측치가 너무 많아서), who 컬럼도 삭제(age와 중복)
df.drop(columns=['deck','who'], inplace=True)

In [64]:
df.isna().sum().sum()

0

- 카테고리형 데이터를 숫자로 변환

In [65]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [66]:
df.sex = le.fit_transform(df.sex)
df.embarked = le.fit_transform(df.embarked)
df.tail(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
888,0,3,0,32.8,1,2,23.45,2
889,1,1,1,26.0,0,0,30.0,0
890,0,3,1,32.0,0,0,7.75,1


##### 2. 훈련/테스트 데이터로 분리

In [67]:
# data, 정답
X = df.iloc[:, 1:].values
y = df.survived.values

In [68]:
# train,test 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2023
)

##### 3. Random Forest로 학습

In [69]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=2023) # 'n_estimators': 100개를사용? / 중복을 허용한 무작위로100개를 추출?ㅋㅋㅋㅋ
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2023,
 'verbose': 0,
 'warm_start': False}

In [70]:
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.7821229050279329

##### 4. GridSearchCV로 수행

In [71]:
params = {
    'max_depth': [2, 5, 8],
    'min_samples_split': [2, 3, 4]
}

In [72]:
from sklearn.model_selection import GridSearchCV
grid_rf = GridSearchCV(rfc, params, scoring='accuracy', cv=5)
grid_rf.fit(X_train, y_train)

In [73]:
grid_rf.best_params_

{'max_depth': 8, 'min_samples_split': 2}

In [74]:
params = {
    'max_depth': [6,7,8,9,10],
    'min_samples_split': [2, 3, 4]
}
grid_rf = GridSearchCV(rfc, params, scoring='accuracy', cv=5)
grid_rf.fit(X_train, y_train)

In [75]:
grid_rf.best_params_

{'max_depth': 9, 'min_samples_split': 3}

In [76]:
grid_rf.best_score_

0.8299911356249383

In [77]:
best_rfc = grid_rf.best_estimator_
best_rfc.score(X_test, y_test)

0.8100558659217877

##### 4. 테스트 데이터 하나에 대해서 적용

In [78]:
test_data, pred = X_test[10], y_test[10]
test_data, pred

(array([ 2.    ,  1.    , 32.5   ,  1.    ,  0.    , 30.0708,  0.    ]), 0)

In [79]:
result = best_rfc.predict(test_data.reshape(1, -1))[0]
result

0

##### 5. Logistic Regression 모델

- 표준화

In [80]:
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)

In [81]:
X_train, X_test, y_train, y_test = train_test_split(
    X_std, y, stratify=y, test_size=0.2, random_state=2023
)

In [82]:
from sklearn.linear_model import LogisticRegression
lrc = LogisticRegression(random_state=2023)
lrc.fit(X_train, y_train)
lrc.score(X_test, y_test)

0.7486033519553073

- 정규화

In [83]:
from sklearn.preprocessing import MinMaxScaler
Xmm = MinMaxScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    Xmm, y, stratify=y, test_size=0.2, random_state=2023
)
lrc = LogisticRegression(random_state=2023)
lrc.fit(X_train, y_train)
lrc.score(X_test, y_test)

0.770949720670391

In [84]:
# RFC 0.7821
# Grid 0.8100
# LRC(std) 0.7486 std:standard 표준화
# LRC(mm) 0.7709  mm: min,max 정규화

##### 6. 엉터리 분류기
- 여성이면 생존이라 예측, 그 외는 사망

In [85]:
df.pivot_table('survived', 'sex')

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
0,0.742038
1,0.188908


In [86]:
X[:3]

array([[ 3.    ,  1.    , 22.    ,  1.    ,  0.    ,  7.25  ,  2.    ],
       [ 1.    ,  0.    , 38.    ,  1.    ,  0.    , 71.2833,  0.    ],
       [ 3.    ,  0.    , 26.    ,  0.    ,  0.    ,  7.925 ,  2.    ]])

In [87]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2023
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 7), (179, 7), (712,), (179,))

In [88]:
from sklearn.base import BaseEstimator
# 2번째필드가 성별
# BaseEstimator를 상속해서 MyClassifier 클래스를 만들고
# fit(), predict() method를 재정의(override)
class MyClassifier(BaseEstimator):
    def fit(self, X, y):
        pass
    def predict(self, X):
        pred = np.zeros(X.shape[0], int)
        for i in range(X.shape[0]):
            if X[i, 1] == 0.:           # 0 은 여성이면
                pred[i] = 1             # 생존으로 지정 # 성별필드가 여성이면 예측치를 생존으로 바꿔라
        return pred

In [89]:
my_clf = MyClassifier()
my_clf.fit(X_train, y_train)
pred_my = my_clf.predict(X_test)

In [90]:
y_test[:5], pred_my[:5]

(array([0, 1, 0, 0, 0], dtype=int64), array([0, 1, 0, 0, 0]))

In [91]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred_my)

0.7653631284916201

##### 7. 오차 행렬(Confusion matrix)

In [92]:
pred = best_rfc.predict(X_test)

In [93]:
from sklearn.metrics import confusion_matrix

In [94]:
# Best RFC로 예측한 값의 오차 행렬
confusion_matrix(y_test, pred)

array([[96, 14],
       [20, 49]], dtype=int64)

In [95]:
# 엉터리 분류기로 예측한 값의 오차 행렬
confusion_matrix(y_test, pred_my)

array([[88, 22],
       [20, 49]], dtype=int64)

In [96]:
# 정확도(accuracy) 비교 
accuracy_score(y_test, pred), accuracy_score(y_test, pred_my)

(0.8100558659217877, 0.7653631284916201)

In [97]:
# 정밀도, 재현율
from sklearn.metrics import precision_score, recall_score

In [98]:
# 정밀도(precision) 비교
precision_score(y_test, pred), precision_score(y_test, pred_my)

(0.7777777777777778, 0.6901408450704225)

In [99]:
# 재현율(recall) 비교
recall_score(y_test, pred), recall_score(y_test, pred_my)

(0.7101449275362319, 0.7101449275362319)

In [100]:
# F1 score 비교
from sklearn.metrics import f1_score
f1_score(y_test, pred), f1_score(y_test, pred_my)

(0.7424242424242424, 0.7)

In [101]:
# AUC(Area under ROC curve) 비교
from sklearn.metrics import roc_auc_score 
roc_auc_score(y_test, pred), roc_auc_score(y_test, pred_my)

(0.7914361001317525, 0.755072463768116)