In [71]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from bayes_opt import BayesianOptimization
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings(action='ignore') 

In [None]:
!pip install bayesian-optimization

## 데이터 불러오기

In [17]:
train_data = pd.read_csv("./titanic/train.csv")
test_data = pd.read_csv("./titanic/test.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 데이터셋 만들기

In [20]:
features = ["Pclass", "Sex", "SibSp", "Parch"]
x_train = train_data[features]
y_train = train_data["Survived"]
x_test = test_data[features]

In [21]:
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)

x_train['Sex'] = x_train['Sex'].map( {'female': 1, 'male': 0} )
x_test['Sex'] = x_test['Sex'].map( {'female': 1, 'male': 0} )

## hyper parameter tunning

#### GridSearchCV

``` python
GridSearchCV( 
    model,              # estimator
    param_grid=,        # 찾고자하는 파라미터. dict형식
    cv= 4 or KFold(4),  # 교차 검증
    scoring=None,       #  Classification : 'accuracy','f1'   Regression : 'neg_mean_squared_error','r2'...
    n_jobs= -1,         # 병렬 처리갯수? -1은 전부)
    refit=True          # default가 True. 좋은 estimator로 수정되어짐.
)   

GSmodel = GridSearchCV()
GSmodel.fit( )                 # 훈련 
GSmodel.best_params_           # 최적 파라미터
GSmodel.best_score_            # 최적일때 스코어
model=GSmodel.best_estimator_  # 최적의 파라미터일때 모델 
model.predict()                # 예측
```

In [58]:
rfmodel = RandomForestClassifier(random_state=1)

param_grid={ 'max_depth':[3, 4, 5],
             'n_estimators':[120, 130, 140],
            }

gscv=GridSearchCV(rfmodel, param_grid=param_grid, cv=4, scoring='accuracy')

gscv.fit(x_train,y_train)
print(gscv.best_params_)   # 최적의 파라미터 값 출력
print(gscv.best_score_)      # 최고의 점수

{'max_depth': 4, 'n_estimators': 117}
0.806957742495859


In [59]:
model=gscv.best_estimator_   # 최적 파라미터일때 모델
model.fit(x_train, y_train)
model.score(x_train, y_train)

0.8148148148148148

#### RandomizedSearchCV

In [60]:
rfmodel = RandomForestClassifier(random_state=1)

param_grid={ 'max_depth':[3, 4, 5],
             'n_estimators':[120, 130, 140],
            }

rcv=RandomizedSearchCV(rfmodel, param_distributions=param_grid, n_iter=100, cv=4, scoring='accuracy')

rcv.fit(x_train,y_train)
print(rcv.best_params_)   
print(rcv.best_score_)      

{'n_estimators': 120, 'max_depth': 4}
0.806957742495859


In [61]:
model=rcv.best_estimator_  
model.fit(x_train, y_train)
model.score(x_train, y_train)

0.8148148148148148

#### BayesianOptimization

In [85]:
rf_parameter_bounds = {
                      'max_depth' : (3,5),          # 범위 3~5
                      'n_estimators' : (120,140),
                      }

def rf_bo(max_depth, n_estimators):
    rf_params = {'max_depth' : int(round(max_depth)),
                 'n_estimators' : int(round(n_estimators)),      
                }
    rf = RandomForestClassifier(**rf_params)
    rf.fit(x_train,y_train)
    score = rf.score(x_train, y_train)
    return score

In [86]:
BO_rf = BayesianOptimization(f = rf_bo, pbounds = rf_parameter_bounds, random_state = 0)

In [87]:
BO_rf.maximize(init_points = 10, n_iter = 5)

|   iter    |  target   | max_depth | n_esti... |
-------------------------------------------------
| [0m 1       [0m | [0m 0.8148  [0m | [0m 4.098   [0m | [0m 134.3   [0m |
| [0m 2       [0m | [0m 0.8126  [0m | [0m 4.206   [0m | [0m 130.9   [0m |
| [0m 3       [0m | [0m 0.8126  [0m | [0m 3.847   [0m | [0m 132.9   [0m |
| [0m 4       [0m | [0m 0.8137  [0m | [0m 3.875   [0m | [0m 137.8   [0m |
| [95m 5       [0m | [95m 0.8159  [0m | [95m 4.927   [0m | [95m 127.7   [0m |
| [0m 6       [0m | [0m 0.8148  [0m | [0m 4.583   [0m | [0m 130.6   [0m |
| [0m 7       [0m | [0m 0.8137  [0m | [0m 4.136   [0m | [0m 138.5   [0m |
| [0m 8       [0m | [0m 0.8081  [0m | [0m 3.142   [0m | [0m 121.7   [0m |
| [0m 9       [0m | [0m 0.8058  [0m | [0m 3.04    [0m | [0m 136.7   [0m |
| [0m 10      [0m | [0m 0.8148  [0m | [0m 4.556   [0m | [0m 137.4   [0m |
| [0m 11      [0m | [0m 0.8148  [0m | [0m 4.858   [0m | [0m 128.8   

In [88]:
max_params = BO_rf.max['params']

max_params['max_depth'] = int(max_params['max_depth'])
max_params['n_estimators'] = int(max_params['n_estimators'])
max_params

{'max_depth': 4, 'n_estimators': 127}

In [89]:
model = RandomForestClassifier(**max_params)
model.fit(x_train, y_train)
model.score(x_train, y_train)

0.813692480359147

## 모델 학습

In [41]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=130, max_depth=3, random_state=1)
model.fit(x_train, y_train)
model.score(x_train, y_train)

0.8080808080808081

## 예측 및 제출

In [None]:
predictions = model.predict(x_test)
predictions

In [None]:
submit = pd.read_csv("./titanic/gender_submission.csv")
submit

In [None]:
submit['Survived'] = predictions
submit

In [None]:
submit.to_csv('submission1.csv', index=False)