In [None]:
import pandas as pd

# Columns fed to the model; `train` and `test` are narrowed to exactly these.
features = [
    "Sex",
    "Age_category",
    "Pclass",
    "Embarked_0",
    "Embarked_1",
    "Embarked_2",
    "family_cnt",
    "Age",
]

# Load the train/test CSVs from ./refine_data.
train = pd.read_csv('./refine_data/train.csv')
test = pd.read_csv('./refine_data/test.csv')

# Pull out the target before `train` is reduced to the feature columns.
label = train["Survived"]
train = train[features]
test = test[features]


# TODO
# - Try inferring age group and sex from the surname in the passenger name.
# - Fill null Age values with the mean.
# - Try adding the fare attribute; check whether it contains nulls and
#   whether it follows a normal distribution.


### Gradient Boosting Machine (LightGBM)

In [None]:
from lightgbm import LGBMClassifier

# Baseline LightGBM classifier: 100 estimators, fixed seed, everything else
# left at the library defaults.
model = LGBMClassifier(
    n_estimators=100,
    random_state=37,
)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the labelled rows for validation (fixed seed).
train_x, test_x, train_y, test_y = train_test_split(
    train,
    label,
    test_size=0.3,
    random_state=37,
)


In [None]:
# Fit the model on the training split; the %time magic reports how long the call takes.
%time model.fit(train_x,train_y)

In [None]:
# Predict labels for the held-out split.
y_predict_test_x = model.predict(test_x)

In [None]:
from sklearn import metrics

# Accuracy on the held-out split. scikit-learn's signature is
# accuracy_score(y_true, y_pred), so the true labels go first.
metrics.accuracy_score(test_y, y_predict_test_x)

In [None]:

# Predict labels for the `test` feature frame with the fitted model.
prediction_list = model.predict(test)

In [None]:
# Overwrite the sample submission's "Survived" column with the predictions,
# then save the result without the index column.
submission = pd.read_csv('data/sample_submission.csv').assign(Survived=prediction_list)
submission.to_csv('./second_submission.csv', index=False)

## Hyperparameter tuning

### Coarse search - find the hyperparameter ranges that produce the top 5 scores (hold-out validation)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
# Hold-out split used to score every sampled configuration.
train_x, test_x, train_y, test_y = train_test_split(
    train, label, test_size=0.3, random_state=37
)

n_estimators = 100
num_loop = 100
early_stopping_rounds = 20
coarse_hyperparameters_list = []

# Dedicated, seeded generator so the random search gives the same candidates
# on every run without touching NumPy's global random state.
rng = np.random.RandomState(37)

for loop in range(num_loop):
    # Sample one configuration from deliberately wide ranges.
    params = {
        'n_estimators': n_estimators,
        'num_leaves': rng.randint(2, 500),
        'max_bin': rng.randint(2, 500),
        'colsample_bytree': rng.uniform(low=0.1, high=1.0),
        'min_child_samples': rng.randint(2, 500),
        # Log-uniform: the exponent is drawn from [-10, 1).
        'learning_rate': 10 ** rng.uniform(low=-10, high=1),
        'subsample': rng.uniform(low=0.1, high=1.0),
        'subsample_freq': 1,
        # Was `class_type='balacned'`: an unknown, misspelled parameter, so
        # class weighting was never applied.
        'class_weight': 'balanced',
    }
    model = LGBMClassifier(random_state=37, **params)

    # NOTE(review): the `verbose` / `early_stopping_rounds` fit arguments were
    # removed in LightGBM 4.0; switch to callbacks when upgrading.
    # The same hold-out split drives early stopping and the score below, so
    # the reported accuracy is optimistic.
    model.fit(train_x, train_y,
              eval_set=[(test_x, test_y)],
              verbose=0,
              early_stopping_rounds=early_stopping_rounds
              )

    y_predict_test_x = model.predict(test_x)
    # accuracy_score expects (y_true, y_pred).
    score = metrics.accuracy_score(test_y, y_predict_test_x)

    # Record exactly the parameters the model was built with, plus its score.
    coarse_hyperparameters_list.append({'loop': loop, **params, 'score': score})

# Turn the records into a frame and show the five best-scoring configurations.
coarse_hyperparameters_list = pd.DataFrame(coarse_hyperparameters_list)
coarse_hyperparameters_list.sort_values(by='score', ascending=False).head()

## Finer search - narrowed ranges (hold-out validation; cross validation not yet implemented)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
# Hold-out split used to score every sampled configuration.
train_x, test_x, train_y, test_y = train_test_split(
    train, label, test_size=0.3, random_state=37
)

n_estimators = 100
num_loop = 100
early_stopping_rounds = 20
finer_hyperparameters_list = []

# Dedicated, seeded generator so the random search gives the same candidates
# on every run without touching NumPy's global random state.
rng = np.random.RandomState(37)

for loop in range(num_loop):
    # Sample one configuration from the narrowed ranges.
    params = {
        'n_estimators': n_estimators,
        'num_leaves': rng.randint(198, 462),
        'max_bin': rng.randint(70, 290),
        'colsample_bytree': rng.uniform(low=0.25, high=0.5),
        'min_child_samples': rng.randint(77, 146),
        # Sampled linearly here (not log-uniform).
        'learning_rate': rng.uniform(low=0.01, high=1.32),
        'subsample': rng.uniform(low=0.39, high=0.92),
        'subsample_freq': 1,
        # Was `class_type='balacned'`: an unknown, misspelled parameter, so
        # class weighting was never applied.
        'class_weight': 'balanced',
    }
    model = LGBMClassifier(random_state=37, **params)

    # NOTE(review): the `verbose` / `early_stopping_rounds` fit arguments were
    # removed in LightGBM 4.0; switch to callbacks when upgrading.
    # The same hold-out split drives early stopping and the score below, so
    # the reported accuracy is optimistic.
    model.fit(train_x, train_y,
              eval_set=[(test_x, test_y)],
              verbose=0,
              early_stopping_rounds=early_stopping_rounds
              )

    y_predict_test_x = model.predict(test_x)
    # accuracy_score expects (y_true, y_pred).
    score = metrics.accuracy_score(test_y, y_predict_test_x)

    # Record exactly the parameters the model was built with, plus its score.
    finer_hyperparameters_list.append({'loop': loop, **params, 'score': score})

# Turn the records into a frame and show the five best-scoring configurations.
finer_hyperparameters_list = pd.DataFrame(finer_hyperparameters_list)
finer_hyperparameters_list.sort_values(by='score', ascending=False).head()

In [None]:
# Final model with hand-copied hyperparameters.
# NOTE(review): these values were presumably taken from a search run in which
# the misspelled `class_type='balacned'` meant no class weighting was applied;
# re-run the search now that `class_weight` is set correctly — confirm.
model = LGBMClassifier(n_estimators=100,
                       random_state=37,
                       num_leaves=325,
                       max_bin=211,
                       colsample_bytree=0.416852,
                       min_child_samples=84,
                       learning_rate=1.022105,
                       subsample=0.906650,
                       subsample_freq=1,
                       # Was `class_type='balacned'`, an unknown parameter.
                       class_weight='balanced')

In [None]:
from sklearn import metrics
%time model.fit(train_x,train_y)
y_predict_test_x = model.predict(test_x)
score = metrics.accuracy_score(y_predict_test_x, test_y)
print(score)

In [None]:
# Re-predict with the current model. Without this, `prediction_list` still
# holds the predictions made before hyperparameter tuning, and this file would
# be identical to second_submission.csv.
prediction_list = model.predict(test)

submission = pd.read_csv('data/sample_submission.csv')
submission["Survived"] = prediction_list
submission.to_csv('./third_submission.csv', index=False)