In [1]:
import pandas as pd

train = pd.read_csv('./refine_data/train.csv')
test = pd.read_csv('./refine_data/test.csv')

label = train["Survived"]
features =["Sex","Age_category","Pclass","Embarked_0","Embarked_1","Embarked_2","family_cnt","Age",
          "Initial_0","Initial_1","Initial_2","Initial_3","Initial_4"]
train = train[features]
test = test[features]
train.head()

Unnamed: 0,Sex,Age_category,Pclass,Embarked_0,Embarked_1,Embarked_2,family_cnt,Age,Initial_0,Initial_1,Initial_2,Initial_3,Initial_4
0,1,2,3,0,0,1,1,22.0,0,0,1,0,0
1,0,3,1,1,0,0,1,38.0,0,0,0,1,0
2,0,2,3,0,0,1,0,26.0,0,1,0,0,0
3,0,3,1,0,0,1,1,35.0,0,0,0,1,0
4,1,3,3,0,0,1,0,35.0,0,0,1,0,0


### Gradient Boost Machine (Light gbm )

In [2]:
from lightgbm import LGBMClassifier

model = LGBMClassifier(random_state=37,n_estimators=100)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y,test_y = train_test_split(train,label, test_size=0.3, random_state=37 )


In [4]:
%time model.fit(train_x,train_y)

CPU times: user 128 ms, sys: 6.49 ms, total: 135 ms
Wall time: 47.2 ms


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=37, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [5]:
y_predict_test_x = model.predict(test_x)

In [6]:
from sklearn import metrics
metrics.accuracy_score(y_predict_test_x, test_y)

0.8134328358208955

In [8]:

prediction_list = model.predict(test)

In [9]:
submission = pd.read_csv('data/sample_submission.csv')
submission["Survived"] = prediction_list
submission.to_csv('./second_submission.csv',index=False)

## Hyperparameter tuning

### coarse  Search  - 상위 5개의 score를 내는 하이퍼파라미터 구간들을 찾는다. Hold out 

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics

train_x, test_x, train_y,test_y = train_test_split(train,label, test_size=0.3, random_state=37 )

n_estimators = 150
num_loop =200
early_stopping_rounds = 20
coarse_hyperparameters_list = []

for loop in range(num_loop):
    num_leaves= np.random.randint(2,500)
    max_bin = np.random.randint(2,500) 
    min_child_samples = np.random.randint(2, 500)
    colsample_bytree =np.random.uniform(low= 0.1, high= 1.0)
    learning_rate = 10** np.random.uniform(low = -10, high =1) 
    subsample = np.random.uniform(low= 0.1, high= 1.0)
    model = LGBMClassifier(n_estimators = n_estimators,
                           random_state=37,
                           num_leaves=num_leaves,
                           max_bin=max_bin,
                           colsample_bytree=colsample_bytree,
                           min_child_samples=min_child_samples,
                           learning_rate=learning_rate,
                           subsample=subsample,
                           subsample_freq=1,
                           class_type = 'balacned'
                           )
    model.fit(train_x,train_y,
              eval_set = [(test_x,test_y)],
              verbose = 0,
              early_stopping_rounds = early_stopping_rounds
             )
    
    y_predict_test_x = model.predict(test_x)
    score = metrics.accuracy_score(y_predict_test_x, test_y)
    coarse_hyperparameters_list.append({
        'loop':loop,
        'n_estimators':n_estimators,
        'num_leaves':num_leaves,
        'max_bin':max_bin,
        'colsample_bytree':colsample_bytree,
        'min_child_samples':min_child_samples,
        'learning_rate':learning_rate,
        'subsample':subsample,
        'subsample_freq':1,
        'class_type':'balanced',
        'score': score
    })
coarse_hyperparameters_list = pd.DataFrame(coarse_hyperparameters_list)
coarse_hyperparameters_list.sort_values(by='score', ascending =False).head()

Unnamed: 0,class_type,colsample_bytree,learning_rate,loop,max_bin,min_child_samples,n_estimators,num_leaves,score,subsample,subsample_freq
80,balanced,0.35377,0.017667,80,492,9,150,102,0.828358,0.912096,1
0,balanced,0.790571,0.114064,0,99,42,150,485,0.824627,0.360555,1
152,balanced,0.718513,0.384621,152,352,75,150,257,0.824627,0.887949,1
31,balanced,0.200212,1.203762,31,155,190,150,55,0.809701,0.829286,1
185,balanced,0.308531,0.100283,185,162,79,150,58,0.794776,0.372801,1


## Finer Search - cross validation

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
train_x, test_x, train_y,test_y = train_test_split(train,label, test_size=0.3, random_state=37 )

n_estimators = 150
num_loop =200
early_stopping_rounds = 20
finer_hyperparameters_list = []

for loop in range(num_loop):
    num_leaves= np.random.randint(55,485)
    max_bin = np.random.randint(99,492) 
    min_child_samples = np.random.randint(9, 190)
    colsample_bytree =np.random.uniform(low= 0.200212, high=0.790571)
    learning_rate =  np.random.uniform(low = 0.017667, high =1.203762) 
    subsample = np.random.uniform(low= 0.360555, high= 0.912096)
    
    model = LGBMClassifier(n_estimators = n_estimators,
                           random_state=37,
                           num_leaves=num_leaves,
                           max_bin=max_bin,
                           colsample_bytree=colsample_bytree,
                           min_child_samples=min_child_samples,
                           learning_rate=learning_rate,
                           subsample=subsample,
                           subsample_freq=1,
                           class_type = 'balacned'
                           )
    
    model.fit(train_x,train_y,
              eval_set = [(test_x,test_y)],
              verbose = 0,
              early_stopping_rounds = early_stopping_rounds
             )
    
    y_predict_test_x = model.predict(test_x)
    score = metrics.accuracy_score(y_predict_test_x, test_y)
    finer_hyperparameters_list.append({
        'loop':loop,
        'n_estimators':n_estimators,
        'num_leaves':num_leaves,
        'max_bin':max_bin,
        'colsample_bytree':colsample_bytree,
        'min_child_samples':min_child_samples,
        'learning_rate':learning_rate,
        'subsample':subsample,
        'subsample_freq':1,
        'class_type':'balanced',
        'score': score
    })
finer_hyperparameters_list = pd.DataFrame(finer_hyperparameters_list)
finer_hyperparameters_list.sort_values(by='score', ascending =False).head()

Unnamed: 0,class_type,colsample_bytree,learning_rate,loop,max_bin,min_child_samples,n_estimators,num_leaves,score,subsample,subsample_freq
102,balanced,0.226141,0.535968,102,104,12,150,194,0.850746,0.766042,1
133,balanced,0.226118,0.267259,133,190,35,150,77,0.850746,0.722693,1
9,balanced,0.755244,0.529725,9,233,39,150,350,0.843284,0.848219,1
69,balanced,0.375256,0.126971,69,248,37,150,390,0.843284,0.890085,1
171,balanced,0.440286,0.444074,171,363,24,150,323,0.839552,0.620925,1


In [12]:
model = LGBMClassifier(n_estimators = 100,
                       random_state=37,
                       num_leaves=194,
                       max_bin=104,
                       colsample_bytree=0.226141,
                       min_child_samples=12,
                       learning_rate=0.535968,
                       subsample=0.766042,
                       subsample_freq=1,
                       class_type = 'balacned')

In [13]:
from sklearn import metrics
%time model.fit(train_x,train_y)
y_predict_test_x = model.predict(test_x)
score = metrics.accuracy_score(y_predict_test_x, test_y)
print(score)

CPU times: user 134 ms, sys: 8.09 ms, total: 142 ms
Wall time: 66.3 ms
0.8171641791044776


In [14]:
submission = pd.read_csv('data/sample_submission.csv')
predict_test = model.predict(test)
submission["Survived"] = predict_test
submission.to_csv('./third_submission.csv',index=False)