In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import xgboost
import lightgbm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, SparsePCA, MiniBatchSparsePCA, KernelPCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection._split import _CVIterableWrapper as cv_wrapper

In [2]:
def get_train_valid_x_y(train_df, fold, n_splits=5, random_state=519):
    """Split ``train_df`` into the train/validation parts of one K-fold split.

    A shuffled ``KFold`` is built and the ``fold``-th split it yields
    (1-based) is used. The defaults (5 splits, seed 519) are meant to follow
    how the data was split when training the resnet.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Feature columns plus an ``'is_iceberg'`` label column.
    fold : int
        1-based number of the split to return, ``1 <= fold <= n_splits``.
    n_splits : int, optional
        Number of K-fold splits.
    random_state : int, optional
        Seed for the shuffled K-fold.

    Returns
    -------
    tuple
        ``(_train_X, _train_y, _valid_X, _valid_y)``; the frames/series are
        re-indexed from 0.

    Raises
    ------
    ValueError
        If ``fold`` does not select one of the ``n_splits`` splits (the
        original silently returned ``None`` in that case), or if
        ``train_df`` has no ``'is_iceberg'`` column.
    """
    # Validate first so a bad fold number fails with a clear message instead
    # of an implicit None that breaks tuple-unpacking at the call site.
    if not 1 <= fold <= n_splits:
        raise ValueError(
            'fold must be between 1 and {}, got {!r}'.format(n_splits, fold))

    features = train_df.columns.tolist()
    features.remove('is_iceberg')

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for counter, (train_index, valid_index) in enumerate(kf.split(train_df), start=1):
        if counter != fold:
            continue

        # The indices from KFold are looked up with .loc, i.e. as index labels.
        # NOTE(review): presumably train_df's index is a permutation of
        # 0..n-1 so every label exists - confirm against the data files.
        train = train_df.loc[train_index].reset_index(drop=True)
        valid = train_df.loc[valid_index].reset_index(drop=True)

        return (train[features], train['is_iceberg'],
                valid[features], valid['is_iceberg'])

    # Reached only when `fold` is in range but matches no split (e.g. 1.5).
    raise ValueError('fold {!r} does not match any split'.format(fold))

In [3]:
# Stack every fold's train and valid rows into one frame, and record for each
# fold which row positions of the stacked frame are train and which are valid.
wrapper_list = []

fold_X_parts = []
fold_y_parts = []
row_offset = 0  # position of the current fold's first row in the stacked frame

for fold in range(1, 6):
    df = pd.read_json('Data/fine_tune_resnet_extract_features_fold_{}_40_avg_train.json'.format(fold))
    _train_X, _train_y, _valid_X, _valid_y = get_train_valid_x_y(df, fold)

    # Give all folds the same positional column names so the frames can be
    # concatenated column-for-column.
    f_num = _train_X.shape[1]
    columns = ['f_{}'.format(x) for x in range(f_num)]
    _train_X.columns = columns
    _valid_X.columns = columns

    train_size = _train_X.shape[0]
    valid_size = _valid_X.shape[0]
    print("Fold {} train size= {}".format(fold, train_size))

    # Offsets come from the actual split sizes rather than a hard-coded
    # per-file row count, so the index arrays always line up with the rows
    # appended below.
    train_end = row_offset + train_size
    valid_end = train_end + valid_size
    wrapper_list.append((np.arange(row_offset, train_end),
                         np.arange(train_end, valid_end)))
    row_offset = valid_end

    fold_X_parts.extend([_train_X, _valid_X])
    fold_y_parts.extend([_train_y, _valid_y])

all_train_X = pd.concat(fold_X_parts, ignore_index=True)
all_train_y = pd.concat(fold_y_parts, ignore_index=True)

print(all_train_X)
print(all_train_y)

Fold 1 train size= 1283
Fold 2 train size= 1283
Fold 3 train size= 1283
Fold 4 train size= 1283
Fold 5 train size= 1284
      f_0  f_1  f_2  f_3       f_4  f_5  f_6  f_7  f_8  f_9   ...     f_18423  \
0       0    0    0  0.0  0.000000    0    0    0    0    0   ...           0   
1       0    0    0  0.0  0.000000    0    0    0    0    0   ...           0   
2       0    0    0  0.0  0.000000    0    0    0    0    0   ...           0   
3       0    0    0  0.0  0.000000    0    0    0    0    0   ...           0   
4       0    0    0  0.0  0.000000    0    0    0    0    0   ...           0   
5       0    0    0  0.0  0.000000    0    0    0    0    0   ...           0   
6       0    0    0  0.0  0.000000    0    0    0    0    0   ...           0   
7       0    0    0  0.0  0.000000    0    0    0    0    0   ...           0   
8       0    0    0  0.0  0.000000    0    0    0    0    0   ...           0   
9       0    0    0  0.0  0.000000    0    0    0    0    0   ...     

In [4]:
# Count zero vs. non-zero entries in the first two stacked feature rows.
for row_pos in (0, 1):
    nonzero_mask = all_train_X.iloc[row_pos] != 0.0
    print(nonzero_mask.value_counts())

False    17252
True      1181
Name: 0, dtype: int64
False    17226
True      1207
Name: 1, dtype: int64


> the feature vectors are quite sparse

In [5]:
# Wrap the precomputed (train, valid) index pairs as a CV splitter.
# The list is passed as-is: the pairs have different lengths, so converting
# them with np.array() builds a ragged array, which NumPy >= 1.24 rejects
# with a ValueError.
cv = cv_wrapper(wrapper_list)

In [19]:
# Compare LightGBM boosting types on 20 PCA components, scored by log loss
# over the predefined fold splits in `cv`.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, n_estimators=100)
pca = PCA(whiten=False, random_state=0, n_components=20)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__boosting_type': ['dart', 'gbdt']
}

# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=cv, return_train_score=True)
gs.fit(all_train_X, all_train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)

{'lgb__boosting_type': 'dart'} -0.237673570569
   mean_train_score  mean_test_score  std_test_score  \
0         -0.083273        -0.237674        0.018258   
1         -0.041782        -0.268957        0.025622   

                           params  rank_test_score  
0  {'lgb__boosting_type': 'dart'}                1  
1  {'lgb__boosting_type': 'gbdt'}                2  


> using 'dart' is significantly better than 'gbdt'

In [24]:
# Grid-search the number of PCA components (100..600, randomized SVD, no
# whitening) in front of a DART LightGBM classifier.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, n_estimators=100, boosting_type='dart')
pca = PCA(whiten=False, svd_solver='randomized', random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'pca__n_components': list(range(100, 700, 100))
}

# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=cv, return_train_score=True)
gs.fit(all_train_X, all_train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)
print(cv_df['params'].tolist())

{'pca__n_components': 100} -0.242447315818
   mean_train_score  mean_test_score  std_test_score  \
0         -0.070600        -0.242447        0.020966   
5         -0.060977        -0.242662        0.019822   
3         -0.062777        -0.244402        0.024463   
4         -0.062040        -0.244457        0.022597   
1         -0.066607        -0.246123        0.022009   
2         -0.064654        -0.247003        0.022021   

                       params  rank_test_score  
0  {'pca__n_components': 100}                1  
5  {'pca__n_components': 600}                2  
3  {'pca__n_components': 400}                3  
4  {'pca__n_components': 500}                4  
1  {'pca__n_components': 200}                5  
2  {'pca__n_components': 300}                6  
[{'pca__n_components': 100}, {'pca__n_components': 600}, {'pca__n_components': 400}, {'pca__n_components': 500}, {'pca__n_components': 200}, {'pca__n_components': 300}]


In [25]:
# Grid-search the number of whitened PCA components (560..640, randomized
# SVD) in front of a DART LightGBM classifier.
# The recorded run of this cell was interrupted (KeyboardInterrupt), so it has
# no results.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, n_estimators=100, boosting_type='dart')
pca = PCA(whiten=True, random_state=0, svd_solver='randomized')
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'pca__n_components': list(range(560, 660, 20))
}

# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=cv, return_train_score=True)
gs.fit(all_train_X, all_train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)

KeyboardInterrupt: 

In [15]:
# Joint grid over PCA size, number of trees and learning rate for the
# whitened-PCA + DART LightGBM pipeline.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, n_estimators=100, boosting_type='dart')
pca = PCA(whiten=True, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'pca__n_components': [20, 390],
    'lgb__n_estimators': [100, 200, 400, 800],
    'lgb__learning_rate': [0.1, 0.05, 0.01, 0.005]
}

print(type(all_train_X))
print(type(all_train_y))
# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=cv, return_train_score=True)
gs.fit(all_train_X, all_train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
{'lgb__learning_rate': 0.1, 'lgb__n_estimators': 100, 'pca__n_components': 20} -0.237673570569
    mean_train_score  mean_test_score  std_test_score  \
0          -0.083273        -0.237674        0.018258   
10         -0.100669        -0.238533        0.015032   
1          -0.062366        -0.239444        0.024388   
11         -0.082531        -0.241907        0.020933   
13         -0.044026        -0.244498        0.027488   
12         -0.067270        -0.246748        0.019540   
2          -0.061771        -0.249235        0.021508   
3          -0.038066        -0.251301        0.030389   
22         -0.135910        -0.251768        0.013660   
23         -0.121264        -0.253195        0.017965   
4          -0.047717        -0.262772        0.025020   
8          -0.158429        -0.263561        0.013576   
9          -0.145989        -0.264922        0.016823   
14         -0.046328        -0.26

In [17]:
# Grid-search tree complexity (num_leaves, min_split_gain) with 20 whitened
# PCA components and the DART settings fixed.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0,
                                 n_estimators=100, learning_rate=.1,
                                 boosting_type='dart')
pca = PCA(whiten=True, random_state=0, n_components=20)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__num_leaves': [2, 3, 7, 15, 31, 63],
    'lgb__min_split_gain': [.0, .1, .2]
}

# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=cv, return_train_score=True)
gs.fit(all_train_X, all_train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)

{'lgb__min_split_gain': 0.0, 'lgb__num_leaves': 3} -0.234742360186
    mean_train_score  mean_test_score  std_test_score  \
1          -0.122813        -0.234742        0.019021   
7          -0.122813        -0.234742        0.019021   
13         -0.122813        -0.234742        0.019021   
16         -0.083814        -0.236455        0.017195   
17         -0.083814        -0.236455        0.017195   
15         -0.083712        -0.237609        0.017668   
2          -0.093388        -0.237648        0.015024   
8          -0.093388        -0.237648        0.015024   
4          -0.083273        -0.237674        0.018258   
5          -0.083273        -0.237674        0.018258   
9          -0.083378        -0.237865        0.016891   
14         -0.093289        -0.237944        0.015226   
0          -0.136510        -0.238243        0.020698   
6          -0.136510        -0.238243        0.020698   
12         -0.136510        -0.238243        0.020698   
3          -0.083144 

In [18]:
# Grid-search row/column subsampling (and two PCA sizes) for the DART model.
# subsample_freq=1 makes the `subsample` grid take effect: recent LightGBM
# releases default subsample_freq to 0, which disables row subsampling.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0,
                                 n_estimators=100, learning_rate=.1,
                                 boosting_type='dart',
                                 num_leaves=3, min_split_gain=.0,
                                 subsample_freq=1)
pca = PCA(whiten=True, random_state=0, n_components=20)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'pca__n_components': [20, 390],
    'lgb__subsample': [1., .8, .6, .4, .2],
    'lgb__colsample_bytree': [1., .8, .6, .4, .2]
}

# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=cv, return_train_score=True)
gs.fit(all_train_X, all_train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)

{'lgb__colsample_bytree': 1.0, 'lgb__subsample': 1.0, 'pca__n_components': 390} -0.233943722205
    mean_train_score  mean_test_score  std_test_score  \
1          -0.120685        -0.233944        0.019673   
4          -0.121353        -0.234732        0.020764   
0          -0.122813        -0.234742        0.019021   
6          -0.121549        -0.235431        0.021394   
2          -0.121794        -0.236555        0.019110   
5          -0.116448        -0.236735        0.021765   
7          -0.116967        -0.236866        0.020460   
3          -0.118194        -0.237633        0.020904   
8          -0.130447        -0.238985        0.022267   
11         -0.127287        -0.239019        0.020614   
15         -0.125580        -0.239502        0.019693   
14         -0.131172        -0.239630        0.019359   
18         -0.136069        -0.239760        0.021522   
19         -0.132607        -0.240193        0.020267   
13         -0.126230        -0.240356        0.01

## Fold 1

In [4]:
# Load the fold-1 feature file and separate the label column from the rest.
df = pd.read_json('Data/fine_tune_resnet_extract_features_fold_1_1_train_after_pool.json')
features = list(df.columns)
features.remove('is_iceberg')

train_X = df[features]
train_y = df['is_iceberg']
print(train_X.shape, train_y.shape)
# Peek at the first ten values of one feature column and of the labels.
print(train_X['f_1_2048'].iloc[:10], train_y.iloc[:10])

(1604, 2049) (1604,)
0      -0.264428
1      -0.090121
10     -0.285586
100    -0.180120
1000    0.038582
1001   -0.144716
1002   -0.065031
1003   -0.223179
1004   -0.157804
1005   -0.111200
Name: f_1_2048, dtype: float64 0       0
1       0
10      1
100     0
1000    0
1001    1
1002    0
1003    1
1004    1
1005    1
Name: is_iceberg, dtype: int64


In [5]:
# Take split 1 of the fold-1 file: the train part is used for the grid
# searches below, the valid part for the final hold-out evaluation.
_train_X, _train_y, _valid_X, _valid_y = get_train_valid_x_y(df, 1)
print(_train_X.shape, _train_y.shape, _valid_X.shape, _valid_y.shape)

# `kfold` is the CV splitter handed to every GridSearchCV in the per-fold
# sections, but no cell defines it, so a fresh "Restart & Run All" stops with
# a NameError. Define it here, before its first use.
# NOTE(review): the original definition is not in the notebook; these
# settings are an assumption, so the recorded scores may not reproduce
# exactly - confirm or replace with the intended splitter.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

(1283, 2049) (1283,) (321, 2049) (321,)


In [7]:
# Fold 1: grid-search the number of whitened PCA components (30..480).
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, n_estimators=100)
pca = PCA(whiten=True, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'pca__n_components': list(range(30, 530, 50))
}

# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)

{'pca__n_components': 280} -0.172237341637
   mean_train_score  mean_test_score  std_test_score  \
5         -0.095155        -0.172237        0.044862   
0         -0.043552        -0.173203        0.045214   
3         -0.065541        -0.175031        0.046439   
2         -0.040848        -0.176403        0.046067   
7         -0.110812        -0.176792        0.048987   
9         -0.123473        -0.178733        0.045590   
6         -0.110137        -0.181626        0.048134   
4         -0.087154        -0.181970        0.052726   
1         -0.033689        -0.188007        0.042153   
8         -0.124659        -0.190243        0.059148   

                       params  rank_test_score  
5  {'pca__n_components': 280}                1  
0   {'pca__n_components': 30}                2  
3  {'pca__n_components': 180}                3  
2  {'pca__n_components': 130}                4  
7  {'pca__n_components': 380}                5  
9  {'pca__n_components': 480}                6

In [12]:
# Fold 1: finer grid over the number of PCA components (10..300), this time
# with PCA's default (non-whitened) output.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, n_estimators=100)
pca = PCA(random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'pca__n_components': list(range(10, 310, 10))
}

# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)

{'pca__n_components': 10} -0.170613010669
    mean_train_score  mean_test_score  std_test_score  \
0          -0.064050        -0.170613        0.044621   
2          -0.043552        -0.173203        0.045214   
1          -0.049473        -0.176347        0.047961   
9          -0.031253        -0.176864        0.040942   
3          -0.039764        -0.177184        0.046765   
11         -0.030647        -0.177651        0.046515   
21         -0.030648        -0.177651        0.046515   
29         -0.030647        -0.177651        0.046515   
10         -0.030948        -0.178991        0.043845   
12         -0.030671        -0.179267        0.047664   
13         -0.030671        -0.179267        0.047664   
14         -0.030671        -0.179267        0.047664   
15         -0.030671        -0.179267        0.047664   
16         -0.030671        -0.179267        0.047664   
17         -0.030671        -0.179267        0.047664   
18         -0.030671        -0.179267        0

In [25]:
# Fold 1: compare LightGBM boosting types on 330 whitened PCA components.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, n_estimators=100)
pca = PCA(whiten=True, n_components=330, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__boosting_type': ['dart', 'gbdt']
}

# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)

{'lgb__boosting_type': 'gbdt'} -0.160621793777
   mean_train_score  mean_test_score  std_test_score  \
1         -0.024836        -0.160622        0.027537   
0         -0.072585        -0.168783        0.021674   

                           params  rank_test_score  
1  {'lgb__boosting_type': 'gbdt'}                1  
0  {'lgb__boosting_type': 'dart'}                2  


In [26]:
# Fold 1: joint grid over number of trees and learning rate for the GBDT model.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, n_estimators=100, boosting_type='gbdt')
pca = PCA(whiten=True, n_components=330, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__n_estimators': [100, 200, 400, 800],
    'lgb__learning_rate': [0.1, 0.05, 0.01, 0.005]
}

# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)

{'lgb__learning_rate': 0.05, 'lgb__n_estimators': 100} -0.1603134502
    mean_train_score  mean_test_score  std_test_score  \
4          -0.048378        -0.160313        0.026541   
0          -0.024836        -0.160622        0.027537   
10         -0.061168        -0.160849        0.024615   
15         -0.061251        -0.160862        0.024726   
11         -0.030752        -0.161920        0.027969   
5          -0.025015        -0.165092        0.028876   
1          -0.015752        -0.171140        0.031889   
6          -0.015882        -0.173989        0.033041   
2          -0.010933        -0.184134        0.036497   
3          -0.010671        -0.185266        0.036464   
7          -0.011079        -0.186781        0.038424   
14         -0.133554        -0.197044        0.017977   
9          -0.133449        -0.197055        0.017894   
8          -0.259127        -0.298071        0.009957   
13         -0.259418        -0.298097        0.009948   
12         -0.40150

In [29]:
# List the parameter settings in the ranked order of the results table.
ranked_params = list(cv_df['params'])
print(ranked_params)

[{'lgb__learning_rate': 0.05, 'lgb__n_estimators': 100}, {'lgb__learning_rate': 0.1, 'lgb__n_estimators': 100}, {'lgb__learning_rate': 0.01, 'lgb__n_estimators': 400}, {'lgb__learning_rate': 0.005, 'lgb__n_estimators': 800}, {'lgb__learning_rate': 0.01, 'lgb__n_estimators': 800}, {'lgb__learning_rate': 0.05, 'lgb__n_estimators': 200}, {'lgb__learning_rate': 0.1, 'lgb__n_estimators': 200}, {'lgb__learning_rate': 0.05, 'lgb__n_estimators': 400}, {'lgb__learning_rate': 0.1, 'lgb__n_estimators': 400}, {'lgb__learning_rate': 0.1, 'lgb__n_estimators': 800}, {'lgb__learning_rate': 0.05, 'lgb__n_estimators': 800}, {'lgb__learning_rate': 0.005, 'lgb__n_estimators': 400}, {'lgb__learning_rate': 0.01, 'lgb__n_estimators': 200}, {'lgb__learning_rate': 0.01, 'lgb__n_estimators': 100}, {'lgb__learning_rate': 0.005, 'lgb__n_estimators': 200}, {'lgb__learning_rate': 0.005, 'lgb__n_estimators': 100}]


In [32]:
# Fold 1: grid-search tree complexity (num_leaves, min_child_samples) with
# 400 trees at learning rate 0.01.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0,
                                 n_estimators=400, learning_rate=0.01,
                                 boosting_type='gbdt')
pca = PCA(whiten=True, n_components=330, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__num_leaves': [2, 3, 7, 15, 31, 63],
    'lgb__min_child_samples': [1, 2, 3, 5, 10, 20]
}

# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)
print(cv_df['params'].tolist())

{'lgb__min_child_samples': 1, 'lgb__num_leaves': 15} -0.160796978924
    mean_train_score  mean_test_score  std_test_score  \
3          -0.061306        -0.160797        0.024892   
9          -0.061306        -0.160797        0.024892   
15         -0.061306        -0.160797        0.024892   
21         -0.061306        -0.160797        0.024892   
27         -0.061306        -0.160797        0.024892   
33         -0.061306        -0.160797        0.024892   
4          -0.061168        -0.160849        0.024615   
10         -0.061168        -0.160849        0.024615   
16         -0.061168        -0.160849        0.024615   
22         -0.061168        -0.160849        0.024615   
28         -0.061168        -0.160849        0.024615   
34         -0.061168        -0.160849        0.024615   
5          -0.061240        -0.160872        0.024788   
11         -0.061240        -0.160872        0.024788   
17         -0.061240        -0.160872        0.024788   
23         -0.06124

In [34]:
# Fold 1: grid-search min_split_gain with the other tree settings fixed.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0,
                                 n_estimators=400, learning_rate=0.01,
                                 boosting_type='gbdt',
                                 num_leaves=15, min_child_samples=20)
pca = PCA(whiten=True, n_components=330, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__min_split_gain': [.0, .1, .2]
}

# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)
print(cv_df['params'].tolist())

{'lgb__min_split_gain': 0.0} -0.160796978924
   mean_train_score  mean_test_score  std_test_score  \
0         -0.061306        -0.160797        0.024892   
2         -0.061640        -0.160943        0.025345   
1         -0.061413        -0.161229        0.025040   

                         params  rank_test_score  
0  {'lgb__min_split_gain': 0.0}                1  
2  {'lgb__min_split_gain': 0.2}                2  
1  {'lgb__min_split_gain': 0.1}                3  
[{'lgb__min_split_gain': 0.0}, {'lgb__min_split_gain': 0.2}, {'lgb__min_split_gain': 0.1}]


In [37]:
# Fold 1: coarse grid over row/column subsampling.
# subsample_freq=1 makes the `subsample` grid take effect: recent LightGBM
# releases default subsample_freq to 0, which disables row subsampling.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0,
                                 n_estimators=400, learning_rate=0.01,
                                 boosting_type='gbdt',
                                 num_leaves=15, min_child_samples=20,
                                 subsample_freq=1)
pca = PCA(whiten=True, n_components=330, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__subsample': [1., .8, .6, .4, .2],
    'lgb__colsample_bytree': [1., .8, .6, .4, .2]
}

# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)
print(cv_df['params'].tolist())

{'lgb__colsample_bytree': 1.0, 'lgb__subsample': 0.6} -0.160344603287
    mean_train_score  mean_test_score  std_test_score  \
2          -0.086357        -0.160345        0.022548   
1          -0.070841        -0.160618        0.022685   
3          -0.108941        -0.160673        0.020157   
0          -0.061306        -0.160797        0.024892   
6          -0.073826        -0.161705        0.020773   
5          -0.063521        -0.162729        0.022228   
7          -0.089313        -0.162748        0.020533   
8          -0.111012        -0.164228        0.019936   
4          -0.143743        -0.168632        0.021926   
11         -0.080486        -0.169848        0.017525   
10         -0.070532        -0.170140        0.018729   
9          -0.144861        -0.171756        0.020095   
12         -0.095896        -0.172152        0.017129   
13         -0.118138        -0.174575        0.018358   
14         -0.148733        -0.178606        0.018206   
15         -0.0815

In [38]:
# Fold 1: finer grid over row/column subsampling (subsample 0.45..0.75,
# colsample_bytree 0.85..1.0).
# subsample_freq=1 makes the `subsample` grid take effect: recent LightGBM
# releases default subsample_freq to 0, which disables row subsampling.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0,
                                 n_estimators=400, learning_rate=0.01,
                                 boosting_type='gbdt',
                                 num_leaves=15, min_child_samples=20,
                                 subsample_freq=1)
pca = PCA(whiten=True, n_components=330, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__subsample': [.75, .7, .65, .6, .55, .5, .45],
    'lgb__colsample_bytree': [1., .95, .9, .85]
}

# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)
print(cv_df['params'].tolist())

{'lgb__colsample_bytree': 1.0, 'lgb__subsample': 0.7} -0.160154755243
    mean_train_score  mean_test_score  std_test_score  \
1          -0.078058        -0.160155        0.023146   
3          -0.086357        -0.160345        0.022548   
2          -0.081982        -0.160623        0.022611   
7          -0.074940        -0.160795        0.023629   
4          -0.091300        -0.160840        0.021938   
10         -0.087123        -0.160857        0.021523   
11         -0.091537        -0.160861        0.021743   
6          -0.102463        -0.160957        0.020656   
9          -0.082694        -0.161088        0.022387   
8          -0.078788        -0.161225        0.022055   
0          -0.074397        -0.161225        0.023893   
17         -0.087771        -0.161303        0.022078   
12         -0.096996        -0.161382        0.020816   
18         -0.092080        -0.161513        0.021177   
5          -0.096489        -0.161550        0.021475   
25         -0.0934

In [40]:
# Fold 1: coarse grid over L1/L2 regularization (reg_alpha, reg_lambda).
# subsample_freq=1 keeps subsample=.6 effective: recent LightGBM releases
# default subsample_freq to 0, which disables row subsampling.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0,
                                 n_estimators=400, learning_rate=0.01,
                                 boosting_type='gbdt',
                                 num_leaves=15, min_child_samples=20,
                                 colsample_bytree=1., subsample=.6,
                                 subsample_freq=1)
pca = PCA(whiten=True, n_components=330, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__reg_alpha': [1., .8, .6, .4, .2, .1, .0],
    'lgb__reg_lambda': [1., .8, .6, .4, .2, .1, .0]
}

# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)
print(cv_df['params'].tolist())

{'lgb__reg_alpha': 0.0, 'lgb__reg_lambda': 0.0} -0.160344603287
    mean_train_score  mean_test_score  std_test_score  \
48         -0.086357        -0.160345        0.022548   
40         -0.088826        -0.160508        0.021592   
47         -0.087577        -0.160535        0.022129   
41         -0.087964        -0.160651        0.022009   
34         -0.088987        -0.160708        0.021541   
46         -0.088639        -0.160731        0.021691   
39         -0.089770        -0.161092        0.021495   
32         -0.090940        -0.161259        0.021701   
45         -0.090636        -0.161381        0.021758   
33         -0.089849        -0.161530        0.021834   
27         -0.091310        -0.161579        0.022265   
26         -0.092146        -0.161636        0.022425   
25         -0.093018        -0.161818        0.021976   
20         -0.093303        -0.161911        0.022097   
31         -0.092631        -0.162146        0.021785   
30         -0.094272    

In [59]:
# Fold 1: finer grid over L1/L2 regularization in the 0..0.075 range.
# subsample_freq=1 keeps subsample=.6 effective: recent LightGBM releases
# default subsample_freq to 0, which disables row subsampling.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0,
                                 n_estimators=400, learning_rate=0.01,
                                 boosting_type='gbdt',
                                 num_leaves=15, min_child_samples=20,
                                 colsample_bytree=1., subsample=.6,
                                 subsample_freq=1)
pca = PCA(whiten=True, n_components=330, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__reg_alpha': [.075, .05, .025, .0],
    'lgb__reg_lambda': [.075, .05, .025, .0]
}

# return_train_score=True is required for the `mean_train_score` column used
# below; scikit-learn >= 0.21 no longer computes train scores by default.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_, gs.best_score_)
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
# Sort into a new frame instead of mutating a column slice in place.
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)
print(cv_df['params'].tolist())

{'lgb__reg_alpha': 0.025, 'lgb__reg_lambda': 0.025} -0.159919542038
    mean_train_score  mean_test_score  std_test_score  \
10         -0.087096        -0.159920        0.022321   
7          -0.087075        -0.159978        0.022171   
3          -0.087509        -0.160002        0.022148   
11         -0.086825        -0.160022        0.022322   
9          -0.087364        -0.160214        0.022107   
13         -0.087030        -0.160267        0.022073   
15         -0.086357        -0.160345        0.022548   
14         -0.086707        -0.160376        0.022684   
5          -0.087773        -0.160390        0.021955   
6          -0.087447        -0.160468        0.022237   
12         -0.087370        -0.160481        0.022317   
0          -0.088301        -0.160564        0.021681   
8          -0.087665        -0.160575        0.021822   
2          -0.087853        -0.160583        0.022135   
4          -0.088037        -0.160672        0.021766   
1          -0.087946

In [7]:
# Fold 1: fit the tuned pipeline on the train part and score it on the
# held-out valid part.
pca = PCA(whiten=True, n_components=330, random_state=0)
# subsample_freq=1 keeps subsample=.6 effective: recent LightGBM releases
# default subsample_freq to 0, which disables row subsampling.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0,
                                 n_estimators=400, learning_rate=0.01,
                                 boosting_type='gbdt',
                                 num_leaves=15, min_child_samples=20,
                                 colsample_bytree=1., subsample=.6,
                                 subsample_freq=1,
                                 reg_alpha=.025, reg_lambda=.025)

pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

pipe.fit(_train_X, _train_y)
# Column 1 of predict_proba is the positive-class probability; the slice is
# already one-dimensional, so no reshape is needed.
pred_prob = pipe.predict_proba(_valid_X)[:, 1]
pred = pipe.predict(_valid_X)
print('accuracy:', accuracy_score(_valid_y, pred))
print('loss:', log_loss(_valid_y, pred_prob))

accuracy: 0.909657320872
loss: 0.236311498919


## Fold 2

In [12]:
# Load the features extracted for fold 2 and separate them from the label.
df = pd.read_json('Data/fine_tune_resnet_extract_features_fold_2_1_train.json')

# Every column except the 'is_iceberg' label is treated as a feature.
features = list(df.columns)
features.remove('is_iceberg')

train_X = df[features]
train_y = df['is_iceberg']
print(train_X.shape, train_y.shape)

# Quick sanity check: first ten values of one feature column and of the label.
print(train_X['f_2_18432'][:10], train_y[:10])

(1604, 18433) (1604,)
0       0.367221
1      -0.105657
10      0.424620
100     0.138501
1000   -0.454816
1001    0.042453
1002   -0.173723
1003    0.255316
1004    0.077962
1005   -0.048471
Name: f_2_18432, dtype: float64 0       0
1       0
10      1
100     0
1000    0
1001    1
1002    0
1003    1
1004    1
1005    1
Name: is_iceberg, dtype: int64


In [14]:
# Split the fold-2 frame into its training and validation parts (fold number 2
# of the shared 5-fold split) and show the shape of each piece.
_train_X, _train_y, _valid_X, _valid_y = get_train_valid_x_y(df, 2)
print(*(part.shape for part in (_train_X, _train_y, _valid_X, _valid_y)))

(1283, 18433) (1283,) (321, 18433) (321,)


In [45]:
# Fold 2, coarse search: how many whitened PCA components to keep in front of a
# LightGBM classifier (100 trees, all other LightGBM settings left at defaults).
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, n_estimators=100)
pca = PCA(whiten=True, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    # 30, 80, ..., 480 components
    'pca__n_components': list(range(30, 481, 50))
}

# Scored by negated log loss (values closer to 0 are better).
# return_train_score=True is passed explicitly: scikit-learn >= 0.21 defaults it
# to False, in which case cv_results_ has no 'mean_train_score' entry and the
# column selection below raises KeyError.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_ , gs.best_score_)

# Summary table, best candidate first (rank, then spread of the test score across
# folds). sort_values returns a new frame instead of sorting a column slice in place.
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)

{'pca__n_components': 430} -0.118794381523
   mean_train_score  mean_test_score  std_test_score  \
8         -0.020917        -0.118794        0.030875   
9         -0.020762        -0.119275        0.029358   
7         -0.021412        -0.121521        0.030777   
6         -0.021777        -0.126184        0.035457   
1         -0.029396        -0.131546        0.029263   
5         -0.021970        -0.131922        0.033164   
4         -0.023382        -0.132481        0.033208   
3         -0.024964        -0.135118        0.031582   
0         -0.039234        -0.135197        0.030768   
2         -0.026504        -0.138934        0.027608   

                       params  rank_test_score  
8  {'pca__n_components': 430}                1  
9  {'pca__n_components': 480}                2  
7  {'pca__n_components': 380}                3  
6  {'pca__n_components': 330}                4  
1   {'pca__n_components': 80}                5  
5  {'pca__n_components': 280}                6

In [46]:
# Fold 2, fine search: PCA component count in steps of 10 over the upper end of
# the coarse grid, with the same LightGBM classifier (100 trees).
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, n_estimators=100)
pca = PCA(whiten=True, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    # 380, 390, ..., 480 components
    'pca__n_components': list(range(380, 481, 10))
}

# Scored by negated log loss (values closer to 0 are better).
# return_train_score=True is passed explicitly: scikit-learn >= 0.21 defaults it
# to False, in which case cv_results_ has no 'mean_train_score' entry and the
# column selection below raises KeyError.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_ , gs.best_score_)

# Summary table, best candidate first (rank, then spread of the test score across
# folds). sort_values returns a new frame instead of sorting a column slice in place.
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)

{'pca__n_components': 430} -0.118794381523
    mean_train_score  mean_test_score  std_test_score  \
5          -0.020917        -0.118794        0.030875   
10         -0.020762        -0.119275        0.029358   
3          -0.021134        -0.119509        0.030111   
8          -0.021085        -0.120669        0.026993   
4          -0.021320        -0.121130        0.030955   
9          -0.020690        -0.121203        0.028870   
0          -0.021412        -0.121521        0.030777   
7          -0.021016        -0.122089        0.027730   
6          -0.020848        -0.123778        0.032722   
2          -0.021479        -0.124274        0.031638   
1          -0.021251        -0.124986        0.031593   

                        params  rank_test_score  
5   {'pca__n_components': 430}                1  
10  {'pca__n_components': 480}                2  
3   {'pca__n_components': 410}                3  
8   {'pca__n_components': 460}                4  
4   {'pca__n_component

In [47]:
# Fold 2: with the PCA size fixed at 430 components, compare the 'dart' and
# 'gbdt' LightGBM boosting types.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, n_estimators=100)
pca = PCA(whiten=True, n_components=430, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__boosting_type': ['dart', 'gbdt']
}

# Scored by negated log loss (values closer to 0 are better).
# return_train_score=True is passed explicitly: scikit-learn >= 0.21 defaults it
# to False, in which case cv_results_ has no 'mean_train_score' entry and the
# column selection below raises KeyError.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_ , gs.best_score_)

# Summary table, best candidate first (rank, then spread of the test score across
# folds). sort_values returns a new frame instead of sorting a column slice in place.
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)

{'lgb__boosting_type': 'gbdt'} -0.118794381523
   mean_train_score  mean_test_score  std_test_score  \
1         -0.020917        -0.118794        0.030875   
0         -0.060134        -0.127591        0.022734   

                           params  rank_test_score  
1  {'lgb__boosting_type': 'gbdt'}                1  
0  {'lgb__boosting_type': 'dart'}                2  


In [12]:
# Fold 2: joint search over the number of trees and the learning rate, with
# PCA fixed at 430 components and boosting_type fixed to 'gbdt'.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, n_estimators=100, boosting_type='gbdt')
pca = PCA(whiten=True, n_components=430, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__n_estimators': [100, 200, 400, 800],
    'lgb__learning_rate': [0.1, 0.05, 0.01, 0.005]
}

# Scored by negated log loss (values closer to 0 are better).
# return_train_score=True is passed explicitly: scikit-learn >= 0.21 defaults it
# to False, in which case cv_results_ has no 'mean_train_score' entry and the
# column selection below raises KeyError.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_ , gs.best_score_)

# Summary table, best candidate first (rank, then spread of the test score across
# folds). sort_values returns a new frame instead of sorting a column slice in place.
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)

{'lgb__learning_rate': 0.1, 'lgb__n_estimators': 100} -0.118794381523
    mean_train_score  mean_test_score  std_test_score  \
0          -0.020917        -0.118794        0.030875   
11         -0.025836        -0.119424        0.029511   
4          -0.039585        -0.120264        0.025970   
5          -0.020894        -0.120341        0.030403   
15         -0.050257        -0.120934        0.024536   
10         -0.050202        -0.121294        0.024563   
1          -0.013551        -0.121558        0.034056   
2          -0.010991        -0.122876        0.033931   
3          -0.010991        -0.122876        0.033931   
6          -0.013634        -0.123019        0.034495   
7          -0.011002        -0.124163        0.034281   
9          -0.117962        -0.164887        0.015885   
14         -0.118086        -0.164996        0.015905   
8          -0.245252        -0.272681        0.010932   
13         -0.245548        -0.272826        0.010875   
12         -0.3908

In [13]:
# Print the parameter dicts in the table's current (sorted) row order, since the
# printed frame cuts the 'params' column short.
print(list(cv_df['params']))

[{'lgb__learning_rate': 0.1, 'lgb__n_estimators': 100}, {'lgb__learning_rate': 0.01, 'lgb__n_estimators': 800}, {'lgb__learning_rate': 0.05, 'lgb__n_estimators': 100}, {'lgb__learning_rate': 0.05, 'lgb__n_estimators': 200}, {'lgb__learning_rate': 0.005, 'lgb__n_estimators': 800}, {'lgb__learning_rate': 0.01, 'lgb__n_estimators': 400}, {'lgb__learning_rate': 0.1, 'lgb__n_estimators': 200}, {'lgb__learning_rate': 0.1, 'lgb__n_estimators': 400}, {'lgb__learning_rate': 0.1, 'lgb__n_estimators': 800}, {'lgb__learning_rate': 0.05, 'lgb__n_estimators': 400}, {'lgb__learning_rate': 0.05, 'lgb__n_estimators': 800}, {'lgb__learning_rate': 0.01, 'lgb__n_estimators': 200}, {'lgb__learning_rate': 0.005, 'lgb__n_estimators': 400}, {'lgb__learning_rate': 0.01, 'lgb__n_estimators': 100}, {'lgb__learning_rate': 0.005, 'lgb__n_estimators': 200}, {'lgb__learning_rate': 0.005, 'lgb__n_estimators': 100}]


In [17]:
# Fold 2: tree-shape search (leaves per tree x minimum samples per leaf) with
# PCA at 430 components and n_estimators=100 / learning_rate=0.1.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, 
                                 n_estimators=100, learning_rate=0.1, 
                                 boosting_type='gbdt')
pca = PCA(whiten=True, n_components=430, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__num_leaves': [2, 3, 7, 15, 31],
    'lgb__min_child_samples': [1, 2, 3, 5, 10]
}

# Scored by negated log loss (values closer to 0 are better).
# return_train_score=True is passed explicitly: scikit-learn >= 0.21 defaults it
# to False, in which case cv_results_ has no 'mean_train_score' entry and the
# column selection below raises KeyError.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_ , gs.best_score_)

# Summary table, best candidate first (rank, then spread of the test score across
# folds). sort_values returns a new frame instead of sorting a column slice in place.
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)
print(cv_df['params'].tolist())

{'lgb__min_child_samples': 1, 'lgb__num_leaves': 31} -0.118794381523
    mean_train_score  mean_test_score  std_test_score  \
4          -0.020917        -0.118794        0.030875   
9          -0.020917        -0.118794        0.030875   
14         -0.020917        -0.118794        0.030875   
19         -0.020917        -0.118794        0.030875   
24         -0.020917        -0.118794        0.030875   
3          -0.020897        -0.120086        0.030825   
8          -0.020897        -0.120086        0.030825   
13         -0.020897        -0.120086        0.030825   
18         -0.020897        -0.120086        0.030825   
23         -0.020897        -0.120086        0.030825   
0          -0.073757        -0.120910        0.020661   
5          -0.073757        -0.120910        0.020661   
10         -0.073757        -0.120910        0.020661   
15         -0.073757        -0.120910        0.020661   
20         -0.073757        -0.120910        0.020661   
1          -0.04077

In [20]:
# Fold 2: search the minimum gain required to make a split, with the tree shape
# fixed at num_leaves=31 / min_child_samples=1 and PCA at 430 components.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, 
                                 n_estimators=100, learning_rate=0.1, 
                                 boosting_type='gbdt',
                                 num_leaves=31, min_child_samples=1)
pca = PCA(whiten=True, n_components=430, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__min_split_gain': [.0, .1, .2]
}

# Scored by negated log loss (values closer to 0 are better).
# return_train_score=True is passed explicitly: scikit-learn >= 0.21 defaults it
# to False, in which case cv_results_ has no 'mean_train_score' entry and the
# column selection below raises KeyError.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_ , gs.best_score_)

# Summary table, best candidate first (rank, then spread of the test score across
# folds). sort_values returns a new frame instead of sorting a column slice in place.
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)
print(cv_df['params'].tolist())

{'lgb__min_split_gain': 0.0} -0.118794381523
   mean_train_score  mean_test_score  std_test_score  \
0         -0.020917        -0.118794        0.030875   
2         -0.020997        -0.120221        0.029969   
1         -0.020968        -0.121578        0.032913   

                         params  rank_test_score  
0  {'lgb__min_split_gain': 0.0}                1  
2  {'lgb__min_split_gain': 0.2}                2  
1  {'lgb__min_split_gain': 0.1}                3  
[{'lgb__min_split_gain': 0.0}, {'lgb__min_split_gain': 0.2}, {'lgb__min_split_gain': 0.1}]


In [21]:
# Fold 2: search row subsampling (subsample) and per-tree column subsampling
# (colsample_bytree), 5 x 5 = 25 candidates, with PCA at 430 components.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, 
                                 n_estimators=100, learning_rate=0.1, 
                                 boosting_type='gbdt',
                                 num_leaves=31, min_child_samples=1)
pca = PCA(whiten=True, n_components=430, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__subsample': [1., .8, .6, .4, .2],
    'lgb__colsample_bytree': [1., .8, .6, .4, .2]
}

# Scored by negated log loss (values closer to 0 are better).
# return_train_score=True is passed explicitly: scikit-learn >= 0.21 defaults it
# to False, in which case cv_results_ has no 'mean_train_score' entry and the
# column selection below raises KeyError.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_ , gs.best_score_)

# Summary table, best candidate first (rank, then spread of the test score across
# folds). sort_values returns a new frame instead of sorting a column slice in place.
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)
print(cv_df['params'].tolist())

KeyboardInterrupt: 

In [None]:
# Fold 2: search the reg_alpha / reg_lambda regularisation strengths
# (7 x 7 = 49 candidates) with PCA at 430 components.
#
# learning_rate is 0.1 here, the value picked together with n_estimators=100 by
# the fold-2 learning-rate search and used by the other fold-2 tuning cells; the
# previous 0.01 with only 100 trees ranked near the bottom of that search.
# NOTE(review): colsample_bytree=1. / subsample=.6 were not confirmed for fold 2
# (the subsample/colsample search above was interrupted) - confirm these values.
lg_clf = lightgbm.LGBMClassifier(n_jobs=4, objective='binary', random_state=0, 
                                 n_estimators=100, learning_rate=0.1, 
                                 boosting_type='gbdt',
                                 num_leaves=31, min_child_samples=1,
                                 colsample_bytree=1., subsample=.6)
pca = PCA(whiten=True, n_components=430, random_state=0)
pipe = Pipeline([
    ('pca', pca),
    ('lgb', lg_clf)
])

params = {
    'lgb__reg_alpha': [1., .8, .6, .4, .2, .1, .0],
    'lgb__reg_lambda': [1., .8, .6, .4, .2, .1, .0]
}

# Scored by negated log loss (values closer to 0 are better).
# return_train_score=True is passed explicitly: scikit-learn >= 0.21 defaults it
# to False, in which case cv_results_ has no 'mean_train_score' entry and the
# column selection below raises KeyError.
gs = GridSearchCV(pipe, params, scoring='neg_log_loss', cv=kfold, return_train_score=True)
gs.fit(_train_X, _train_y)
print(gs.best_params_ , gs.best_score_)

# Summary table, best candidate first (rank, then spread of the test score across
# folds). sort_values returns a new frame instead of sorting a column slice in place.
cv_df = pd.DataFrame(gs.cv_results_)
cv_df = cv_df[['mean_train_score', 'mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score'])
print(cv_df)
print(cv_df['params'].tolist())

In [15]:
# Fold 2 hold-out check: whitened PCA with 430 components -> LightGBM with
# 100 'gbdt' trees (other LightGBM settings left at their defaults), fitted on
# the training split and scored on the validation split.
lg_clf = lightgbm.LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    n_estimators=100,
    random_state=0,
    n_jobs=4,
)
pca = PCA(n_components=430, whiten=True, random_state=0)
pipe = Pipeline(steps=[('pca', pca), ('lgb', lg_clf)])

pipe.fit(_train_X, _train_y)

# Column 1 of predict_proba is the probability of the positive class; the slice
# is already one-dimensional with one entry per validation row.
pred_prob = pipe.predict_proba(_valid_X)[:, 1]
pred = pipe.predict(_valid_X)
print('accuracy:', accuracy_score(_valid_y, pred))
print('loss:', log_loss(_valid_y, pred_prob))

accuracy: 0.915887850467
loss: 0.276806026473


In [None]:
# Per-model lists of PCA transformers and LightGBM classifiers; each list
# currently holds a single entry.
all_pca = [
    PCA(n_components=330, whiten=True, random_state=0),
]

# NOTE(review): reg_alpha / reg_lambda are 0 here, whereas the final-fit cell with
# the same 330-component / 400-tree configuration uses 0.025 for both - confirm
# which values are intended.
all_lg_clf = [
    lightgbm.LGBMClassifier(
        objective='binary',
        boosting_type='gbdt',
        n_estimators=400,
        learning_rate=0.01,
        num_leaves=15,
        min_child_samples=20,
        subsample=.6,
        colsample_bytree=1.,
        reg_alpha=.0,
        reg_lambda=.0,
        random_state=0,
        n_jobs=4,
    ),
]