In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import xgboost
import lightgbm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, SparsePCA, MiniBatchSparsePCA, KernelPCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, accuracy_score
import copy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier

# Shared 5-fold splitter (shuffled, fixed seed). It is passed as `cv` to the
# cross_val_score and GridSearchCV calls further down so they use the same folds.
kfold = KFold(5, shuffle=True, random_state=999)

In [2]:
def get_train_df(top_percent=.9):
    """Load the training CSV and keep only its top-ranked feature columns.

    Args:
        top_percent: Fraction of feature columns to keep; forwarded to
            ``f_test_sort``.

    Returns:
        Tuple ``(train_X, train_y, columns)``: the frame restricted to the
        selected columns, the ``is_iceberg`` labels as a 1-D array, and the
        list of selected column names.
    """
    raw_df = pd.read_csv('Data/saak_transformed_train_energy_90.csv')
    n_rows = raw_df.shape[0]
    train_y = np.array(raw_df['is_iceberg']).reshape((n_rows,))

    # f_test_sort removes 'is_iceberg' from its candidates, so selecting by
    # its result also drops the label column from the feature frame.
    columns = f_test_sort(raw_df, top_percent=top_percent)
    train_X = raw_df[columns]

    print("Train data X: {}, y: {}".format(train_X.shape, train_y.shape))
    return train_X, train_y, columns

In [3]:
import operator
    
def f_test_sort(train_X, top_percent=.9):
    """Rank feature columns by a two-class F statistic and keep the best ones.

    For every column other than ``is_iceberg`` the score is the ratio of the
    between-class variation to the within-class variation (the latter divided
    by ``n_rows - 2``), computed over the rows labelled 0 and 1.

    Args:
        train_X: DataFrame holding the feature columns and an ``is_iceberg``
            column with labels 0/1.
        top_percent: Fraction of the ranked features to return.

    Returns:
        List of column names, highest score first, truncated to
        ``int(top_percent * n_features)`` entries. Ties keep column order.

    Raises:
        ValueError: If ``train_X`` has no ``is_iceberg`` column.
    """
    features = train_X.columns.tolist()
    features.remove('is_iceberg')
    feature_df = train_X[features]
    overall_mean = feature_df.mean(axis=0)

    class_train = [train_X[train_X.is_iceberg == label] for label in [0, 1]]
    print(class_train[0].shape, class_train[1].shape)

    # Accumulate both variations for all features at once. Everything is
    # aligned by column label, so no positional lookup into the mean Series
    # is needed.
    bgv = 0.0
    wgv = 0.0
    for class_df in class_train:
        class_features = class_df[features]
        class_mean = class_features.mean(axis=0)
        bgv = bgv + class_df.shape[0] * (class_mean - overall_mean) ** 2
        wgv = wgv + ((class_features - class_mean) ** 2).sum(axis=0)
    wgv = wgv / (train_X.shape[0] - 2)

    # A constant feature gives 0/0 = NaN. NaN breaks the ordering used by
    # sorted(), so score it 0.0, the minimum of this non-negative statistic.
    f_scores = (bgv / wgv).fillna(0.0)

    # sorted() is stable, so equal scores keep their original column order.
    ranked = sorted(f_scores.items(), key=lambda item: item[1], reverse=True)

    select_num = int(top_percent * len(ranked))
    return [col for col, _ in ranked[:select_num]]

In [4]:
def get_test_df(cols=None, path='Data/saak_transformed_test_energy_90.csv'):
    """Load the transformed test CSV, optionally restricted to some columns.

    ``pandas.read_csv`` ignores the order of ``usecols`` and returns columns
    in file order. When ``cols`` is a list of column names, the result is
    re-indexed so its column order matches ``cols``. This keeps the test
    features aligned with a model or PCA fitted on columns in that order.

    Args:
        cols: Column names to load, in the desired output order. ``None``
            loads every column. A callable or a list of integer positions is
            forwarded to ``read_csv`` unchanged, with no reordering.
        path: CSV file to read. Defaults to the project's test file.

    Returns:
        DataFrame with the requested columns.
    """
    if cols is not None and not callable(cols):
        cols = list(cols)

    test_df = pd.read_csv(path, usecols=cols)

    # Only reorder when cols holds labels; positions cannot index by name.
    if isinstance(cols, list) and all(isinstance(c, str) for c in cols):
        test_df = test_df[cols]

    print("Test data X: {}".format(test_df.shape))
    return test_df

In [5]:
def get_feature_num(pca, th):
    """Count the principal components whose explained variance is >= ``th``.

    The count is taken directly from the boolean mask. It does not rely on
    the order of ``value_counts`` buckets, and it still works when every
    component, or none, passes the threshold.

    Args:
        pca: Fitted object exposing ``explained_variance_``.
        th: Variance threshold.

    Returns:
        Number of components with ``explained_variance_ >= th``, as an int.
    """
    variances = np.asarray(pca.explained_variance_)
    return int(np.count_nonzero(variances >= th))

In [6]:
def get_transformed_df(pca, df):
    """Project ``df`` with ``pca`` and wrap the result in a DataFrame.

    Args:
        pca: Object exposing ``transform``.
        df: DataFrame to project.

    Returns:
        DataFrame of the projected values, sharing ``df``'s index, with
        columns named ``f_0``, ``f_1``, ... in output order.
    """
    projected = np.array(pca.transform(df))
    names = ['f_{}'.format(j) for j in range(projected.shape[1])]
    return pd.DataFrame(projected, index=df.index, columns=names)

In [7]:
def get_log_loss(fn, df, y):
    """Cross-validated log loss of a LightGBM model on ``fn`` PCA components.

    Args:
        fn: Number of PCA components to keep.
        df: Feature DataFrame; the PCA is fitted on all of it.
        y: Labels aligned with ``df``.

    Returns:
        Log loss averaged over the folds of the module-level ``kfold``.
    """
    reducer = PCA(n_components=fn, whiten=False, svd_solver='randomized', random_state=0)
    reducer.fit(df)
    projected = get_transformed_df(reducer, df)

    booster = lightgbm.LGBMClassifier(
        boosting_type='dart',
        objective='binary',
        n_estimators=100,
        n_jobs=4,
        random_state=0,
    )
    # The scorer returns negated log loss, so flip the sign back.
    fold_losses = -1 * cross_val_score(booster, X=projected, y=y, scoring='neg_log_loss', cv=kfold)

    return sum(fold_losses) / float(len(fold_losses))

In [8]:
def feature_selection(train_X, train_y):
    """Pick a PCA dimensionality by cross-validated log loss and apply it.

    A full PCA is fitted first. Each variance threshold below is converted to
    a component count with ``get_feature_num``, and every count is scored with
    ``get_log_loss``. A PCA with the best-scoring count is then fitted and
    used to transform ``train_X``.

    Args:
        train_X: Training feature DataFrame.
        train_y: Training labels.

    Returns:
        Tuple ``(train_X, train_y, pca)``: the transformed features, the
        unchanged labels, and the fitted PCA with the chosen component count.
    """
    full_pca = PCA(whiten=False, svd_solver='randomized', random_state=0)
    full_pca.fit(train_X)

    thresholds = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8]
    candidate_counts = []
    for threshold in thresholds:
        candidate_counts.append(get_feature_num(full_pca, threshold))
    print("# of features to test: ", candidate_counts)

    cv_scores = []
    for count in candidate_counts:
        cv_scores.append(get_log_loss(count, train_X, train_y))

    # Lowest log loss wins; argmin keeps the first candidate on ties.
    best = np.argmin(cv_scores)
    best_count = candidate_counts[best]
    print("# of features by light gbm: {}, cv score: {:.3f}".format(best_count, cv_scores[best]))

    pca = PCA(whiten=False, svd_solver='randomized', random_state=0, n_components=best_count)
    pca.fit(train_X)
    reduced_X = get_transformed_df(pca, train_X)

    print("New train data X: {}, y: {}".format(reduced_X.shape, train_y.shape))
    return reduced_X, train_y, pca

In [9]:
class base_tuner():
    """Base class for stage-wise grid-search tuners.

    Subclasses supply ``get_clf`` (build an estimator from
    ``self.default_params``) and ``tune`` (run one or more grid searches).
    Each search can fold its winning values back into ``default_params``, so
    later stages build on earlier ones.
    """

    def __init__(self, X, y):
        """Store the training data and start with no default parameters."""
        self.X = X
        self.y = y
        self.default_params = {}

    def fit_and_update_params(self, params, update=True):
        """Grid-search ``params`` on top of the current defaults.

        Args:
            params: Parameter grid passed to ``GridSearchCV``.
            update: When true, merge the best combination into
                ``self.default_params``.

        Returns:
            The best parameter combination: the best-ranked by test score,
            with ties broken by the smaller test-score standard deviation.
        """
        clf = self.get_clf()

        gs = GridSearchCV(clf, params, scoring='neg_log_loss', cv=kfold)
        gs.fit(self.X, self.y)

        # Keep only the columns used below. 'mean_train_score' is not present
        # in cv_results_ unless return_train_score=True, and selecting it
        # raised KeyError under the GridSearchCV defaults.
        cv_df = pd.DataFrame(gs.cv_results_)
        cv_df = cv_df[['mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
        cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score']).reset_index(drop=True)
        best_params = cv_df.loc[0, 'params']

        if update:
            self.default_params.update(best_params)

        print('Selected hyper-params:', best_params)
        print('==============================> cv score: {:.4f}'.format(cv_df.loc[0, 'mean_test_score']))
        return best_params

    def tune(self):
        """Run the tuning stages; subclasses override this. The base is a no-op."""
        pass

    def get_clf(self):
        """Build an estimator from ``default_params``; the base returns None."""
        return None

In [10]:
class lgbm_tuner(base_tuner):
    """Stage-wise grid-search tuner for ``lightgbm.LGBMClassifier``.

    ``tune`` runs four searches in order. Each one merges its best values
    into ``default_params`` before the next starts.
    """

    def __init__(self, X, y):
        super().__init__(X, y)
        self.default_params = dict(
            n_jobs=4,
            objective='binary',
            random_state=0,
            boosting_type='dart',
        )

    @staticmethod
    def _fine_grid(coarse_best, refine_zero=False):
        """Build a finer grid (step 0.05) around each coarse winner.

        A winner of 1.0 is only refined downwards. With ``refine_zero`` set,
        a winner of 0.0 is only refined upwards.
        """
        fine = {}
        for name, best in coarse_best.items():
            if best == 1.:
                fine[name] = [1., .95, .9, .85]
            elif refine_zero and best == 0.:
                fine[name] = [.0, .05, .1, .15]
            else:
                fine[name] = [best+.15, best+.1, best+.05, best, best-.05, best-.1, best-.15]
        return fine

    def tune_est_num_and_lr(self):
        """Search the number of estimators jointly with the learning rate."""
        self.fit_and_update_params({
            'n_estimators': [100, 200, 400, 800],
            'learning_rate': [0.1, 0.05, 0.01, 0.005],
        })

    def tune_leaves_num_and_gamma(self):
        """Search the leaf count jointly with the minimum split gain."""
        self.fit_and_update_params({
            'num_leaves': [2, 3, 7, 15, 31, 63],
            'min_split_gain': [.0, .1, .2],
        })

    def tune_sampling(self):
        """Coarse-then-fine search over row and column subsampling."""
        coarse = {
            'subsample': [1., .8, .6, .4, .2],
            'colsample_bytree': [1., .8, .6, .4, .2],
        }
        # The coarse winner only seeds the fine grid; it is not stored.
        coarse_best = self.fit_and_update_params(coarse, update=False)
        self.fit_and_update_params(self._fine_grid(coarse_best))

    def tune_regularization(self):
        """Coarse-then-fine search over the L1/L2 regularisation strengths."""
        coarse = {
            'reg_alpha': [1., .8, .6, .4, .2, .0],
            'reg_lambda': [1., .8, .6, .4, .2, .0],
        }
        coarse_best = self.fit_and_update_params(coarse, update=False)
        self.fit_and_update_params(self._fine_grid(coarse_best, refine_zero=True))

    def tune(self):
        """Run every stage in order and return a classifier with the result."""
        print('lgb tuner start tuning')
        for stage in (self.tune_est_num_and_lr,
                      self.tune_leaves_num_and_gamma,
                      self.tune_sampling,
                      self.tune_regularization):
            stage()

        return self.get_clf()

    def get_clf(self):
        """Build an unfitted LGBMClassifier from the current parameters."""
        return lightgbm.LGBMClassifier(**self.default_params)

In [11]:
class xgb_tuner(base_tuner):
    """Stage-wise grid-search tuner for ``xgboost.XGBClassifier``.

    ``tune`` runs six searches in order. Each one merges its best values
    into ``default_params`` before the next starts.
    """

    def __init__(self, X, y):
        # base_tuner.__init__ already stores X and y on the instance.
        super().__init__(X, y)
        self.default_params = dict([
            ('n_jobs', 4),
            ('objective', 'binary:logistic'),
            ('seed', 0),
            ('eval_metric', 'logloss'),
        ])

    @staticmethod
    def _fine_grid(coarse_best, refine_zero=False):
        """Build a finer grid (step 0.05) around each coarse winner.

        A winner of 1.0 is only refined downwards. With ``refine_zero`` set,
        a winner of 0.0 is only refined upwards.
        """
        fine = {}
        for name, best in coarse_best.items():
            if best == 1.:
                fine[name] = [1., .95, .9, .85]
            elif refine_zero and best == 0.:
                fine[name] = [.0, .05, .1, .15]
            else:
                fine[name] = [best+.15, best+.1, best+.05, best, best-.05, best-.1, best-.15]
        return fine

    def tune_booster(self):
        """Choose between the 'dart' and 'gbtree' boosters."""
        self.fit_and_update_params({'booster': ['dart', 'gbtree']})

    def tune_est_num_and_lr(self):
        """Search the number of estimators jointly with the learning rate."""
        self.fit_and_update_params({
            'n_estimators': [100, 200, 400, 800],
            'learning_rate': [0.1, 0.05, 0.01, 0.005],
        })

    def tune_max_depth(self):
        """Search the maximum tree depth."""
        self.fit_and_update_params({'max_depth': [1, 3, 5, 7, 9]})

    def tune_child_w_and_gamma(self):
        """Search the minimum child weight jointly with gamma."""
        self.fit_and_update_params({
            'min_child_weight': [1, 2, 4, 6, 8, 10],
            'gamma': [0, 0.1, 0.2],
        })

    def tune_sampling(self):
        """Coarse-then-fine search over row and column subsampling."""
        coarse = {
            'subsample': [1., .8, .6, .4, .2],
            'colsample_bytree': [1., .8, .6, .4, .2],
        }
        # The coarse winner only seeds the fine grid; it is not stored.
        coarse_best = self.fit_and_update_params(coarse, update=False)
        self.fit_and_update_params(self._fine_grid(coarse_best))

    def tune_regularization(self):
        """Coarse-then-fine search over the L1/L2 regularisation strengths."""
        coarse = {
            'reg_alpha': [1., .8, .6, .4, .2, .0],
            'reg_lambda': [1., .8, .6, .4, .2, .0],
        }
        coarse_best = self.fit_and_update_params(coarse, update=False)
        self.fit_and_update_params(self._fine_grid(coarse_best, refine_zero=True))

    def tune(self):
        """Run every stage in order and return a classifier with the result."""
        print('xgb tuner start tuning')
        for stage in (self.tune_booster,
                      self.tune_est_num_and_lr,
                      self.tune_max_depth,
                      self.tune_child_w_and_gamma,
                      self.tune_sampling,
                      self.tune_regularization):
            stage()

        return self.get_clf()

    def get_clf(self):
        """Build an unfitted XGBClassifier from the current parameters."""
        return xgboost.XGBClassifier(**self.default_params)

In [12]:
class lr_tuner(base_tuner):
    """Grid-search tuner for an L1-penalised ``LogisticRegression``.

    The only tuned parameter is the solver, chosen between 'liblinear' and
    'saga'.
    """

    def __init__(self, X, y):
        # base_tuner.__init__ already stores X and y on the instance.
        super(lr_tuner, self).__init__(X, y)
        self.default_params = {
            'penalty': 'l1',
            'max_iter': 10000,
            # Both candidate solvers shuffle the data. Fix the seed so CV
            # scores and the final model are reproducible, as in the other
            # tuners in this notebook.
            'random_state': 0
        }

    def tune(self):
        """Pick the best solver and return a classifier configured with it."""
        print('logistic regression tuner start tuning')
        params = {
            'solver': ['liblinear', 'saga']
        }

        self.fit_and_update_params(params)

        return self.get_clf()

    def get_clf(self):
        """Build an unfitted LogisticRegression from the current parameters."""
        return LogisticRegression(**self.default_params)

In [13]:
class mlp_tuner(base_tuner):
    """Two-stage grid-search tuner for ``MLPClassifier``.

    Stage one picks the solver and hidden-layer layout; stage two picks the
    ``alpha`` penalty on top of that choice.
    """

    def __init__(self, X, y):
        # base_tuner.__init__ already stores X and y on the instance.
        super().__init__(X, y)
        self.default_params = dict(
            learning_rate='adaptive',
            learning_rate_init=0.005,
            max_iter=2000,
            random_state=0,
        )

    def tune(self):
        """Run both stages in order and return a classifier with the result."""
        print('mlp tuner start tuning')
        stages = (
            {
                'solver': ['lbfgs', 'sgd', 'adam'],
                'hidden_layer_sizes': [(100,), (150,), (100, 100,)],
            },
            {
                'alpha': [10., 5., 2., 1., .8, .5, .2, .1],
            },
        )
        for grid in stages:
            self.fit_and_update_params(grid)

        return self.get_clf()

    def get_clf(self):
        """Build an unfitted MLPClassifier from the current parameters."""
        return MLPClassifier(**self.default_params)

In [14]:
class adb_tuner(base_tuner):
    """Grid-search tuner for ``AdaBoostClassifier``.

    A single search covers the number of estimators and the learning rate.
    """

    def __init__(self, X, y):
        # base_tuner.__init__ already stores X and y on the instance.
        super().__init__(X, y)
        self.default_params = dict(algorithm='SAMME.R', random_state=0)

    def tune(self):
        """Run the search and return a classifier with the best values."""
        print('adaboost tuner start tuning')
        grid = dict(
            n_estimators=[100, 200, 400, 800],
            learning_rate=[0.1, 0.05, 0.01, 0.005],
        )
        self.fit_and_update_params(grid)

        return self.get_clf()

    def get_clf(self):
        """Build an unfitted AdaBoostClassifier from the current parameters."""
        return AdaBoostClassifier(**self.default_params)

In [15]:
class bg_tuner(base_tuner):
    """Stage-wise grid-search tuner for ``BaggingClassifier``.

    ``tune`` first picks the number of estimators, then the sample and
    feature fractions.
    """

    def __init__(self, X, y):
        # base_tuner.__init__ already stores X and y on the instance.
        super().__init__(X, y)
        self.default_params = dict(random_state=0, bootstrap_features=True)

    def tune_est_num(self):
        """Search the number of estimators."""
        self.fit_and_update_params({'n_estimators': [100, 200, 400, 800]})

    def tune_sampling(self):
        """Coarse-then-fine search over the sample and feature fractions."""
        coarse = {
            'max_samples': [1., .8, .6, .4, .2],
            'max_features': [1., .8, .6, .4, .2],
        }
        # The coarse winner only seeds the fine grid; it is not stored.
        coarse_best = self.fit_and_update_params(coarse, update=False)

        # Refine in steps of 0.05; a winner of 1.0 is only refined downwards.
        fine = {
            name: ([1., .95, .9, .85] if best == 1.
                   else [best+.15, best+.1, best+.05, best, best-.05, best-.1, best-.15])
            for name, best in coarse_best.items()
        }
        self.fit_and_update_params(fine)

    def tune(self):
        """Run both stages in order and return a classifier with the result."""
        print('bagging tuner start tuning')
        for stage in (self.tune_est_num, self.tune_sampling):
            stage()

        return self.get_clf()

    def get_clf(self):
        """Build an unfitted BaggingClassifier from the current parameters."""
        return BaggingClassifier(**self.default_params)

In [16]:
class gb_tuner(base_tuner):
    """Stage-wise grid-search tuner for ``GradientBoostingClassifier``.

    ``tune`` runs five searches in order. Each one merges its best values
    into ``default_params`` before the next starts.
    """

    def __init__(self, X, y):
        # base_tuner.__init__ already stores X and y on the instance.
        super().__init__(X, y)
        self.default_params = dict(random_state=0)

    def tune_loss_criterion(self):
        """Search the loss function jointly with the split criterion."""
        # NOTE(review): 'deviance' and 'mse' appear to have been renamed in
        # newer scikit-learn releases; confirm against the installed version.
        self.fit_and_update_params({
            'loss': ['deviance', 'exponential'],
            'criterion': ['friedman_mse', 'mse'],
        })

    def tune_est_num_and_lr(self):
        """Search the number of estimators jointly with the learning rate."""
        self.fit_and_update_params({
            'n_estimators': [100, 200, 400, 800],
            'learning_rate': [0.1, 0.05, 0.01, 0.005],
        })

    def tune_max_depth(self):
        """Search the maximum tree depth."""
        self.fit_and_update_params({'max_depth': [1, 3, 5, 7, 9]})

    def tune_child(self):
        """Search the split-size and impurity-decrease thresholds."""
        self.fit_and_update_params({
            'min_samples_split': [2, 3, 7, 15, 31],
            'min_impurity_decrease': [0, 0.1, 0.2],
        })

    def tune_sampling(self):
        """Coarse-then-fine search over the row and feature fractions."""
        coarse = {
            'subsample': [1., .8, .6, .4, .2],
            'max_features': [1., .8, .6, .4, .2],
        }
        # The coarse winner only seeds the fine grid; it is not stored.
        coarse_best = self.fit_and_update_params(coarse, update=False)

        # Refine in steps of 0.05; a winner of 1.0 is only refined downwards.
        fine = {
            name: ([1., .95, .9, .85] if best == 1.
                   else [best+.15, best+.1, best+.05, best, best-.05, best-.1, best-.15])
            for name, best in coarse_best.items()
        }
        self.fit_and_update_params(fine)

    def tune(self):
        """Run every stage in order and return a classifier with the result."""
        print('gradient boosting tuner start tuning')
        for stage in (self.tune_loss_criterion,
                      self.tune_est_num_and_lr,
                      self.tune_max_depth,
                      self.tune_child,
                      self.tune_sampling):
            stage()

        return self.get_clf()

    def get_clf(self):
        """Build an unfitted GradientBoostingClassifier from the current parameters."""
        return GradientBoostingClassifier(**self.default_params)

In [17]:
class rf_tuner(base_tuner):
    """Stage-wise grid-search tuner for ``RandomForestClassifier``.

    ``tune`` runs five searches in order. Each one merges its best values
    into ``default_params`` before the next starts.
    """

    def __init__(self, X, y):
        # base_tuner.__init__ already stores X and y on the instance.
        super(rf_tuner, self).__init__(X, y)
        self.default_params = dict(random_state=0, n_jobs=4)

    def tune_loss_criterion(self):
        """Search the class weighting jointly with the split criterion."""
        self.fit_and_update_params({
            'class_weight': [None, 'balanced'],
            'criterion': ['gini', 'entropy'],
        })

    def tune_est_num(self):
        """Search the number of trees."""
        self.fit_and_update_params({'n_estimators': [100, 200, 400, 800]})

    def tune_max_depth(self):
        """Search the maximum tree depth."""
        self.fit_and_update_params({'max_depth': [1, 3, 5, 7, 9]})

    def tune_child(self):
        """Search the split-size and impurity-decrease thresholds."""
        self.fit_and_update_params({
            'min_samples_split': [2, 3, 7, 15, 31],
            'min_impurity_decrease': [0, 0.1, 0.2],
        })

    def tune_sampling(self):
        """Coarse-then-fine search over the per-split feature fraction."""
        coarse = {'max_features': [1., .8, .6, .4, .2]}
        # The coarse winner only seeds the fine grid; it is not stored.
        coarse_best = self.fit_and_update_params(coarse, update=False)

        # Refine in steps of 0.05; a winner of 1.0 is only refined downwards.
        fine = {
            name: ([1., .95, .9, .85] if best == 1.
                   else [best+.15, best+.1, best+.05, best, best-.05, best-.1, best-.15])
            for name, best in coarse_best.items()
        }
        self.fit_and_update_params(fine)

    def tune(self):
        """Run every stage in order and return a classifier with the result."""
        print('random forest tuner start tuning')
        for stage in (self.tune_loss_criterion,
                      self.tune_est_num,
                      self.tune_max_depth,
                      self.tune_child,
                      self.tune_sampling):
            stage()

        return self.get_clf()

    def get_clf(self):
        """Build an unfitted RandomForestClassifier from the current parameters."""
        return RandomForestClassifier(**self.default_params)

In [18]:
class et_tuner(rf_tuner):
    """Tuner for ``ExtraTreesClassifier`` reusing the random-forest stages.

    Construction and every ``tune_*`` stage are inherited from ``rf_tuner``.
    Only the progress message and the estimator class differ.
    """

    def tune(self):
        """Run the inherited stages in order and return the tuned classifier."""
        print('extra tree tuner start tuning')
        for stage in (self.tune_loss_criterion,
                      self.tune_est_num,
                      self.tune_max_depth,
                      self.tune_child,
                      self.tune_sampling):
            stage()

        return self.get_clf()

    def get_clf(self):
        """Build an unfitted ExtraTreesClassifier from the current parameters."""
        return ExtraTreesClassifier(**self.default_params)

In [19]:
from stacking_models_api import StackingAveragedModels
import gc

In [20]:
# Only the ids are needed from the raw test JSON, so release the frame
# as soon as they have been extracted.
test_df = pd.read_json('Data/test.json')
print("Test data shape: {}".format(test_df.shape))
test_ids = test_df['id']
del test_df
gc.enable()
gc.collect()

# Rank and select features, then reduce them with the PCA chosen by CV.
train_X, train_y, select_cols = get_train_df(top_percent=.6)
print(train_y[:10])
train_X, train_y, pca = feature_selection(train_X, train_y)

# Apply the same column selection and PCA projection to the test set.
test_X = get_transformed_df(pca, get_test_df(select_cols))

tuners = [
    lr_tuner(train_X, train_y),
    lgbm_tuner(train_X, train_y),
    xgb_tuner(train_X, train_y),
]

# Tune each model, then build a fresh estimator from its tuned parameters.
clfs = []
for tuner in tuners:
    tuner.tune()
    clfs.append(tuner.get_clf())

# Only the LightGBM and XGBoost models are fitted and written out here.
submission_names = ((lgbm_tuner, 'lgb'), (xgb_tuner, 'xgb'))
for tuner, clf in zip(tuners, clfs):
    name = next((label for kind, label in submission_names if isinstance(tuner, kind)), None)
    if name is None:
        continue

    clf.fit(train_X, train_y)
    predictions = clf.predict_proba(test_X)[:, 1]
    print("{} tuner predictions\n".format(name), predictions)

    submission = pd.DataFrame({'id': test_ids, 'is_iceberg': predictions})
    submission.to_csv('Submissions/submission_{}_auto_fine_tune_saak.csv'.format(name),
                      float_format="%.15f", index=False)

Test data shape: (8424, 4)
(851, 31601) (753, 31601)
Train data X: (1604, 18960), y: (1604,)
[0 0 1 0 0 1 1 0 0 0]


KeyboardInterrupt: 

In [21]:
# Load the raw training JSON and print its first ten rows for a quick look.
train = pd.read_json('Data/train.json')
print(train.head(10))

                                              band_1  \
0  [-27.878360999999998, -27.15416, -28.668615, -...   
1  [-12.242375, -14.920304999999999, -14.920363, ...   
2  [-24.603676, -24.603714, -24.871029, -23.15277...   
3  [-22.454607, -23.082819, -23.998013, -23.99805...   
4  [-26.006956, -23.164886, -23.164886, -26.89116...   
5  [-20.769371, -20.769434, -25.906025, -25.90602...   
6  [-26.673811, -23.666162, -27.622442, -28.31768...   
7  [-24.989119, -27.755224, -25.817074, -24.98927...   
8  [-17.146641, -17.146572, -17.994583, -19.44553...   
9  [-24.020853, -23.551275, -27.18819, -29.126434...   

                                              band_2        id inc_angle  \
0  [-27.154118, -29.537888, -31.0306, -32.190483,...  dfd5f913   43.9239   
1  [-31.506321, -27.984554, -26.645678, -23.76760...  e25388fd   38.1562   
2  [-24.870956, -24.092632, -20.653963, -19.41104...  58b2aaa0   45.2859   
3  [-27.889421, -27.519794, -27.165262, -29.10350...  4cfc3a18   43.8306   
4  