In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import gc
import os
import pickle
import warnings
from time import time

warnings.filterwarnings('ignore')


In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import log_loss, accuracy_score, f1_score, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.manifold import TSNE
from sklearn.decomposition import NMF

from scipy.optimize import linear_sum_assignment
from pystacknet.pystacknet import StackNetClassifier



In [None]:
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
from bayes_opt import BayesianOptimization


In [None]:
# for convenient
from keras.layers import Input, Dense, Reshape, Flatten, Dropout, multiply, GaussianNoise, Lambda
from keras.layers import BatchNormalization, Activation, Embedding, ZeroPadding2D
from keras.layers import MaxPooling2D, merge
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.convolutional import UpSampling2D, Conv2D
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras import losses
from keras.utils import to_categorical
from keras.regularizers import L1L2
import keras
import keras.backend as K


In [None]:
def acc(ypred, y):
    """Clustering accuracy under the best one-to-one label matching.

    Cluster ids are arbitrary, so each distinct value of ``ypred`` is paired
    with at most one distinct value of ``y`` such that the number of samples
    on which the paired labels co-occur is maximal (Hungarian algorithm).
    The share of samples covered by that pairing is returned.

    The two inputs may contain different numbers of distinct labels; the
    surplus labels on the larger side simply stay unmatched.

    Parameters
    ----------
    ypred : array-like of shape (n_samples,)
        Predicted cluster labels.
    y : array-like of shape (n_samples,)
        Reference labels.

    Returns
    -------
    float
        Fraction of samples that fall into a matched (cluster, label) pair.

    Raises
    ------
    ValueError
        If the inputs are empty or have different lengths.
    """
    ypred = np.asarray(ypred).ravel()
    y = np.asarray(y).ravel()
    if len(y) == 0:
        raise ValueError("acc() needs at least one sample")
    if len(ypred) != len(y):
        raise ValueError("ypred and y must have the same length")

    # Re-code both label sets as 0..k-1 so they can index the table directly.
    s, s_idx = np.unique(ypred, return_inverse=True)
    t, t_idx = np.unique(y, return_inverse=True)

    # C[i, j] = number of samples with predicted label s[i] and true label t[j].
    C = np.zeros((len(s), len(t)), dtype=np.int64)
    np.add.at(C, (s_idx.ravel(), t_idx.ravel()), 1)

    # linear_sum_assignment minimises, so turn the counts into a cost.
    row, col = linear_sum_assignment(C.max() - C)

    return float(C[row, col].sum()) / len(y)

def sampling(args):
    """Draw a sample ``z_mean + exp(0.5 * z_log_var) * eps`` with eps ~ N(0, 1).

    ``args`` is a pair ``(z_mean, z_log_var)`` of backend tensors. The noise
    tensor is created with the dynamic first dimension of ``z_mean`` and its
    static second dimension.
    """
    mu, log_var = args
    n_rows = K.shape(mu)[0]        # dynamic size of axis 0
    n_cols = K.int_shape(mu)[1]    # static size of axis 1

    noise = K.random_normal(shape=(n_rows, n_cols))
    scale = K.exp(0.5 * log_var)
    return mu + scale * noise

In [None]:
# Run a full garbage-collection pass; the call's return value is shown as the cell output.
gc.collect()

In [None]:
# Name fragments used by the preprocessing cell: for every entry a new column of
# that name is created by accumulating all columns whose name contains the fragment.
category = ['psfMag', 'fiberMag', 'petroMag', 'modelMag', '_u', '_g', '_r', '_i', '_z']

In [None]:
def signed_log(a):
    """Sign-preserving logarithm of a scalar.

    Returns ``-log(-a)`` for negative ``a`` and ``log(a)`` otherwise.
    Kept as an alternative to :func:`exp`; it is not called by the pipeline.
    """
    if a < 0:
        return -np.log(-a)
    return np.log(a)


def exp(a):
    """Sign-preserving square of a scalar: ``-(a**2)`` if ``a < 0`` else ``a**2``.

    Despite its name this is not an exponential. The name is kept because the
    preprocessing loop applies it with ``Series.map(exp)``.
    """
    if a < 0:
        return -(a ** 2)
    return a ** 2
    
# ---- load the raw files ----------------------------------------------------
tr = pd.read_csv('./data/train.csv')
te = pd.read_csv('./data/test.csv')

sub = pd.read_csv('./data/sample_submission.csv', index_col=0)

# class name -> integer id, in the column order of the sample submission
column_number = {column: i for i, column in enumerate(sub.columns)}


def to_number(x, dic):
    """Return ``dic[x]``; raises KeyError when ``x`` is not a key."""
    return dic[x]


tr['type_num'] = tr['type'].apply(lambda x: to_number(x, column_number))

target = tr['type_num']
t = target.copy()

train_X = tr.drop(['id', 'type', 'type_num'], axis=1)
test_X = te.drop(['id',], axis=1)

# 't' marks the origin of each row (1 = train, 0 = test) so that the stacked
# frame can be split again further down.
train_X['t'] = np.ones(len(train_X))
test_X['t'] = np.zeros(len(test_X))
m = pd.concat([train_X, test_X])

# ---- train rows outside the test range --------------------------------------
# Collect the index labels of TRAIN rows having any feature that is not strictly
# inside (min, max) of the same feature in the test file. Only train_X is
# inspected: the stacked frame `m` repeats index labels (train and test both
# start at 0), so reading labels from it would also pick up test rows and drop
# unrelated train rows that happen to share their label.
ctd = []
for c in m.columns[1:-1]:
    mini = np.min(te[c])
    maxi = np.max(te[c])
    inside = (train_X[c] > mini) & (train_X[c] < maxi)  # NaN compares False -> flagged
    ctd += list(train_X.index[~inside])

# ---- aggregate columns ------------------------------------------------------
for c in category:
    # Pick the source columns BEFORE the aggregate exists, otherwise the new
    # column matches its own name and the sum gets added to itself.
    members = [cl for cl in m.columns if c in cl]
    m[c] = np.zeros(len(m))
    for cl in members:
        m[c] += m[cl]


new_col = ['fiberID', 'psfMag_u', 'psfMag_g', 'psfMag_r', 'psfMag_i', 'psfMag_z',
       'fiberMag_u', 'fiberMag_g', 'fiberMag_r', 'fiberMag_i', 'fiberMag_z',
       'petroMag_u', 'petroMag_g', 'petroMag_r', 'petroMag_i', 'petroMag_z',
       'modelMag_u', 'modelMag_g', 'modelMag_r', 'modelMag_i', 'modelMag_z',
       'psfMag', 'fiberMag', 'petroMag', 'modelMag','_u', '_g', '_r', '_i', '_z', 't']

m = m[new_col]
tr = m[m['t'] == 1].drop(['t'], axis=1)
te = m[m['t'] == 0].drop(['t'], axis=1)

ctd = np.array(list(set(ctd)))
tr = tr.drop(ctd)
tr = tr.reset_index(drop=True)

# untransformed copies, min-max scaled below (kept for the optional NMF step)
tr2 = tr.copy()
te2 = te.copy()

# ---- transform and scale every column except fiberID ------------------------
for c in tr.columns[1:]:
    tr[c] = tr[c].map(exp)
    te[c] = te[c].map(exp)

    # The scaling statistics come from the train column and must be read
    # before that column is overwritten; otherwise the test column would be
    # "scaled" with the statistics of already-scaled data (mean 0, std 1) and
    # stay on its original scale.
    mean_c = np.mean(tr[c])
    std_c = np.std(tr[c])
    tr[c] = (tr[c] - mean_c) / std_c
    te[c] = (te[c] - mean_c) / std_c

    min_c = np.min(tr2[c])
    range_c = np.max(tr2[c]) - min_c
    tr2[c] = (tr2[c] - min_c) / range_c  # for nmf
    te2[c] = (te2[c] - min_c) / range_c

# ---- one-hot fiberID, encoded jointly so train and test share the columns ---
m = pd.concat([tr, te])
fiber = pd.get_dummies(m['fiberID'], prefix='fiber')
tr_fiber = fiber.iloc[0:len(tr), :]
te_fiber = fiber.iloc[len(tr): , :]

# ---- labels aligned with the filtered train rows -----------------------------
t = t.drop(ctd)
t = t.reset_index().drop('index', axis=1)  # one-column DataFrame ('type_num')

target = t.copy()
target = target.values.flatten()
target_wide = to_categorical(target)

tr_X = tr.copy()
te_X = te.copy()

In [None]:
# NOTE: this cell rebinds `tr` / `te` from DataFrames to NumPy arrays, so it
# cannot be run twice without re-running the preprocessing cell first.

# feature frames without the fiber id
ttr, tte = (frame.drop('fiberID', axis=1) for frame in (tr, te))

# --- 15 principal components, fitted on train only ---
pca = PCA(15, random_state=42)
pca.fit(ttr)
pca_tr, pca_te = pca.transform(ttr), pca.transform(tte)
print('complete pca')

# (an NMF(15) variant on tr2 / te2 was tried here and is disabled)

# original columns (fiberID included) followed by the components
tr = np.hstack([tr.values, pca_tr])
te = np.hstack([te.values, pca_te])

# --- KMeans cluster id, plus a one-hot version ---
km = KMeans(19, random_state=42)
km.fit(ttr)
km_tr1, km_te1 = km.predict(ttr), km.predict(tte)
km_tr2, km_te2 = to_categorical(km_tr1), to_categorical(km_te1)
print('complete kmeans')

# --- Gaussian-mixture component id, plus a one-hot version ---
gm = GaussianMixture(19, random_state=42)
gm.fit(ttr)
gm_tr1, gm_te1 = gm.predict(ttr), gm.predict(tte)
gm_tr2, gm_te2 = to_categorical(gm_tr1), to_categorical(gm_te1)
print('complete ggm')

# final matrices: features + PCA + the two integer cluster ids as extra columns
tr_X = np.column_stack([tr, km_tr1, gm_tr1])
te_X = np.column_stack([te, km_te1, gm_te1])

In [None]:
# Sanity check: show the first row of the test feature matrix.
te_X[0]

In [None]:
# Baseline candidates. Only `knn` is placed in `models` and evaluated; the
# other three estimators are constructed but not used in this cell.
lr = LogisticRegression(random_state=42)
svc = SVC(random_state=42, probability=True)
knn = KNeighborsClassifier(n_neighbors=1)
rf = RandomForestClassifier(max_depth=9,
                           random_state=42)
models = [knn]
# NOTE: the loop variable rebinds the global name `m`.
for m in models:
    s = time()
    # mean negative log loss over 4 cross-validation folds
    print(np.mean(cross_val_score(m, tr_X, t,  scoring='neg_log_loss', cv = 4 )))
    # elapsed wall-clock seconds
    print(time() - s)

In [None]:
# Shuffled 70/30 hold-out split of the engineered training matrix, used by the
# hyper-parameter search below.
# NOTE: this rebinds train_X / test_X, which the preprocessing cell used for the
# raw train / test frames; from here on they are parts of tr_X.
train_X, test_X, train_y, test_y = train_test_split(tr_X, t, test_size=0.3, random_state=12, shuffle=True)



In [None]:
def LGB_bayesian(
    #learning_rate,
    num_leaves, bagging_fraction, feature_fraction, min_child_weight,
    min_data_in_leaf, max_depth, reg_alpha, reg_lambda):
    """Objective for the Bayesian search: negative hold-out log loss of LightGBM.

    Fits an ``LGBMClassifier`` on the global ``train_X`` / ``train_y`` with the
    proposed hyper-parameters, using the global ``test_X`` / ``test_y`` as the
    early-stopping evaluation set, and returns ``-log_loss`` on that same set
    (higher is better).
    """
    # The search proposes floats; these three are truncated to integers.
    num_leaves, min_data_in_leaf, max_depth = (
        int(v) for v in (num_leaves, min_data_in_leaf, max_depth))
    assert all(type(v) == int for v in (num_leaves, min_data_in_leaf, max_depth))

    # values under search
    tuned = {
        'num_leaves': num_leaves,
        'min_data_in_leaf': min_data_in_leaf,
        'min_child_weight': min_child_weight,
        'bagging_fraction': bagging_fraction,
        'feature_fraction': feature_fraction,
        'max_depth': max_depth,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
    }
    # settings held constant across all trials
    fixed = {
        'objective': 'softmax',
        'save_binary': True,
        'seed': 12,
        'feature_fraction_seed': 12,
        'bagging_seed': 12,
        'drop_seed': 12,
        'data_random_seed': 12,
        'boosting': 'gbdt',  # some get better result using 'dart'
        'verbose': 1,
        'is_unbalance': True,
        'boost_from_average': True,
        'metric': 'multi_logloss',
    }
    params = {**tuned, **fixed}

    clf = lgb.LGBMClassifier(**params)
    clf.fit(train_X, train_y,
            early_stopping_rounds=50,
            eval_set=[(test_X, test_y)],
            eval_metric='multi_logloss',
            verbose=0)

    return -log_loss(test_y, clf.predict_proba(test_X))


In [None]:
# Search interval (low, high) for each argument of LGB_bayesian.
# num_leaves, min_data_in_leaf and max_depth are truncated to int inside the objective.
bounds_LGB = {
    'num_leaves': (300, 1000), 
    'min_data_in_leaf': (0, 150),
    'bagging_fraction' : (0.3, 0.9),
    'feature_fraction' : (0.3, 0.9),
#     'learning_rate': (0.01, 0.3),
    'min_child_weight': (0.01, 3),   
    'reg_alpha': (0.1, 3), 
    'reg_lambda': (0.1, 3),
    'max_depth':(6, 25),
}

In [None]:
# Bayesian optimiser that maximises LGB_bayesian inside bounds_LGB (fixed seed).
optimizer = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=42)


In [None]:
# Search budget passed to maximize(): init_points initial evaluations, then n_iter further ones.
# Each evaluation trains one LightGBM model.
init_points = 10
n_iter = 20

optimizer.maximize(init_points=init_points, n_iter=n_iter)


In [None]:
# Best point found by the search.
best = optimizer.max['params']

# Tuned keys in the order they are stored; the ones in `integer_keys` are
# truncated to int, as in the search objective.
tuned_keys = ['min_data_in_leaf', 'num_leaves', 'min_child_weight',
              'bagging_fraction', 'feature_fraction',
              'reg_lambda', 'reg_alpha', 'max_depth']
integer_keys = {'min_data_in_leaf', 'num_leaves', 'max_depth'}

param_lgb = {k: int(best[k]) if k in integer_keys else best[k] for k in tuned_keys}

# Fixed settings for the final model (learning_rate is left at its default).
param_lgb.update({
    'objective': 'softmax',
    'save_binary': True,
    'seed': 12,
    'feature_fraction_seed': 12,
    'bagging_seed': 12,
    'drop_seed': 12,
    'data_random_seed': 12,
    'boosting_type': 'gbdt',  # also consider 'dart'
    'verbose': 1,
    'is_unbalance': False,
    'boost_from_average': True,
    'metric': 'multi_logloss',
})

params = param_lgb.copy()

In [None]:
# Persist the tuned parameters. The context manager closes the file even if
# pickling raises.
with open('best_params_robust.bin', 'wb') as f:
    pickle.dump(params, f)


In [None]:
# Replace `params` with a set stored on disk.
# NOTE(review): this reads 'best_params.bin', while the cell above writes
# 'best_params_robust.bin' -- confirm which file is meant; on a fresh run the
# file read here may not exist.
# NOTE: pickle.load executes arbitrary code from the file; only load files you created.
with open('best_params.bin', 'rb') as f:
    params = pickle.load(f)


In [None]:
%%time
# Final LightGBM model on the full training matrix.
# Early stopping is deliberately not configured: fit() receives no eval_set, so
# there is nothing to monitor.
lgb_clf = lgb.LGBMClassifier(**params)
lgb_clf.fit(tr_X, t)

In [None]:
# Mean negative log loss of the LightGBM configuration over 4 cross-validation folds.
print('score is',np.mean(cross_val_score(lgb_clf, tr_X, t,  scoring='neg_log_loss', cv = 4 )))

In [None]:
# Class probabilities for the test rows, written in the sample-submission layout
# (same columns and index as `sub`).
y_pred = lgb_clf.predict_proba(te_X)
submission = pd.DataFrame(data=y_pred, columns=sub.columns, index=sub.index)
submission.to_csv('./sub/pre/robust_new_lgb2.csv', index=True)


In [None]:
# Visual separator between notebook sections; the bare string is echoed as cell output.
'############################'

In [None]:
# Params for XGBoost, translated from the tuned LightGBM values.
# XGBClassifier does not understand LightGBM-only names, so each value is
# passed under its XGBoost name where one exists and left out otherwise.
params_fx = {
    'max_leaves': params['num_leaves'],              # LightGBM: num_leaves
    'colsample_bytree': params['feature_fraction'],  # LightGBM: feature_fraction
    'min_child_weight': params['min_child_weight'],
    'reg_lambda': params['reg_lambda'],
    'reg_alpha': params['reg_alpha'],
    'max_depth': params['max_depth'],
    # min_data_in_leaf: no XGBoost counterpart, omitted.
    # bagging_fraction: not carried over as `subsample`. The parameter dicts
    # built in this notebook never set bagging_freq, and LightGBM only bags
    # when bagging_freq is non-zero, so the tuned value was never in effect.
}

In [None]:
%%time
# XGBoost on the full training matrix. No early stopping is configured because
# fit() is given no evaluation set.
xgb_clf = xgb.XGBClassifier(
            **params_fx,
#             n_estimators=500,
            tree_method = 'hist',
            booster = 'gbtree',
            eval_metric = 'mlogloss',
            objective = 'multi:softprob',
            num_class = 19,
    ).fit(tr_X, t)

In [None]:
# XGBoost class probabilities for the test rows, in the sample-submission layout.
y_pred = xgb_clf.predict_proba(te_X)
submission = pd.DataFrame(data=y_pred, columns=sub.columns, index=sub.index)
submission.to_csv('./sub/pre/xgb7.csv', index=True)


In [None]:
################

In [None]:
%%time
cat_clf = cat.CatBoostClassifier(early_stopping_rounds=50, random_state=42, verbose=0)
# Train on the full training matrix so that the prediction cell that follows
# has a fitted model to call.
cat_clf.fit(tr_X, t)

In [None]:
# CatBoost class probabilities for the test rows, in the sample-submission layout.
# Requires cat_clf to have been fitted.
y_pred = cat_clf.predict_proba(te_X)
submission = pd.DataFrame(data=y_pred, columns=sub.columns, index=sub.index)
submission.to_csv('./sub/pre/cat2.csv', index=True)


In [None]:
# Soft-voting ensemble of LightGBM, XGBoost and a random forest.
# lgb_clf is the estimator created in the LightGBM cell; xgb_clf is rebuilt here
# (unfitted). No early stopping is configured on either model because none of
# the fit calls in this notebook supplies an evaluation set.
xgb_clf = xgb.XGBClassifier(
            **params_fx,
#             n_estimators=500,
            tree_method = 'hist',
            booster = 'gbtree',
            eval_metric = 'mlogloss',
            objective = 'multi:softprob',
            num_class = 19,
    )

# rf1 is not used in this cell; rf2 is used by the stacking cell further down.
rf1 = RandomForestClassifier(n_estimators=200,
                                max_depth=13, 
                                max_features='sqrt', 
                                random_state=42)

rf2 = RandomForestClassifier(n_estimators=150,
                                max_depth=9, 
                                max_features='sqrt', 
                                random_state=42)

rf = RandomForestClassifier(max_depth=9, 
                             max_features='sqrt', 
                             random_state=42)

# NOTE(review): rebinds `pca` to a new, unfitted PCA; nothing in this cell uses it.
pca = PCA(15)

estimators = [('lgb', lgb_clf), ('xgb', xgb_clf), ('rf', rf)]
# class probabilities are averaged with weights 0.6 / 0.3 / 0.1
vclf = VotingClassifier(estimators,
                       voting='soft',
                       weights = [0.6, 0.3, 0.1],
                       n_jobs=-1)

In [None]:
# Fit the voting ensemble on the full training matrix.
vclf.fit(tr_X, t)

In [None]:
# Voting-ensemble class probabilities for the test rows, in the sample-submission layout.
y_pred = vclf.predict_proba(te_X)
# print(y_pred)
submission = pd.DataFrame(data=y_pred, columns=sub.columns, index=sub.index)
submission.to_csv('./sub/pre/vclf4.csv', index=True)


In [None]:
# In-sample accuracy of the voting ensemble (measured on the rows it was
# trained on, so it is optimistic).
res = vclf.predict(tr_X)
accuracy_score(t, res)

In [None]:
# Two-level layout passed to StackNetClassifier below: the first inner list holds
# lgb_clf and xgb_clf, the second holds rf2.
# NOTE: rebinds `models`, previously the list of baseline classifiers.
models = [[lgb_clf, xgb_clf], 
          [rf2]]

In [None]:
# Build the stacked model from `models` and fit it on the full training matrix.
model = StackNetClassifier(models, 
                           metric="logloss", 
                           folds=4,
                           restacking=False,
                           use_retraining=True,
                           use_proba=True, # To use predict_proba after training
                           random_state=42,
                           n_jobs=-1, 
                           verbose=1)

model.fit(tr_X, t)

In [None]:
# StackNet class probabilities for the test rows, in the sample-submission layout.
y_pred = model.predict_proba(te_X)
submission = pd.DataFrame(data=y_pred, columns=sub.columns, index=sub.index)
# The test rows have no labels, so the reported log loss is computed on the
# training rows (in-sample, hence optimistic).
print(log_loss(t, model.predict_proba(tr_X)))
submission.to_csv('./sub/pre/stk15.csv', index=True)


In [None]:
# 4-fold cross-validated negative log loss and wall-clock time for each model.
# NOTE: the loop variable rebinds the global name `m`.
md = [lgb_clf, xgb_clf, cat_clf, model]
for m in md:
    s = time()
    print(np.mean(cross_val_score(m, tr_X, t,  scoring='neg_log_loss', cv = 4 )))
    print(time() - s)

In [None]:
# stacking using boosting and then NN

In [None]:
# Outputs of the fitted stack for train and test rows.
# presumably one array per stacking level -- confirm against pystacknet's predict_up_to
k1 = model.predict_up_to(tr_X)
k2 = model.predict_up_to(te_X)

In [None]:
# Use element 0 of each result as the input features for the neural network below.
new_tr_X = k1[0]
new_te_X = k2[0]
print(new_tr_X.shape) 

In [None]:
# Cache the stacked test features. The context manager closes the file even if
# pickling raises.
with open('new_te_X.bin', 'wb') as f:
    pickle.dump(new_te_X, f)


In [None]:
# Early-stopping callback: patience of 10 epochs, best weights restored on stop.
# No `monitor` is given, so the Keras default metric is watched.
es = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

In [None]:
# Two-headed network on the stacked features: a shared trunk of three
# Dense + LeakyReLU(0.2) layers feeding a linear reconstruction head and a
# 19-way softmax head.
inputs1 = Input(shape = (new_tr_X.shape[1],))

x = inputs1
for width in (128, 64, 64):
    x = Dense(width)(x)
    x = LeakyReLU(0.2)(x)

outputs1 = Dense(new_tr_X.shape[1])(x)          # reconstruction of the input
outputs2 = Dense(19, activation='softmax')(x)   # class probabilities

q = Model(inputs1, [outputs1, outputs2])

# one loss per head, in output order
q.compile(optimizer = 'adam', loss=['mse', 'categorical_crossentropy'])

In [None]:
# `es` watches a validation metric, so a validation split is required for it to
# have any effect; 15% of the rows are held out, as in the nn.fit cell below.
q.fit(new_tr_X, [new_tr_X,target_wide], batch_size= 1024*16, epochs=200, shuffle=True,
      validation_split=0.15, callbacks=[es])

In [None]:
# Softmax output (second head) for the first test row.
q.predict(new_te_X)[1][0]

In [None]:
# Log loss of the softmax head on the training rows.
metrics.log_loss(target_wide, q.predict(new_tr_X)[1])

In [None]:
# Softmax-head probabilities for the test rows, in the sample-submission layout.
y_pred = q.predict(new_te_X)[1]
# print(y_pred)
submission = pd.DataFrame(data=y_pred, columns=sub.columns, index=sub.index)
submission.to_csv('./sub/pre/stk_nn.csv', index=True)


In [None]:
# crossentropy_AE
# Cluster the softmax outputs for the training rows into 19 groups and compare
# the clusters with the true labels: NMI, AMI, ARI and matched accuracy.
ypred = KMeans(random_state=42, n_clusters=19).fit_predict(q.predict(new_tr_X)[1])
print(  metrics.normalized_mutual_info_score(target, ypred),
        metrics.adjusted_mutual_info_score(target, ypred),
        metrics.adjusted_rand_score(target, ypred),
        acc(target, ypred)
     )

In [None]:
# DEC
# 2-D t-SNE embedding of the softmax outputs for the test rows, coloured by a
# 19-cluster KMeans labelling of the same outputs.
ypred = q.predict(new_te_X)[1]
tsne = TSNE(random_state=42, perplexity=100).fit_transform(ypred)
labels = KMeans(random_state=42, n_clusters=19).fit_predict(ypred)
xs = tsne[:,0]
ys = tsne[:,1]
plt.scatter(xs,ys,c=labels)
plt.title('t-SNE of predicted class probabilities (test rows)')
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.show()

In [None]:
def make_AE():
    """Build the label-conditioned auto-encoder and its encoder view.

    The network takes two inputs -- a feature row (width ``tr_X.shape[1]``) and
    a one-hot label (width ``target_wide.shape[1]``) -- squeezes them through a
    19-unit linear bottleneck and produces two outputs: a linear
    reconstruction of the features and a softmax over the label classes.

    The Keras session is cleared first, which discards previously built models.

    Returns
    -------
    (m, cl)
        ``m`` is the full two-output model; ``cl`` shares its layers and
        outputs the 19-unit bottleneck activations.
    """
    K.clear_session()
    inputs1 = Input(shape=(tr_X.shape[1], ))
    inputs2 = Input(shape=(target_wide.shape[1], ))
    # only the label input gets its own projection; features enter unchanged
    x2 = Dense(64)(inputs2)

    x = keras.layers.Concatenate()([inputs1, x2])

    x = Dense(32)(x)
    x = LeakyReLU(0.3)(x)

    # bottleneck
    code = Dense(19)(x)

    x = Dense(32)(code)
    x = LeakyReLU(0.3)(x)

    x = Dense(64)(x)
    x = LeakyReLU(0.3)(x)

    outputs1 = Dense(tr_X.shape[1])(x)
    outputs2 = Dense(target_wide.shape[1], activation='softmax')(x)

    m = Model([inputs1, inputs2], [outputs1, outputs2])
    cl = Model([inputs1, inputs2], code)
    return m, cl

In [None]:
# NOTE: rebinds the global names `m` and `cl`.
m, cl = make_AE()

# one loss per output; Adam with learning rate 2e-4 and beta_1 = 0.5
m.compile(loss=['mse', 'categorical_crossentropy'], optimizer=Adam(2e-4,0.5))


In [None]:
# np.asarray accepts tr_X whether it is still a DataFrame or already an ndarray.
history = m.fit([np.asarray(tr_X), target_wide], [np.asarray(tr_X), target_wide],
     epochs=100,
     batch_size=1024*128
     )

In [None]:
# crossentropy_AE
# Cluster the bottleneck activations of the training rows and compare the
# clusters with the true labels. The activations are computed once and reused
# for both fitting and assignment.
codes = cl.predict([np.asarray(tr_X), target_wide])
km = KMeans(random_state=42, n_clusters=19).fit(codes)
ypred = km.predict(codes)
print(  metrics.normalized_mutual_info_score(target, ypred),
        metrics.adjusted_mutual_info_score(target, ypred),
        metrics.adjusted_rand_score(target, ypred),
        acc(target, ypred)
     )

In [None]:
# One-hot encode the cluster ids of the training rows.
tr_pred_wide = to_categorical(ypred)

# NOTE(review): km.predict() is called without any data, which raises a
# TypeError. The encoder `cl` that produced the clustered features takes the
# one-hot label as an input, and the test rows have no labels -- decide what
# should be passed here.
te_pred = km.predict()
# NOTE(review): repeats the first assignment; presumably a test-side one-hot
# built from te_pred was intended -- confirm.
tr_pred_wide = to_categorical(ypred)

In [None]:
# Three-input classifier: engineered features, one-hot cluster id, one-hot fiber id.
inputs_shape = [tr_X.shape, tr_pred_wide.shape, tr_fiber.shape]

inputs1 = Input(shape = (inputs_shape[0][1], ))
inputs2 = Input(shape = (inputs_shape[1][1], ))
inputs3 = Input(shape = (inputs_shape[2][1], ))

######### everything from here on still needs to be tested
x2 = Dense(64)(inputs2)
x3 = Dense(64)(inputs3)

x = keras.layers.Concatenate()([inputs1, x2, x3])

x = Dense(64)(x)
x = Dense(32)(x)

outputs = Dense(19, activation='softmax')(x)

# all three inputs feed the graph, so all three must be declared on the model
nn = Model([inputs1, inputs2, inputs3], outputs)

nn.compile(optimizer = 'adam',
          loss = 'sparse_categorical_crossentropy',
          metrics = ['acc'])

In [None]:
# Two-input classifier: feature matrix plus one-hot fiber id.
# Clearing the session discards previously built Keras models.
K.clear_session()
inputs_shape = [train_X.shape, tr_fiber.shape]

inputs1 = Input(shape = (inputs_shape[0][1], ))
inputs2 = Input(shape = (inputs_shape[1][1], ))

# only the fiber one-hot gets its own projection; features enter unchanged
x2 = Dense(64)(inputs2)

x = keras.layers.Concatenate()([inputs1, x2])

x = Dense(64)(x)
x = Dense(32)(x)

outputs = Dense(19, activation='softmax')(x)

nn = Model([inputs1, inputs2], outputs)

# integer class ids are used as targets, hence the sparse loss
nn.compile(optimizer = 'adam',
          loss = 'sparse_categorical_crossentropy',
          metrics = ['acc'])

In [None]:
# Train on the full engineered matrix tr_X: it has one row per entry of
# tr_fiber and t, whereas train_X holds only the 70% hold-out split by the time
# this cell runs. np.asarray works for both DataFrames and ndarrays; the fiber
# dummies are cast to float for Keras.
nn.fit([np.asarray(tr_X), np.asarray(tr_fiber, dtype='float32')], np.asarray(t),
      epochs=500,
      validation_split=0.15,
      callbacks=[es],
      batch_size=1024*16
      )

In [None]:
# Display the test feature matrix.
te_X

In [None]:
# Predicted class probabilities for the first test row. te_X is the real test
# matrix (row-aligned with te_fiber); test_X is the hold-out part of the
# training data at this point.
nn.predict([np.asarray(te_X), np.asarray(te_fiber, dtype='float32')])[0]

In [None]:
# First row of element 1 of the stacked-model output for the test rows.
k2[1][0]

In [None]:
# Predict on the real test matrix te_X (row-aligned with te_fiber and with the
# submission index); test_X is the hold-out part of the training data here.
y_pred = nn.predict([np.asarray(te_X), np.asarray(te_fiber, dtype='float32')])
# print(y_pred)
submission = pd.DataFrame(data=y_pred, columns=sub.columns, index=sub.index)
submission.to_csv('./sub/pre/adv_nn1.csv', index=True)


In [None]:
# feature imps

In [None]:
lgb_imp = lgb_clf.feature_importances_

# positions of the features with a positive importance
lgb_imp_idx = [i for i, imp in enumerate(lgb_imp) if imp > 0]

# print the positions of the remaining features
# cols = tr_X.columns
for i, imp in enumerate(lgb_imp):
    if not (imp > 0):
        print(i)

# new_cols_imp = cols[lgb_imp_idx]
# new_cols_imp = list(new_cols_imp)
# print(lgb_imp_idx)

In [None]:
# total number of features vs. number with a positive LightGBM importance
# (tr_X is a plain array here, so features are counted by position, not by name)
print(np.shape(tr_X)[1], len(lgb_imp_idx))

In [None]:
# NOTE(review): `new_cols_imp` is only assigned in commented-out code in the
# feature-importance cell, so it must be defined before this cell can run.
# The context manager closes the file even if pickling raises.
with open('./new_cols.bin', 'wb') as f:
    pickle.dump(new_cols_imp, f)

In [None]:
# Display the raw LightGBM feature importances.
lgb_clf.feature_importances_

In [None]:
# Show the first row of the training feature matrix.
tr_X[0]