In [2]:
import sys
import logging
import time
import numpy as np
import pandas as pd
import os.path
import lightgbm as lgb


from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.regularizers import l1_l2
import pickle

Using TensorFlow backend.


In [0]:
def elapsed_time(start_time, end_time):
    elapsed_sec = end_time - start_time
    h = int(elapsed_sec / (60 * 60))
    m = int((elapsed_sec % (60 * 60)) / 60)
    s = int(elapsed_sec % 60)
    return "{}:{:>02}:{:>02}".format(h, m, s)

In [0]:
def evaluate(y, y_pred):
    logloss = log_loss(y, y_pred)
    return logloss

In [0]:
def load_data(train_data_path='data/train.csv', test_data_path = 'data/test.csv'):
    train_df = pd.read_csv(train_data_path, sep=',', index_col=0, header=0)
    test_df = pd.read_csv(test_data_path, sep=',', index_col=0, header=0)
    
    train_df['target'] = train_df['target'].str[-1].astype(int) - 1
        
    return train_df, test_df

In [0]:
def process_data(train_df, test_df, ylabel='target', standarization=False, discretization=False, transform=None):
    numerical_features = train_df.columns

    if standarization:
        standarized_features = numerical_features
        standarize_feature(train_df, test_df, standarized_features)
        
    if discretization:
        discretized_features = numerical_features
        discretize_feature(train_df, test_df, discretized_features, num_bins=10, how='equal_freq')
    
    X = train_df.drop(ylabel, axis=1).values
    y = train_df[ylabel].values
    X_submission = test_df.values

    if transform == 'log':
        X = np.log1p(X)
        X_submission = np.log1p(X_submission)
    elif transform == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif transform == 'pca':
        pca = PCA(n_components=3).fit(X)
        X = pca.transform(X)
        X_submission = pca.transform(X_submission)
    elif transform == 'tsne':
        tsne = TSNE(n_components=3).fit(X)
        X = tsne.transform(X)
        X_submission = tsne.transform(X_submission)
    elif transform == 'pca+':
        pca = PCA(n_components=3).fit(X)
        X = np.hstack((X, pca.transform(X)))
        X_submission = np.hstack((X, pca.transform(X)))
    elif transform == 'tsne+':
        tsne = TSNE(n_components=3).fit(X)
        X = np.hstack((X, tsne.transform(X)))
        X_submission = np.hstack((X_submission, tsne.transform(X_submission)))        
    return X, y, X_submission

In [0]:
def model_CV_train(model, X, y, X_submission, n_classes, n_folds=5):
    summary = {}

    skf = list(StratifiedKFold(n_folds, shuffle =True, random_state=0).split(X, y))
    
    stack_train = np.zeros((X.shape[0], n_classes))
    stack_test = np.zeros((X_submission.shape[0], n_classes))
    
#     print("Model :" model)

    avg_logloss = 0

    stack_test_model = np.zeros((X_submission.shape[0], n_classes, len(skf)))
    for j, (train_idx, test_idx) in enumerate(skf):
        # print ("  Fold %d" % j)
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_test = X[test_idx]
        y_test = y[test_idx]

        model.fit(X_train, y_train)

        y_test_pred = model.predict_proba(X_test)          
        stack_train[test_idx, :] = y_test_pred

        logloss = evaluate(y_test, y_test_pred)
        avg_logloss += logloss
        print ("  logloss: %f" % logloss)

        y_submission_pred = model.predict_proba(X_submission)           
        stack_test_model[:, :, j] = y_submission_pred
        break

    # avg_logloss = avg_logloss / n_folds
    print ("model average logloss: %f" % avg_logloss)
    summary = avg_logloss

    stack_test[:, :] = stack_test_model.mean(axis=2)

    return stack_train, stack_test, summary, avg_logloss

In [21]:
start_time = time.time()

logging.basicConfig(level=logging.DEBUG,
                    format='[%(asctime)s]: %(message)s ',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    stream=sys.stdout,
                    filemode="w"
                    )
# load data
logging.info('Load data')
train_df, test_df = load_data(train_data_path='applied/otto/train.csv', test_data_path='applied/otto/test.csv')
X, y, X_submission = process_data(train_df, test_df, transform='log')
scaler = StandardScaler()
scaler.fit(X)
X_train_scaled = scaler.transform(X)
X_submission_scaled = scaler.transform(X_submission)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

[2020-03-03 22:04:42]: Load data 


In [0]:
# n_estimators=800, min_data_in_leaf=20, min_child_samples=10, num_leaves=60: 0.496482
# n_estimators=800, min_data_in_leaf=20, min_child_samples=20, num_leaves=60: 0.496482
# n_estimators=800, min_data_in_leaf=20, min_child_samples=20, num_leaves=100: 0.478154
# n_estimators=1000, min_data_in_leaf=20, min_child_samples=20, num_leaves=100: 0.464031
# n_estimators=1200, min_data_in_leaf=20, min_child_samples=20, num_leaves=100: 0.456716
# n_estimators=1400, min_data_in_leaf=20, min_child_samples=20, num_leaves=100: 0.453015
# n_estimators=1600, min_data_in_leaf=20, min_child_samples=20, num_leaves=100: 0.451256
# n_estimators=1600, min_data_in_leaf=20, min_child_samples=20, num_leaves=200: 0.452092
# n_estimators=2000, min_data_in_leaf=20, min_child_samples=20, num_leaves=200: 0.451392
# n_estimators=1700, min_data_in_leaf=20, min_child_samples=20, num_leaves=200: 0.450898

In [0]:
# Tune parameters
parameters = {
    'n_estimators': [1700], 
    'num_leaves': [100]
    }

In [30]:

# Train model using best parameters
n_estimators= []
num_leaves=[]
mean_test_score = []
for i in range(len(parameters['n_estimators'])):
    for k in range(len(parameters['num_leaves'])):
          model = lgb.LGBMClassifier(objective='multiclass', 
                          n_jobs=-1, 
                          is_unbalance=True, 
                          num_threads=8, 
                          two_round=True,
                          bagging_fraction=0.9,
                          bagging_freq=1,
                          boosting_type='gbdt',
                          feature_fraction=0.9,
                          learning_rate=0.01,
                          min_child_samples=10,
                          min_child_weight=5,
                          min_data_in_leaf=20,
                          min_split_gain=0.0,
                          n_estimators=parameters['n_estimators'][i],
                          num_leaves=parameters['num_leaves'][k],
                          reg_alpha=0.0,
                          reg_lambda=0.0,
                          subsample=1.0)
          print("n_estimators: {}, num_leaves: {}".format(parameters['n_estimators'][i], parameters['num_leaves'][k]))
          train_models_pred, test_models_pred, summary, avg_logloss = model_CV_train(model, X_train_scaled, y, X_submission_scaled, n_classes=9, n_folds=5)
          n_estimators.append(parameters['n_estimators'][i])
          num_leaves.append(parameters['num_leaves'][k])
          mean_test_score.append(avg_logloss)
          end_time = time.time()
# logging.info("Run complete: %s elapsed" % elapsed_time(start_time, end_time))

n_estimators: 1700, num_leaves: 100
  logloss: 0.450897
model average logloss: 0.450897


In [31]:
print(train_models_pred.shape)
print( test_models_pred.shape)

(61878, 9)
(144368, 9)


In [0]:
test = pd.read_csv("applied/otto/test.csv") 
test_ID = test['id']
submit = pd.DataFrame()
submit["id"] = test_ID
for i in range(9):
  submit["Class_" + str(i+1)] = test_models_pred[:,i]
submit.to_csv("applied/otto/modelLGBM_test.csv", index = False)