In [0]:
import sys
import logging
import time
import numpy as np
import pandas as pd
import os.path

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import BaggingClassifier

from xgboost import XGBClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.regularizers import l1_l2
import pickle

In [0]:
def elapsed_time(start_time, end_time):
    """Format the span between two timestamps as ``H:MM:SS``.

    Args:
        start_time: Start timestamp in seconds (e.g. from ``time.time()``).
        end_time: End timestamp in seconds.

    Returns:
        str: Whole hours (unpadded), then minutes and seconds zero-padded to
        two digits. Fractional seconds are truncated.
    """
    seconds_per_minute = 60
    seconds_per_hour = 60 * seconds_per_minute

    duration = end_time - start_time
    hours = int(duration / seconds_per_hour)
    minutes = int((duration % seconds_per_hour) / seconds_per_minute)
    seconds = int(duration % seconds_per_minute)
    return "{}:{:>02}:{:>02}".format(hours, minutes, seconds)

In [0]:
def evaluate(y, y_pred):
    """Score predictions with ``log_loss``.

    Args:
        y: True labels, forwarded unchanged as the first ``log_loss`` argument.
        y_pred: Predictions, forwarded unchanged as the second argument.

    Returns:
        Whatever ``log_loss(y, y_pred)`` returns.
    """
    return log_loss(y, y_pred)

In [0]:
def load_data(train_data_path='data/train.csv', test_data_path = 'data/test.csv'):
    """Read the train and test CSV files and encode the training target.

    Both files are read with their first column as the index and their first
    row as the header. The training ``target`` column must hold strings that
    end in a 1-based class number (such as ``"Class_3"``); it is replaced by
    the 0-based integer class id.

    Args:
        train_data_path: Path to the training CSV (must contain ``target``).
        test_data_path: Path to the test CSV.

    Returns:
        tuple: ``(train_df, test_df)`` as pandas DataFrames.

    Raises:
        ValueError: If any target value does not end in digits.
    """
    train_df = pd.read_csv(train_data_path, sep=',', index_col=0, header=0)
    test_df = pd.read_csv(test_data_path, sep=',', index_col=0, header=0)

    # Parse the whole trailing number rather than only the last character, so
    # labels with ten or more classes ("Class_10") are decoded correctly.
    class_number = train_df['target'].str.extract(r'(\d+)$', expand=False)
    if class_number.isna().any():
        raise ValueError("every 'target' value must end with a class number")
    train_df['target'] = class_number.astype(int) - 1

    return train_df, test_df

In [0]:
def process_data(train_df, test_df, ylabel='target', standarization=False, discretization=False, transform=None):
    """Turn the train/test frames into feature matrices and a label vector.

    Args:
        train_df: Training DataFrame holding the feature columns and ``ylabel``.
        test_df: Test DataFrame holding the feature columns only.
        ylabel: Name of the label column in ``train_df``.
        standarization: If true, call ``standarize_feature`` on the feature
            columns (that helper is not defined in this function).
        discretization: If true, call ``discretize_feature`` on the feature
            columns with ``num_bins=10`` and ``how='equal_freq'`` (that helper
            is not defined in this function).
        transform: Optional feature transform applied to both matrices:

            * ``'log'``   -- ``log1p`` of every value.
            * ``'sqrt'``  -- ``sqrt(value + 3/8)``.
            * ``'pca'``   -- replace the features by 3 PCA components fitted
              on the training matrix.
            * ``'tsne'``  -- replace the features by a 3-component t-SNE
              embedding.
            * ``'pca+'`` / ``'tsne+'`` -- keep the original features and
              append the 3 embedding columns.

            Any other value (including ``None``) leaves the features unchanged.

    Returns:
        tuple: ``(X, y, X_submission)`` as NumPy arrays, where ``X`` and
        ``X_submission`` always have the same number of columns.
    """
    # Only feature columns may be scaled or binned: the label must stay
    # untouched and it does not exist in test_df.
    numerical_features = train_df.columns.drop(ylabel)

    if standarization:
        standarize_feature(train_df, test_df, numerical_features)

    if discretization:
        discretize_feature(train_df, test_df, numerical_features, num_bins=10, how='equal_freq')

    X = train_df.drop(ylabel, axis=1).values
    y = train_df[ylabel].values
    X_submission = test_df.values

    if transform == 'log':
        X = np.log1p(X)
        X_submission = np.log1p(X_submission)
    elif transform == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif transform in ('pca', 'pca+'):
        # Project both matrices BEFORE anything is overwritten, so the
        # submission rows are always derived from X_submission itself.
        pca = PCA(n_components=3).fit(X)
        X_embedded = pca.transform(X)
        X_submission_embedded = pca.transform(X_submission)
        if transform == 'pca':
            X, X_submission = X_embedded, X_submission_embedded
        else:
            X = np.hstack((X, X_embedded))
            X_submission = np.hstack((X_submission, X_submission_embedded))
    elif transform in ('tsne', 'tsne+'):
        # TSNE offers fit_transform only (no transform for unseen rows), so
        # train and submission rows are embedded together and split again.
        n_train = X.shape[0]
        embedded = TSNE(n_components=3).fit_transform(np.vstack((X, X_submission)))
        X_embedded = embedded[:n_train]
        X_submission_embedded = embedded[n_train:]
        if transform == 'tsne':
            X, X_submission = X_embedded, X_submission_embedded
        else:
            X = np.hstack((X, X_embedded))
            X_submission = np.hstack((X_submission, X_submission_embedded))
    return X, y, X_submission

In [0]:
def model_CV_train(model, X, y, X_submission, n_classes, n_folds=5):
    """Fit ``model`` under stratified K-fold CV and collect stacking predictions.

    For every fold the model is refit on the training part, its class
    probabilities on the held-out part are stored as out-of-fold predictions,
    and its probabilities on ``X_submission`` are recorded. The submission
    predictions are finally averaged over the folds.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
        X: Training feature array, indexed with the fold index arrays.
        y: Training label array aligned with ``X``.
        X_submission: Feature array to predict for the submission.
        n_classes: Number of probability columns per prediction row.
        n_folds: Number of stratified folds (shuffled with ``random_state=0``).

    Returns:
        tuple: ``(stack_train, stack_test, summary, avg_logloss)``.
        ``stack_train`` has shape ``(X.shape[0], n_classes)`` and holds the
        out-of-fold probabilities. ``stack_test`` has shape
        ``(X_submission.shape[0], n_classes)`` and holds the fold-averaged
        submission probabilities. ``summary`` and ``avg_logloss`` are both the
        fold log-loss total divided by ``n_folds``.
    """
    folds = list(StratifiedKFold(n_folds, shuffle=True, random_state=0).split(X, y))
    n_train_rows = X.shape[0]
    n_submission_rows = X_submission.shape[0]

    oof_proba = np.zeros((n_train_rows, n_classes))
    submission_proba_per_fold = np.zeros((n_submission_rows, n_classes, len(folds)))

    total_logloss = 0
    for fold_no, (fit_rows, holdout_rows) in enumerate(folds):
        print ("  Fold %d" % fold_no)

        model.fit(X[fit_rows], y[fit_rows])

        # Out-of-fold probabilities become this model's stacking features.
        holdout_proba = model.predict_proba(X[holdout_rows])
        oof_proba[holdout_rows, :] = holdout_proba

        fold_logloss = evaluate(y[holdout_rows], holdout_proba)
        total_logloss += fold_logloss
        print ("  logloss: %f" % fold_logloss)

        submission_proba_per_fold[:, :, fold_no] = model.predict_proba(X_submission)

    avg_logloss = total_logloss / n_folds
    print ("model average logloss: %f" % avg_logloss)
    summary = avg_logloss

    # Average the per-fold submission predictions into one matrix.
    stack_test = np.zeros((n_submission_rows, n_classes))
    stack_test[:, :] = submission_proba_per_fold.mean(axis=2)

    return oof_proba, stack_test, summary, avg_logloss

In [15]:
# Start the wall-clock timer reported by the later "Run complete" log line.
start_time = time.time()

# Send log records to the cell output. `filemode` is only meaningful together
# with `filename`, so it is not passed when logging to a stream.
logging.basicConfig(level=logging.DEBUG,
                    format='[%(asctime)s]: %(message)s ',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    stream=sys.stdout,
                    )
# load data
logging.info('Load data')
train_df, test_df = load_data(train_data_path='applied/otto/train.csv', test_data_path='applied/otto/test.csv')
# Use the real None sentinel (process_data's default) for "no transform"; the
# string 'None' only worked because it matched none of the transform branches.
X, y, X_submission = process_data(train_df, test_df, transform=None)

[2020-03-02 20:08:48]: Load data 


## Models 24 to 33: k-nearest neighbours, one model per `n_neighbors` value

In [16]:
# Tune parameters
parameters = {
    'n_neighbors': [2,4,8,16,32,64,128,256,512,1024]
    }

# The i-th grid value is saved as model number FIRST_MODEL_ID + i (24..33).
FIRST_MODEL_ID = 24

n_neighbors = []
mean_test_score = []
for i, k in enumerate(parameters['n_neighbors']):
    model = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    print("n_neighbors: {}".format(k))
    train_models_pred, test_models_pred, summary, avg_logloss = model_CV_train(model, X, y, X_submission, n_classes=9, n_folds=5)
    n_neighbors.append(k)
    mean_test_score.append(avg_logloss)

    # Persist the out-of-fold and fold-averaged predictions for stacking.
    model_id = FIRST_MODEL_ID + i
    np.savetxt("applied/otto/train_models_pred/model" + str(model_id) + "_train.csv", train_models_pred, delimiter=",")
    np.savetxt("applied/otto/test_models_pred/model" + str(model_id) + "_test.csv", test_models_pred, delimiter=",")

# Stamp the end time once, after the whole grid has run. start_time was set in
# the data-loading cell, so the reported duration includes loading.
end_time = time.time()
logging.info("Run complete: %s elapsed" % elapsed_time(start_time, end_time))

n_neighbors: 2
  Fold 0
  logloss: 4.936042
  Fold 1
  logloss: 5.080766
  Fold 2
  logloss: 4.969536
  Fold 3
  logloss: 5.017054
  Fold 4
  logloss: 5.020071
model average logloss: 5.004694
[2020-03-02 20:29:42]: Run complete: 0:20:54 elapsed 


In [0]:
# Reload the frames and rebuild the untransformed feature matrices.
train_df, test_df = load_data(train_data_path='applied/otto/train.csv', test_data_path='applied/otto/test.csv')
# Use the real None sentinel (process_data's default) for "no transform"; the
# string 'None' only worked because it matched none of the transform branches.
X, y, X_submission = process_data(train_df, test_df, transform=None)

In [0]:
# Model 22 features: the boolean mask adds 1 wherever X is 0, so every zero
# entry becomes 1 and all other values are unchanged.
zero_mask = (X == 0)
Model_22_X = X + zero_mask

In [70]:
model = KNeighborsClassifier(n_neighbors = 1,n_jobs=-1)
# The submission rows must go through the same zero -> 1 transform as
# Model_22_X. Otherwise the model is fitted on transformed features but
# predicts on raw ones, and the saved test predictions are inconsistent.
Model_22_X_submission = X_submission + (X_submission == 0)
train_models_pred, test_models_pred, summary, avg_logloss = model_CV_train(model, Model_22_X, y, Model_22_X_submission, n_classes=9, n_folds=5)

  Fold 0
  logloss: 9.093704
  Fold 1
  logloss: 9.242339
  Fold 2
  logloss: 9.245876
  Fold 3
  logloss: 9.335182
  Fold 4
  logloss: 9.213318
model average logloss: 9.226084


In [0]:
# Write the model 22 out-of-fold (train) and fold-averaged (test) predictions to CSV.
model22_outputs = (
    ("applied/otto/train_models_pred/model22_train.csv", train_models_pred),
    ("applied/otto/test_models_pred/model22_test.csv", test_models_pred),
)
for model22_csv_path, model22_pred in model22_outputs:
    np.savetxt(model22_csv_path, model22_pred, delimiter=",")

In [0]:
# Build log1p-transformed features. NOTE: this also rebinds X_submission to the
# log-transformed test matrix (and y to the same labels as before).
log_processed = process_data(train_df, test_df, transform='log')
X_log, y, X_submission = log_processed

In [0]:
# Model 23 features: raw values with zeros bumped to 1 (the mask adds 1 where
# X == 0), plus the log-transformed features element-wise.
zero_indicator = (X == 0)
Model_23_X = (X + zero_indicator) + X_log

In [74]:
model = KNeighborsClassifier(n_neighbors = 1,n_jobs=-1)
# Model_23_X is raw + zero-indicator + log1p(raw), but X_submission holds only
# the log-transformed test features at this point. Rebuild the submission
# matrix from the raw test frame with the same recipe, so the model predicts
# on features that match what it was fitted on.
raw_submission = test_df.values
Model_23_X_submission = raw_submission + (raw_submission == 0) + np.log1p(raw_submission)
train_models_pred, test_models_pred, summary, avg_logloss = model_CV_train(model, Model_23_X, y, Model_23_X_submission, n_classes=9, n_folds=5)

  Fold 0
  logloss: 8.426814
  Fold 1
  logloss: 8.514002
  Fold 2
  logloss: 8.330498
  Fold 3
  logloss: 8.517481
  Fold 4
  logloss: 8.347825
model average logloss: 8.427324


In [0]:
# Write the model 23 out-of-fold (train) and fold-averaged (test) predictions to CSV.
model23_outputs = (
    ("applied/otto/train_models_pred/model23_train.csv", train_models_pred),
    ("applied/otto/test_models_pred/model23_test.csv", test_models_pred),
)
for model23_csv_path, model23_pred in model23_outputs:
    np.savetxt(model23_csv_path, model23_pred, delimiter=",")