In [0]:
import sys
import logging
import time
import numpy as np
import pandas as pd
import os.path

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import BaggingClassifier

from xgboost import XGBClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.regularizers import l1_l2
import pickle

In [0]:
def elapsed_time(start_time, end_time):
    """Format the span between two timestamps as ``H:MM:SS``.

    Args:
        start_time: Start of the interval, in seconds (e.g. ``time.time()``).
        end_time: End of the interval, in the same unit.

    Returns:
        A string whose hour field is unpadded and whose minute and second
        fields are padded to two digits. Fractions of a second are dropped
        by the ``int`` conversions.
    """
    SECONDS_PER_MINUTE = 60
    SECONDS_PER_HOUR = 60 * SECONDS_PER_MINUTE

    duration = end_time - start_time
    hours = int(duration / SECONDS_PER_HOUR)
    minutes = int((duration % SECONDS_PER_HOUR) / SECONDS_PER_MINUTE)
    seconds = int(duration % SECONDS_PER_MINUTE)
    return "{}:{:>02}:{:>02}".format(hours, minutes, seconds)

In [0]:
def evaluate(y, y_pred):
    """Return the log loss of ``y_pred`` measured against the labels ``y``.

    Both arguments are forwarded unchanged to ``log_loss``.
    """
    return log_loss(y, y_pred)

In [0]:
def load_data(train_data_path='data/train.csv', test_data_path='data/test.csv'):
    """Read the train and test CSV files and encode the training labels.

    Both files are read with their first column as the index and their first
    row as the header. The training file must contain a string ``target``
    column whose values end in a class number (e.g. ``Class_3``); that number
    is converted to a zero-based integer label (``Class_3`` -> ``2``).

    Args:
        train_data_path: Path of the training CSV.
        test_data_path: Path of the test CSV.

    Returns:
        Tuple ``(train_df, test_df)`` of DataFrames; ``train_df['target']``
        holds the zero-based integer labels.

    Raises:
        ValueError: If any target value does not end in a number.
    """
    train_df = pd.read_csv(train_data_path, sep=',', index_col=0, header=0)
    test_df = pd.read_csv(test_data_path, sep=',', index_col=0, header=0)

    # Take the whole trailing number rather than only the last character, so
    # labels with two or more digits (Class_10, ...) are decoded correctly.
    class_number = train_df['target'].str.extract(r'(\d+)$', expand=False)
    if class_number.isna().any():
        raise ValueError("every 'target' value must end with a class number")
    train_df['target'] = class_number.astype(int) - 1

    return train_df, test_df

In [0]:
def _tsne_embed_jointly(X, X_submission, n_components=3):
    """Embed train and submission rows with a single t-SNE fit and split them back.

    scikit-learn's TSNE cannot project unseen rows (it offers ``fit_transform``
    but no ``transform``), so both matrices are stacked, embedded together and
    separated again by row count.
    """
    n_train = X.shape[0]
    embedded = TSNE(n_components=n_components).fit_transform(np.vstack((X, X_submission)))
    return embedded[:n_train], embedded[n_train:]


def process_data(train_df, test_df, ylabel='target', standarization=False, discretization=False, transform=None):
    """Convert the train/test frames into arrays, optionally transforming the features.

    Args:
        train_df: Training DataFrame containing the feature columns and ``ylabel``.
        test_df: Test DataFrame containing the feature columns only.
        ylabel: Name of the label column in ``train_df``.
        standarization: If true, call ``standarize_feature`` on the feature columns.
        discretization: If true, call ``discretize_feature`` on the feature columns
            (10 bins, ``how='equal_freq'``).
        transform: Optional feature transform:
            ``'log'``   -> ``log1p`` of every value;
            ``'sqrt'``  -> ``sqrt(x + 3/8)``;
            ``'pca'``   -> replace the features by 3 PCA components fitted on the train rows;
            ``'tsne'``  -> replace the features by a 3-dimensional t-SNE embedding;
            ``'pca+'`` / ``'tsne+'`` -> append those 3 columns to the original features.
            Any other value leaves the features untouched.

    Returns:
        Tuple ``(X, y, X_submission)``: training features, training labels and
        test features as NumPy arrays.

    Notes:
        ``standarize_feature`` and ``discretize_feature`` are not defined in this
        part of the notebook; their return values are ignored here, so they are
        presumably expected to modify the frames in place -- TODO confirm.
        The t-SNE variants embed train and test rows together, because t-SNE
        cannot transform rows it was not fitted on.
    """
    # The label is not a feature: keep it out of the preprocessing helpers
    # (it is also absent from test_df).
    feature_columns = train_df.columns[train_df.columns != ylabel]

    if standarization:
        standarize_feature(train_df, test_df, feature_columns)

    if discretization:
        discretize_feature(train_df, test_df, feature_columns, num_bins=10, how='equal_freq')

    X = train_df.drop(ylabel, axis=1).values
    y = train_df[ylabel].values
    X_submission = test_df.values

    if transform == 'log':
        X = np.log1p(X)
        X_submission = np.log1p(X_submission)
    elif transform == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif transform in ('pca', 'pca+', 'tsne', 'tsne+'):
        # Compute the low-dimensional columns from the *raw* matrices first,
        # then decide whether they replace or extend the original features.
        if transform.startswith('pca'):
            pca = PCA(n_components=3).fit(X)
            X_low = pca.transform(X)
            X_submission_low = pca.transform(X_submission)
        else:
            X_low, X_submission_low = _tsne_embed_jointly(X, X_submission, n_components=3)

        if transform.endswith('+'):
            X = np.hstack((X, X_low))
            X_submission = np.hstack((X_submission, X_submission_low))
        else:
            X = X_low
            X_submission = X_submission_low
    return X, y, X_submission

In [0]:
def model_CV_train(model, X, y, X_submission, n_classes, n_folds=5):
    """Cross-validate ``model`` and collect out-of-fold and submission probabilities.

    The rows of ``X`` are split with a shuffled ``StratifiedKFold``
    (``random_state=0``). For every fold the model is refitted on the training
    part, scored with ``evaluate`` on the held-out part, and asked for
    probabilities on ``X_submission``.

    Args:
        model: Estimator exposing ``fit`` and ``predict_proba``.
        X: Training feature array.
        y: Training label array.
        X_submission: Feature array to predict for the submission.
        n_classes: Number of probability columns returned by ``predict_proba``.
        n_folds: Number of cross-validation folds.

    Returns:
        Tuple ``(stack_train, stack_test, summary, avg_logloss)``:
        out-of-fold probabilities for ``X``, the per-fold mean of the
        probabilities for ``X_submission``, and the mean fold log loss
        (returned twice, as ``summary`` and ``avg_logloss``).
    """
    splitter = StratifiedKFold(n_folds, shuffle=True, random_state=0)
    folds = list(splitter.split(X, y))

    n_submission_rows = X_submission.shape[0]
    oof_proba = np.zeros((X.shape[0], n_classes))
    submission_proba_per_fold = np.zeros((n_submission_rows, n_classes, len(folds)))

    fold_losses = []
    for fold_no, (fit_rows, holdout_rows) in enumerate(folds):
        print("  Fold %d" % fold_no)

        model.fit(X[fit_rows], y[fit_rows])

        # Probabilities for the rows this fold did not train on.
        holdout_proba = model.predict_proba(X[holdout_rows])
        oof_proba[holdout_rows, :] = holdout_proba

        fold_loss = evaluate(y[holdout_rows], holdout_proba)
        fold_losses.append(fold_loss)
        print("  logloss: %f" % fold_loss)

        submission_proba_per_fold[:, :, fold_no] = model.predict_proba(X_submission)

    mean_loss = sum(fold_losses) / n_folds
    print("model average logloss: %f" % mean_loss)

    # Average the submission probabilities over the fold axis.
    submission_proba = np.zeros((n_submission_rows, n_classes))
    submission_proba[:, :] = submission_proba_per_fold.mean(axis=2)

    return oof_proba, submission_proba, mean_loss, mean_loss

In [27]:
# Reference timestamp for the "Run complete" messages logged by later cells.
start_time = time.time()

# Write timestamped log records to stdout so they show up in the cell output.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format='[%(asctime)s]: %(message)s ',
    datefmt='%Y-%m-%d %H:%M:%S',
    filemode="w",
)

# Read the CSV files and apply the log1p feature transform.
logging.info('Load data')
train_df, test_df = load_data(
    train_data_path='applied/otto/train.csv',
    test_data_path='applied/otto/test.csv',
)
X, y, X_submission = process_data(train_df, test_df, transform='log')

# Standardize with statistics of the training rows only, then reuse them
# for the submission rows.
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_submission = scaler.transform(X_submission)

[2020-03-02 00:32:49]: Load data 


In [13]:
# Tune parameters: score each candidate n_neighbors with 5-fold CV.
# NOTE(review): the values [2, 4] and [8, 16, 32] were previously listed here
# as commented-out alternatives -- presumably evaluated in earlier runs; confirm.
parameters = {
    'n_neighbors': [112]
}

# Parallel lists: candidate value and its mean CV log loss.
n_neighbors = []
mean_test_score = []
for k in parameters['n_neighbors']:
    model = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    print("n_neighbors: {}".format(k))
    train_models_pred, test_models_pred, summary, avg_logloss = model_CV_train(
        model, X, y, X_submission, n_classes=9, n_folds=5)
    n_neighbors.append(k)
    mean_test_score.append(avg_logloss)

# Take the timestamp after the loop so it is always defined and current,
# even when the grid is empty.
end_time = time.time()
logging.info("Run complete: %s elapsed" % elapsed_time(start_time, end_time))

n_neighbors: 112
  Fold 0
  logloss: 0.695519
  Fold 1
  logloss: 0.702952
  Fold 2
  logloss: 0.716317
  Fold 3
  logloss: 0.704810
  Fold 4
  logloss: 0.697648
model average logloss: 0.703449
[2020-03-01 22:13:00]: Run complete: 1:28:04 elapsed 


In [0]:
model_4_path = "applied/otto/model_4.csv"

# Merge the results saved by earlier runs (if any) into the current lists so
# that the file written afterwards contains the full search history.
if os.path.exists(model_4_path):
    previous_gridsearch = pd.read_csv(model_4_path, sep=',', header=0)
    # Read each column on its own: indexing `.values` row by row goes through
    # one mixed-dtype array, which turns integer n_neighbors into floats.
    # First column holds n_neighbors, second the mean CV score.
    n_neighbors.extend(previous_gridsearch.iloc[:, 0].tolist())
    mean_test_score.extend(previous_gridsearch.iloc[:, 1].tolist())

In [0]:
# Persist every (n_neighbors, mean CV score) pair collected so far.
submit = pd.DataFrame({
    "n_neighbors": n_neighbors,
    "mean_test_score": mean_test_score,
})
submit.to_csv("applied/otto/model_4.csv", index=False)

In [0]:
# Final KNN configuration used for the stacking predictions.
# NOTE(review): n_neighbors=128 is hard-coded -- presumably the best value
# found in the recorded grid-search results; confirm.
best_model = KNeighborsClassifier(
    n_neighbors=128,
    n_jobs=-1,
)

In [30]:
# Cross-validate the selected model to obtain out-of-fold train predictions
# and fold-averaged test predictions.
train_models_pred, test_models_pred, summary, avg_logloss = model_CV_train(
    best_model,
    X,
    y,
    X_submission,
    n_classes=9,
    n_folds=5,
)

# Elapsed time is measured from `start_time`, set in the data-loading cell.
end_time = time.time()
logging.info("Run complete: %s elapsed" % elapsed_time(start_time, end_time))

  Fold 0
  logloss: 0.693752
  Fold 1
  logloss: 0.703128
  Fold 2
  logloss: 0.716615
  Fold 3
  logloss: 0.702740
  Fold 4
  logloss: 0.699925
model average logloss: 0.703232
[2020-03-02 01:45:53]: Run complete: 1:13:04 elapsed 


In [0]:
# Write the out-of-fold train predictions and the averaged test predictions
# as comma-separated text files.
np.savetxt(fname="applied/otto/model4_train.csv", X=train_models_pred, delimiter=",")
np.savetxt(fname="applied/otto/model4_test.csv", X=test_models_pred, delimiter=",")