In [1]:
import sys
import logging
import time
import numpy as np
import pandas as pd
import os.path

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import BaggingClassifier

from xgboost import XGBClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.regularizers import l1_l2
import pickle

Using TensorFlow backend.


In [0]:
def elapsed_time(start_time, end_time):
    elapsed_sec = end_time - start_time
    h = int(elapsed_sec / (60 * 60))
    m = int((elapsed_sec % (60 * 60)) / 60)
    s = int(elapsed_sec % 60)
    return "{}:{:>02}:{:>02}".format(h, m, s)

In [0]:
def evaluate(y, y_pred):
    logloss = log_loss(y, y_pred)
    return logloss

In [0]:
def load_data(train_data_path='data/train.csv', test_data_path = 'data/test.csv'):
    train_df = pd.read_csv(train_data_path, sep=',', index_col=0, header=0)
    test_df = pd.read_csv(test_data_path, sep=',', index_col=0, header=0)
    
    train_df['target'] = train_df['target'].str[-1].astype(int) - 1
        
    return train_df, test_df

In [0]:
def process_data(train_df, test_df, ylabel='target', standarization=False, discretization=False, transform=None):
    numerical_features = train_df.columns

    if standarization:
        standarized_features = numerical_features
        standarize_feature(train_df, test_df, standarized_features)
        
    if discretization:
        discretized_features = numerical_features
        discretize_feature(train_df, test_df, discretized_features, num_bins=10, how='equal_freq')
    
    X = train_df.drop(ylabel, axis=1).values
    y = train_df[ylabel].values
    X_submission = test_df.values
    
    if transform == 'log':
        X = np.log1p(X)
        X_submission = np.log1p(X_submission)
    elif transform == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif transform == 'pca':
        pca = PCA(n_components=3).fit(X)
        X = pca.transform(X)
        X_submission = pca.transform(X_submission)
    elif transform == 'tsne':
        tsne = TSNE(n_components=3).fit(X)
        X = tsne.transform(X)
        X_submission = tsne.transform(X_submission)
    elif transform == 'pca+':
        pca = PCA(n_components=3).fit(X)
        X = np.hstack((X, pca.transform(X)))
        X_submission = np.hstack((X, pca.transform(X)))
    elif transform == 'tsne+':
        tsne = TSNE(n_components=3).fit(X)
        X = np.hstack((X, tsne.transform(X)))
        X_submission = np.hstack((X_submission, tsne.transform(X_submission)))        
    return X, y, X_submission

In [0]:
def model_CV_train(model, X, y, X_submission, n_classes, n_folds=5):
    summary = {}

    skf = list(StratifiedKFold(n_folds, shuffle =True,random_state=0).split(X, y))
    
    stack_train = np.zeros((X.shape[0], n_classes))
    stack_test = np.zeros((X_submission.shape[0], n_classes))
    
#     print("Model :" model)

    avg_logloss = 0

    stack_test_model = np.zeros((X_submission.shape[0], n_classes, len(skf)))
    for j, (train_idx, test_idx) in enumerate(skf):
        # print ("  Fold %d" % j)
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_test = X[test_idx]
        y_test = y[test_idx]

        model.fit(X_train, y_train)

        y_test_pred = model.predict_proba(X_test)          
        stack_train[test_idx, :] = y_test_pred

        logloss = evaluate(y_test, y_test_pred)
        avg_logloss += logloss
        # print ("  logloss: %f" % logloss)

        y_submission_pred = model.predict_proba(X_submission)           
        stack_test_model[:, :, j] = y_submission_pred

    avg_logloss = avg_logloss / n_folds
    print ("model average logloss: %f" % avg_logloss)
    summary = avg_logloss

    stack_test[:, :] = stack_test_model.mean(axis=2)

    return stack_train, stack_test, summary, avg_logloss

In [7]:
start_time = time.time()

logging.basicConfig(level=logging.DEBUG,
                    format='[%(asctime)s]: %(message)s ',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    stream=sys.stdout,
                    filemode="w"
                    )
# load data
logging.info('Load data')
train_df, test_df = load_data(train_data_path='applied/otto/train.csv', test_data_path='applied/otto/test.csv')
X, y, X_submission = process_data(train_df, test_df, transform='log')

[2020-03-01 05:49:53]: Load data 
[2020-03-01 05:49:59]: NumExpr defaulting to 8 threads. 


In [0]:
# Tune parameters

model =  LogisticRegression(penalty='l2', random_state=42)
parameters = {
    # 'C': [5.0, 4.0, 3.0, 2.0, 1.0, 0.1, 0.15, 0.01, 0.015, 0.001]
    'C': np.linspace(3.0, 5.0, num=100)
    # 'C': [5.0]
    }
# classifier=GridSearchCV(estimator=model, param_grid=parameters, scoring='neg_log_loss', verbose=1, n_jobs=-1, cv=5);  

In [30]:
C = []
mean_test_score = []
for i in range(len(parameters['C'])):
          model = LogisticRegression(C=parameters['C'][i], 
                                penalty='l2', 
                                random_state=42
                                )
          print("C: {}".format(parameters['C'][i]))
          train_models_pred, test_models_pred, summary, avg_logloss = model_CV_train(model, X, y, X_submission, n_classes=9, n_folds=5)
          C.append(parameters['C'][i])
          mean_test_score.append(avg_logloss)
          end_time = time.time()
logging.info("Run complete: %s elapsed" % elapsed_time(start_time, end_time))

C: 3.0




model average logloss: 0.676147
C: 3.0202020202020203




model average logloss: 0.676149
C: 3.04040404040404




model average logloss: 0.676147
C: 3.0606060606060606




model average logloss: 0.676149
C: 3.080808080808081




model average logloss: 0.676146
C: 3.101010101010101




model average logloss: 0.676146
C: 3.121212121212121




model average logloss: 0.676147
C: 3.1414141414141414




model average logloss: 0.676145
C: 3.1616161616161618




model average logloss: 0.676145
C: 3.1818181818181817




model average logloss: 0.676145
C: 3.202020202020202




model average logloss: 0.676146
C: 3.2222222222222223




model average logloss: 0.676144
C: 3.242424242424242




model average logloss: 0.676143
C: 3.2626262626262625




model average logloss: 0.676146
C: 3.282828282828283




model average logloss: 0.676143
C: 3.303030303030303




model average logloss: 0.676143
C: 3.323232323232323




model average logloss: 0.676143
C: 3.3434343434343434




model average logloss: 0.676144
C: 3.3636363636363638




model average logloss: 0.676145
C: 3.383838383838384




model average logloss: 0.676142
C: 3.404040404040404




model average logloss: 0.676142
C: 3.4242424242424243




model average logloss: 0.676144
C: 3.4444444444444446




model average logloss: 0.676143
C: 3.4646464646464645




model average logloss: 0.676143
C: 3.484848484848485




model average logloss: 0.676141
C: 3.505050505050505




model average logloss: 0.676143
C: 3.525252525252525




model average logloss: 0.676142
C: 3.5454545454545454




model average logloss: 0.676142
C: 3.5656565656565657




model average logloss: 0.676143
C: 3.5858585858585856




model average logloss: 0.676140
C: 3.606060606060606




model average logloss: 0.676141
C: 3.6262626262626263




model average logloss: 0.676143
C: 3.6464646464646466




model average logloss: 0.676140
C: 3.666666666666667




model average logloss: 0.676140
C: 3.686868686868687




model average logloss: 0.676141
C: 3.707070707070707




model average logloss: 0.676140
C: 3.7272727272727275




model average logloss: 0.676140
C: 3.7474747474747474




model average logloss: 0.676140
C: 3.7676767676767677




model average logloss: 0.676139
C: 3.787878787878788




model average logloss: 0.676138
C: 3.808080808080808




model average logloss: 0.676138
C: 3.8282828282828283




model average logloss: 0.676139
C: 3.8484848484848486




model average logloss: 0.676138
C: 3.8686868686868685




model average logloss: 0.676138
C: 3.888888888888889




model average logloss: 0.676139
C: 3.909090909090909




model average logloss: 0.676139
C: 3.9292929292929295




model average logloss: 0.676138
C: 3.94949494949495




model average logloss: 0.676137
C: 3.9696969696969697




model average logloss: 0.676138
C: 3.98989898989899




model average logloss: 0.676138
C: 4.01010101010101




model average logloss: 0.676137
C: 4.03030303030303




model average logloss: 0.676139
C: 4.05050505050505




model average logloss: 0.676137
C: 4.070707070707071




model average logloss: 0.676137
C: 4.090909090909091




model average logloss: 0.676138
C: 4.111111111111111




model average logloss: 0.676137
C: 4.1313131313131315




model average logloss: 0.676137
C: 4.151515151515151




model average logloss: 0.676138
C: 4.171717171717171




model average logloss: 0.676138
C: 4.191919191919192




model average logloss: 0.676136
C: 4.212121212121212




model average logloss: 0.676138
C: 4.232323232323233




model average logloss: 0.676136
C: 4.252525252525253




model average logloss: 0.676136
C: 4.272727272727273




model average logloss: 0.676136
C: 4.292929292929293




model average logloss: 0.676138
C: 4.313131313131313




model average logloss: 0.676136
C: 4.333333333333334




model average logloss: 0.676138
C: 4.353535353535354




model average logloss: 0.676136
C: 4.373737373737374




model average logloss: 0.676136
C: 4.3939393939393945




model average logloss: 0.676136
C: 4.414141414141414




model average logloss: 0.676136
C: 4.434343434343434




model average logloss: 0.676139
C: 4.454545454545455




model average logloss: 0.676136
C: 4.474747474747475




model average logloss: 0.676137
C: 4.494949494949495




model average logloss: 0.676136
C: 4.515151515151516




model average logloss: 0.676136
C: 4.5353535353535355




model average logloss: 0.676136
C: 4.555555555555555




model average logloss: 0.676136
C: 4.575757575757576




model average logloss: 0.676136
C: 4.595959595959596




model average logloss: 0.676136
C: 4.616161616161616




model average logloss: 0.676136
C: 4.636363636363637




model average logloss: 0.676138
C: 4.656565656565657




model average logloss: 0.676136
C: 4.6767676767676765




model average logloss: 0.676136
C: 4.696969696969697




model average logloss: 0.676136
C: 4.717171717171717




model average logloss: 0.676136
C: 4.737373737373737




model average logloss: 0.676136
C: 4.757575757575758




model average logloss: 0.676136
C: 4.777777777777778




model average logloss: 0.676137
C: 4.7979797979797985




model average logloss: 0.676137
C: 4.818181818181818




model average logloss: 0.676137
C: 4.838383838383838




model average logloss: 0.676137
C: 4.858585858585859




model average logloss: 0.676137
C: 4.878787878787879




model average logloss: 0.676137
C: 4.8989898989899




model average logloss: 0.676137
C: 4.91919191919192




model average logloss: 0.676137
C: 4.9393939393939394




model average logloss: 0.676136
C: 4.95959595959596




model average logloss: 0.676136
C: 4.97979797979798




model average logloss: 0.676137
C: 5.0




model average logloss: 0.676138
[2020-03-01 08:54:50]: Run complete: 3:04:57 elapsed 


In [0]:
# classifier.fit(X, y);

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [0]:
# print(classifier.cv_results_.keys())
# print('best_params: ',classifier.best_params_)
# print('best_score: ',classifier.best_score_)
# for i in range(len(classifier.cv_results_['params'])):
#   print('{}, {}'.format(classifier.cv_results_['params'][i], classifier.cv_results_['mean_test_score'][i]))

In [0]:
model_2_path = "applied/otto/model_2.csv"
if os.path.exists(model_2_path):
    previous_gridsearch = pd.read_csv(model_2_path, sep=',', header=0)
    for i in range(previous_gridsearch.values.shape[0]):
        C.append(previous_gridsearch.values[i][0])
        mean_test_score.append(previous_gridsearch.values[i][1])

In [0]:
submit = pd.DataFrame()

# for i in range(len(classifier.cv_results_['params'])):
#     C.append(classifier.cv_results_['params'][i]['C'])
#     mean_test_score.append(classifier.cv_results_['mean_test_score'][i])
  # print(classifier.cv_results_['params'][i]['n_estimators'])
submit["C"] = C
submit["mean_test_score"] = mean_test_score
submit.to_csv("applied/otto/model_2.csv", index = False)

In [0]:
best_model = LogisticRegression(penalty='l2', 
                                C = 4.7,
                                random_state=42,
                                n_jobs = -1 ) 

In [35]:
# Train model using best parameters
train_models_pred, test_models_pred, summary, avg_logloss = model_CV_train(best_model, X, y, X_submission, n_classes=9, n_folds=5)
end_time = time.time()
logging.info("Run complete: %s elapsed" % elapsed_time(start_time, end_time))

  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


model average logloss: 0.676136
[2020-03-01 14:21:08]: Run complete: 8:31:15 elapsed 


In [0]:
np.savetxt("applied/otto/model2_train.csv",  train_models_pred , delimiter=",")
np.savetxt("applied/otto/model2_test.csv",  test_models_pred , delimiter=",")