In [0]:
import sys
import logging
import time
import numpy as np
import pandas as pd
import os.path



from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import MultinomialNB

from xgboost import XGBClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.regularizers import l1_l2
import pickle

Using TensorFlow backend.


In [0]:
def elapsed_time(start_time, end_time):
    """Format the span between two timestamps as ``H:MM:SS``.

    Both arguments are numbers of seconds (for example values returned by
    ``time.time()``).  Hours are not padded; minutes and seconds are padded
    to two digits.  Fractions of a second are truncated.
    """
    secs_per_minute = 60
    secs_per_hour = 60 * 60

    delta = end_time - start_time
    parts = (
        int(delta / secs_per_hour),                      # whole hours
        int((delta % secs_per_hour) / secs_per_minute),  # leftover minutes
        int(delta % secs_per_minute),                    # leftover seconds
    )
    return "{}:{:>02}:{:>02}".format(*parts)

In [0]:
def evaluate(y, y_pred):
    """Score predictions against the true labels with ``log_loss``.

    Thin wrapper: both arguments are forwarded to ``log_loss`` unchanged and
    its result is returned as-is.
    """
    return log_loss(y, y_pred)

In [0]:
def load_data(train_data_path='data/train.csv', test_data_path = 'data/test.csv'):
    """Read the train and test CSV files and encode the training label as an int.

    Both files are read comma-separated, with the first row as header and the
    first column as index.  The ``target`` column of the training frame must
    hold strings ending in a 1-based class number (presumably of the form
    ``Class_3`` -- confirm against the data); it is replaced by the
    zero-based integer class id.

    Parameters
    ----------
    train_data_path : str
        Path of the training CSV (must contain a ``target`` column).
    test_data_path : str
        Path of the test CSV.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
    """
    train_df = pd.read_csv(train_data_path, sep=',', index_col=0, header=0)
    test_df = pd.read_csv(test_data_path, sep=',', index_col=0, header=0)

    # Parse the whole trailing number rather than only the last character, so
    # labels with two or more digits (e.g. "..._10") are not mapped to the
    # id of their final digit.
    class_number = train_df['target'].str.extract(r'(\d+)$', expand=False)
    train_df['target'] = class_number.astype(int) - 1

    return train_df, test_df

In [0]:
def process_data(train_df, test_df, ylabel='target', standarization=False, discretization=False, transform=None):
    """Turn the train/test frames into feature matrices and a label vector.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; must contain the ``ylabel`` column.
    test_df : pandas.DataFrame
        Data to predict on; all of its columns are used as features.
    ylabel : str
        Name of the label column in ``train_df``.
    standarization, discretization : bool
        When true, ``standarize_feature`` / ``discretize_feature`` are called
        on both frames for every feature column.  Those helpers are not
        defined in this part of the file -- NOTE(review): confirm they accept
        a column Index and modify the frames in place.
    transform : {None, 'log', 'sqrt', 'pca', 'tsne', 'pca+', 'tsne+'}
        Optional feature transform.  ``'log'`` applies ``log1p``; ``'sqrt'``
        applies ``sqrt(x + 3/8)``; ``'pca'`` / ``'tsne'`` replace the features
        with a 3-component embedding; the ``'+'`` variants append the
        embedding to the existing features.  Any other value leaves the
        features untouched.

    Returns
    -------
    X : ndarray
        Training features.
    y : ndarray
        Training labels.
    X_submission : ndarray
        Test features, transformed the same way as ``X``.
    """
    # The label is not a feature: y is read from train_df after the helpers
    # below have run, and test_df is used here without that column.
    numerical_features = train_df.columns.drop(ylabel)

    if standarization:
        standarize_feature(train_df, test_df, numerical_features)

    if discretization:
        discretize_feature(train_df, test_df, numerical_features, num_bins=10, how='equal_freq')

    X = train_df.drop(ylabel, axis=1).values
    y = train_df[ylabel].values
    X_submission = test_df.values

    if transform == 'log':
        X = np.log1p(X)
        X_submission = np.log1p(X_submission)
    elif transform == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif transform in ('pca', 'pca+', 'tsne', 'tsne+'):
        if transform.startswith('pca'):
            # PCA is fitted on the training rows only, then applied to both sets.
            pca = PCA(n_components=3).fit(X)
            X_embedded = pca.transform(X)
            X_submission_embedded = pca.transform(X_submission)
        else:
            # TSNE offers no transform() for unseen rows, so train and test
            # rows are embedded together in one fit_transform call and the
            # result is split back by row count.
            n_train = X.shape[0]
            embedded = TSNE(n_components=3).fit_transform(np.vstack((X, X_submission)))
            X_embedded = embedded[:n_train]
            X_submission_embedded = embedded[n_train:]

        if transform.endswith('+'):
            # Augment: keep the original columns and append the embedding.
            # Each matrix is extended with its OWN embedding.
            X = np.hstack((X, X_embedded))
            X_submission = np.hstack((X_submission, X_submission_embedded))
        else:
            X = X_embedded
            X_submission = X_submission_embedded
    return X, y, X_submission

In [0]:
def model_CV_train(model, X, y, X_submission, n_classes, n_folds=5):
    """Cross-validate ``model`` and collect predictions for stacking.

    The rows of ``X`` are split into ``n_folds`` stratified, shuffled folds
    (fixed ``random_state=0``).  For each fold the model is refitted on the
    remaining rows, then asked for class probabilities on the held-out rows
    and on all of ``X_submission``.

    Returns
    -------
    stack_train : ndarray, shape (X.shape[0], n_classes)
        Held-out probabilities; every row is filled by the fold in which it
        was held out.
    stack_test : ndarray, shape (X_submission.shape[0], n_classes)
        Submission-set probabilities averaged over the folds.
    summary : float
        Same value as ``avg_logloss``.
    avg_logloss : float
        Sum of the per-fold ``evaluate`` scores divided by ``n_folds``.
    """
    splitter = StratifiedKFold(n_folds, shuffle=True, random_state=0)
    folds = list(splitter.split(X, y))

    n_submission_rows = X_submission.shape[0]
    stack_train = np.zeros((X.shape[0], n_classes))
    stack_test = np.zeros((n_submission_rows, n_classes))
    # One slab of submission predictions per fold; averaged after the loop.
    per_fold_test = np.zeros((n_submission_rows, n_classes, len(folds)))

    fold_losses = []
    for fold_no, (fit_rows, holdout_rows) in enumerate(folds):
        model.fit(X[fit_rows], y[fit_rows])

        holdout_proba = model.predict_proba(X[holdout_rows])
        stack_train[holdout_rows, :] = holdout_proba
        fold_losses.append(evaluate(y[holdout_rows], holdout_proba))

        per_fold_test[:, :, fold_no] = model.predict_proba(X_submission)

    avg_logloss = sum(fold_losses) / n_folds
    print("model average logloss: %f" % avg_logloss)
    summary = avg_logloss

    stack_test[:, :] = per_fold_test.mean(axis=2)

    return stack_train, stack_test, summary, avg_logloss

In [1]:

# Wall-clock reference used by the "Run complete" log messages further down.
start_time = time.time()

# Timestamped log records go to stdout.  NOTE(review): `filemode` applies to
# file-based logging; with only a stream configured it should have no effect.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format='[%(asctime)s]: %(message)s ',
    datefmt='%Y-%m-%d %H:%M:%S',
    filemode="w",
)

# Load the raw frames, then build log1p-transformed feature matrices.
logging.info('Load data')
train_df, test_df = load_data(
    train_data_path='applied/otto/train.csv',
    test_data_path='applied/otto/test.csv',
)
X, y, X_submission = process_data(train_df, test_df, transform='log')

NameError: ignored

In [0]:
# Candidate values for the `alpha` hyper-parameter.
# Grids tried earlier (kept for reference): 0.0-1.6 and 1.7-2.5 in steps of
# 0.1, 3-15, 20-90, 100-900, and 225-375 in steps of 25.
# Current grid: every integer from 300 to 349.
parameters = {'alpha': range(300, 350)}

In [0]:
# Sweep the grid: cross-validate one MultinomialNB per candidate alpha and
# record its average log loss alongside the alpha value.
alpha = []
mean_test_score = []
for alpha_value in parameters['alpha']:
    model = MultinomialNB(alpha=alpha_value,
                          fit_prior=True,
                          class_prior=None)
    print("alpha: {}".format(alpha_value))
    train_models_pred, test_models_pred, summary, avg_logloss = model_CV_train(
        model, X, y, X_submission, n_classes=9, n_folds=5)
    alpha.append(alpha_value)
    mean_test_score.append(avg_logloss)

# Take the timestamp once, after the sweep, so it is defined even when the
# grid is empty instead of depending on a value left over from another cell.
end_time = time.time()
logging.info("Run complete: %s elapsed" % elapsed_time(start_time, end_time))

alpha: 300
model average logloss: 1.208887
alpha: 301
model average logloss: 1.208862
alpha: 302
model average logloss: 1.208837
alpha: 303
model average logloss: 1.208814
alpha: 304
model average logloss: 1.208792
alpha: 305
model average logloss: 1.208770
alpha: 306
model average logloss: 1.208750
alpha: 307
model average logloss: 1.208731
alpha: 308
model average logloss: 1.208713
alpha: 309
model average logloss: 1.208695
alpha: 310
model average logloss: 1.208679
alpha: 311
model average logloss: 1.208664
alpha: 312
model average logloss: 1.208649
alpha: 313
model average logloss: 1.208636
alpha: 314
model average logloss: 1.208623
alpha: 315
model average logloss: 1.208612
alpha: 316
model average logloss: 1.208601
alpha: 317
model average logloss: 1.208592
alpha: 318
model average logloss: 1.208583
alpha: 319
model average logloss: 1.208575
alpha: 320
model average logloss: 1.208568
alpha: 321
model average logloss: 1.208563
alpha: 322
model average logloss: 1.208558
alpha: 323


In [0]:
# Results file shared between runs of the alpha sweep.
model_7_path = "applied/otto/model_7.csv"

# If an earlier run left results behind, append them to the current lists:
# first column -> alpha, second column -> mean_test_score.
# NOTE(review): nothing here removes duplicates, so re-running this cell (or
# re-sweeping alphas already stored in the file) adds repeated rows.
if os.path.exists(model_7_path):
    previous_gridsearch = pd.read_csv(model_7_path, sep=',', header=0)
    for previous_row in previous_gridsearch.values:
        alpha.append(previous_row[0])
        mean_test_score.append(previous_row[1])

In [0]:
# Write every (alpha, score) pair collected so far to the results file,
# overwriting whatever it held before.
submit = pd.DataFrame({
    "alpha": alpha,
    "mean_test_score": mean_test_score,
})
submit.to_csv(model_7_path, index=False)

In [0]:
# NOTE(review): alpha=327 is hard-coded -- presumably the best value found by
# the alpha sweep; confirm against the scores stored in model_7.csv.
best_model = MultinomialNB(
    alpha=327,
    fit_prior=True,
    class_prior=None,
)

In [0]:
# Cross-validate the chosen model; the returned prediction arrays are what
# gets inspected and saved below.
train_models_pred, test_models_pred, summary, avg_logloss = model_CV_train(
    best_model,
    X,
    y,
    X_submission,
    n_classes=9,
    n_folds=5,
)

# Report total wall-clock time since `start_time` was taken.
end_time = time.time()
run_duration = elapsed_time(start_time, end_time)
logging.info("Run complete: %s elapsed" % run_duration)

model average logloss: 1.208547
[2020-02-28 23:57:06]: Run complete: 0:19:09 elapsed 


In [0]:
# Show the dimensions of the train and test prediction arrays.
for predictions in (train_models_pred, test_models_pred):
    print(predictions.shape)

(61878, 9)
(144368, 9)


In [0]:
# Persist both prediction arrays as comma-separated text files.
for out_path, predictions in (
    ("applied/otto/model7_train.csv", train_models_pred),
    ("applied/otto/model7_test.csv", test_models_pred),
):
    np.savetxt(out_path, predictions, delimiter=",")