In [0]:
import sys
import logging
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier

import tensorflow
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.regularizers import l1_l2

In [0]:
def elapsed_time(start_time, end_time):
    elapsed_sec = end_time - start_time
    h = int(elapsed_sec / (60 * 60))
    m = int((elapsed_sec % (60 * 60)) / 60)
    s = int(elapsed_sec % 60)
    return "{}:{:>02}:{:>02}".format(h, m, s)


def standarize_feature(train_df, test_df, cols):
    scaler = StandardScaler()
    for col in cols:
        train_df[col] = scaler.fit_transform(train_df[col].reshape(-1,1).astype(np.float32))
        test_df[col] = scaler.transform(test_df[col].reshape(-1,1).astype(np.float32))
    return None


def extend_bounds(bins):
    bins[0] = bins[0] - 1
    bins[-1] = bins[-1] + 1

In [0]:
def load_data(train_data_path='UCL course/Applied ML/otto/input_data/train.csv', test_data_path = 'UCL course/Applied ML/otto/input_data/test.csv'):
    train_df = pd.read_csv(train_data_path, sep=',', index_col=0, header=0)
    test_df = pd.read_csv(test_data_path, sep=',', index_col=0, header=0)
    
    train_df['target'] = train_df['target'].str[-1].astype(int) - 1
        
    return train_df, test_df

In [0]:
def process_data(train_df, test_df, ylabel='target', standarization=False, transform=None):
    numerical_features = train_df.columns
    
    X = train_df.drop(ylabel, axis=1).values
    y = train_df[ylabel].values
    X_submission = test_df.values

    if standarization:
        scaler = StandardScaler()
        scaler.fit(X)
        X = scaler.transform(X)
        X_submission = scaler.transform(X_submission)

    if transform == 'log':
        X = np.log(X+1)
        X_submission = np.log(X_submission+1)
        scaler = StandardScaler()
        scaler.fit(X)
        X = scaler.transform(X)
        X_submission = scaler.transform(X_submission)
    elif transform == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif transform == 'pca':
        pca = PCA(n_components=3).fit(X)
        X = pca.transform(X)
        X_submission = pca.transform(X_submission)
    elif transform == 'tsne':
        tsne = TSNE(n_components=3).fit(X)
        X = tsne.transform(X)
        X_submission = tsne.transform(X_submission)
    elif transform == 'pca+':
        pca = PCA(n_components=3).fit(X)
        X = np.hstack((X, pca.transform(X)))
        X_submission = np.hstack((X, pca.transform(X)))
    elif transform == 'tsne+':
        tsne = TSNE(n_components=3).fit(X)
        X = np.hstack((X, tsne.transform(X)))
        X_submission = np.hstack((X_submission, tsne.transform(X_submission)))        
    return X, y, X_submission

In [0]:
def evaluate(y, y_pred):
    logloss = log_loss(y, y_pred)
    return logloss

In [0]:
def models_CV_train(models, X, y, X_submission, n_classes, n_folds=5):
    summary = {}

    skf = list(StratifiedKFold(n_folds,shuffle=True, random_state=0).split(X, y))
    
    stack_train = np.zeros((X.shape[0], n_classes, len(models)))
    stack_test = np.zeros((X_submission.shape[0], n_classes, len(models)))
    
    for i, model in enumerate(models):
        print ("Model %d:" % i, model)
        
        avg_logloss = 0
        
        stack_test_model_i = np.zeros((X_submission.shape[0], n_classes, len(skf)))
        for j, (train_idx, test_idx) in enumerate(skf):
            print ("  Fold %d" % j)
            X_train = X[train_idx]
            y_train = y[train_idx]
            X_test = X[test_idx]
            y_test = y[test_idx]

            model.fit(X_train, y_train)
            
            y_test_pred = model.predict_proba(X_test)          
            stack_train[test_idx, :, i] = y_test_pred
            
            logloss = evaluate(y_test, y_test_pred)
            avg_logloss += logloss
            print ("  logloss: %f" % logloss)
            
            y_submission_pred = model.predict_proba(X_submission)           
            stack_test_model_i[:, :, j] = y_submission_pred
        
        avg_logloss = avg_logloss / n_folds
        print ("model average logloss: %f" % avg_logloss)
        summary[i] = avg_logloss
        
        stack_test[:, :, i] = stack_test_model_i.mean(axis=2)

    return np.swapaxes(stack_train, 1, 2).reshape((X.shape[0], -1)), np.swapaxes(stack_test, 1, 2).reshape((X_submission.shape[0], -1)), summary


In [0]:
def create_2_layer_keras_model(input_dim, output_dim):
    model = Sequential()
    model.add(Dropout(0.05, input_shape=(input_dim,)))
    model.add(Dense(4096, use_bias=True,bias_initializer='glorot_normal', kernel_initializer='glorot_normal', activation='relu',bias_regularizer=l1_l2(l1=1e-5, l2=1e-5), kernel_regularizer=l1_l2(l1=1e-5, l2=1e-5)))
    model.add(Dropout(0.05))
    model.add(Dense(526,use_bias=True,bias_initializer='glorot_normal',bias_regularizer=l1_l2(l1=1e-5, l2=1e-5), kernel_initializer='glorot_normal', activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-5)))
    model.add(Dropout(0.05))
    model.add(Dense(output_dim,use_bias=True,bias_initializer='glorot_normal',bias_regularizer=l1_l2(l1=1e-5, l2=1e-5), kernel_initializer='glorot_normal', activation='softmax', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-5)))
    
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', 'categorical_crossentropy'])
    return model


def create_3_layer_keras_model(input_dim, output_dim):
    model = Sequential()
    model.add(Dropout(0.05, input_shape=(input_dim,)))
    model.add(Dense(1024, kernel_initializer='glorot_normal', activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-5)))
    model.add(Dropout(0.5))    
    model.add(Dense(512, kernel_initializer='glorot_normal', activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-5)))
    model.add(Dropout(0.5))
    model.add(Dense(256, kernel_initializer='glorot_normal', activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-5)))
    model.add(Dropout(0.5))
    model.add(Dense(output_dim, kernel_initializer='glorot_normal', activation='softmax', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-5)))
    
    model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy', 'categorical_crossentropy'])
    return model

In [0]:
# start_time = time.time()

logging.basicConfig(level=logging.DEBUG,
          format='[%(asctime)s]: %(message)s ',
          datefmt='%Y-%m-%d %H:%M:%S',
          stream=sys.stdout,
          filemode="w"
          )

In [29]:
logging.info('Load data')
train_df, test_df = load_data()
X, y, X_submission = process_data(train_df, test_df, transform='sqrt')

[2020-03-01 23:18:05]: Load data 


In [30]:
# training phase 1

logging.info('Training phase 1')
models = []
train_models_pred_list=[]
test_models_pred_list=[]
for i in range(10):
  models = [KerasClassifier(build_fn=create_2_layer_keras_model, input_dim=X.shape[1], output_dim=9, nb_epoch=300, batch_size=256, verbose=0)]
  train_models_pred, test_models_pred, summary = models_CV_train(models, X, y, X_submission, n_classes=9, n_folds=5)
  train_models_pred_list.append(train_models_pred)
  test_models_pred_list.append(test_models_pred)

[2020-03-01 23:18:06]: Training phase 1 
Model 0: <keras.wrappers.scikit_learn.KerasClassifier object at 0x000001DF96768E48>
  Fold 0
  logloss: 0.642008
  Fold 1
  logloss: 0.623970
  Fold 2
  logloss: 0.634948
  Fold 3
  logloss: 0.624453
  Fold 4
  logloss: 0.645572
model average logloss: 0.634190
Model 0: <keras.wrappers.scikit_learn.KerasClassifier object at 0x000001DF8E248288>
  Fold 0
  logloss: 0.671102
  Fold 1
  logloss: 0.635295
  Fold 2
  logloss: 0.677158
  Fold 3
  logloss: 0.643713
  Fold 4
  logloss: 0.647527
model average logloss: 0.654959
Model 0: <keras.wrappers.scikit_learn.KerasClassifier object at 0x000001DF96768E48>
  Fold 0
  logloss: 0.650600
  Fold 1
  logloss: 0.631162
  Fold 2
  logloss: 0.627022
  Fold 3
  logloss: 0.636011
  Fold 4
  logloss: 0.651343
model average logloss: 0.639228
Model 0: <keras.wrappers.scikit_learn.KerasClassifier object at 0x000001DF8E248288>
  Fold 0
  logloss: 0.649855
  Fold 1


ResourceExhaustedError: ignored

In [31]:
# training phase 1

logging.info('Training phase 1')
models = []
train_models_pred_list_2=[]
test_models_pred_list_2=[]
for i in range(10):
  models = [MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes = (20, 15),max_iter=500,  verbose = False)]
  train_models_pred, test_models_pred, summary = models_CV_train(models, X, y, X_submission, n_classes=9, n_folds=5)
  train_models_pred_list_2.append(train_models_pred)
  test_models_pred_list_2.append(test_models_pred)

[2020-03-01 23:26:56]: Training phase 1 
Model 0: MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(20, 15), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)
  Fold 0
  logloss: 0.582169
  Fold 1
  logloss: 0.560028
  Fold 2
  logloss: 0.569142
  Fold 3
  logloss: 0.572968
  Fold 4
  logloss: 0.566506
model average logloss: 0.570163
Model 0: MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(20, 15), learning_rate='constant',
              learning_rat

MemoryError: ignored

In [0]:

train_models_pred_final=np.zeros((61878,9))
for i in train_models_pred_list:
  train_models_pred_final=train_models_pred_final+i
logloss_1 = evaluate(y,train_models_pred_final/len(train_models_pred_list))
print(logloss_1)

In [0]:
train_models_pred_final=np.zeros((61878,9))
test_models_pred_final=np.zeros((144368,9))
for i in range(len(train_models_pred_list_2)):
  train_models_pred_final = train_models_pred_final + train_models_pred_list_2[i]
  test_models_pred_final = test_models_pred_final + test_models_pred_list_2[i]
  logloss_2 = evaluate(y,train_models_pred_final/len(train_models_pred_list_2))
print(logloss_2)

0.5176963300575195


In [0]:
train_models_pred_final=train_models_pred_final/len(train_models_pred_list_2)
test_models_pred_final=test_models_pred_final/len(test_models_pred_list_2)
columns = ['Class_' + str(i + 1) for i in range(9)]
submission_df = pd.DataFrame(test_models_pred_final, columns=columns)
submission_df.index = submission_df.index + 1
submission_df.to_csv('NN_2_12run.csv', sep=',',index_label='id')
np.savetxt("model9_test.csv", test_models_pred_final, delimiter=",")
np.savetxt("model9_train.csv", train_models_pred_final, delimiter=",")