In [0]:
import sys
import logging
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedShuffleSplit

from xgboost import XGBClassifier
from catboost import Pool, CatBoostClassifier, cv


In [0]:
def elapsed_time(start_time, end_time):
    """Format the span between two timestamps as ``H:MM:SS``.

    Args:
        start_time: Start of the interval, in seconds.
        end_time: End of the interval, in seconds.

    Returns:
        str: Hours (unpadded), then minutes and seconds padded to two digits.
        Fractional parts are truncated by ``int``.
    """
    seconds_per_hour = 60 * 60
    duration = end_time - start_time

    hours = int(duration / seconds_per_hour)
    minutes = int((duration % seconds_per_hour) / 60)
    seconds = int(duration % 60)
    return "{}:{:>02}:{:>02}".format(hours, minutes, seconds)


def standarize_feature(train_df, test_df, cols):
    """Standardize the given columns of both frames in place.

    For every column a ``StandardScaler`` is fitted on the training values
    only and then applied to the test values, so no test statistics leak into
    the scaling.

    Args:
        train_df: Training DataFrame; the listed columns are overwritten.
        test_df: Test DataFrame; the listed columns are overwritten.
        cols: Iterable of column names present in both frames.

    Returns:
        None. Both frames are modified in place.
    """
    for col in cols:
        scaler = StandardScaler()
        # ``Series.reshape`` is not available in current pandas, so reshape the
        # underlying ndarray into the (n_samples, 1) layout the scaler expects.
        train_values = train_df[col].values.astype(np.float32).reshape(-1, 1)
        test_values = test_df[col].values.astype(np.float32).reshape(-1, 1)
        # Flatten back to 1-D so each result is stored as an ordinary column.
        train_df[col] = scaler.fit_transform(train_values).ravel()
        test_df[col] = scaler.transform(test_values).ravel()
    return None


def extend_bounds(bins):
    """Widen a sequence of bin edges by one unit on each side, in place.

    Args:
        bins: Mutable, indexable sequence of edges. Its first element is
            decreased by 1 and its last element is increased by 1.

    Returns:
        None. ``bins`` is modified in place.
    """
    # Lower edge first, then upper edge: for a single-element sequence the two
    # updates hit the same slot and cancel out.
    bins[0] -= 1
    bins[-1] += 1

In [0]:
def load_data(train_data_path='UCL course/Applied ML/otto/input_data/train.csv', test_data_path = 'UCL course/Applied ML/otto/input_data/test.csv'):
    """Read the train and test CSV files and encode the training labels.

    Both files are read comma-separated with the first row as header and the
    first column as index.

    Args:
        train_data_path: Path to the training CSV; must contain a string
            ``target`` column whose last character is a digit.
        test_data_path: Path to the test CSV.

    Returns:
        tuple: ``(train_df, test_df)``, where ``train_df['target']`` holds
        zero-based integers (last character of the label, minus one).
    """
    csv_options = dict(sep=',', index_col=0, header=0)
    train_df = pd.read_csv(train_data_path, **csv_options)
    test_df = pd.read_csv(test_data_path, **csv_options)

    # Only the final character of each label is used as the class number.
    class_digit = train_df['target'].str.get(-1)
    train_df['target'] = class_digit.astype(int) - 1

    return train_df, test_df

In [0]:
def _scale_with_train_stats(X, X_submission):
    """Standardize both matrices with a ``StandardScaler`` fitted on ``X`` only."""
    scaler = StandardScaler().fit(X)
    return scaler.transform(X), scaler.transform(X_submission)


def _tsne_embed_jointly(X, X_submission, n_components=3):
    """Embed train and submission rows in one t-SNE run and split them back.

    scikit-learn's ``TSNE`` offers ``fit_transform`` but no ``transform`` for
    unseen rows, so both sets have to be embedded together.
    """
    n_train = X.shape[0]
    embedded = TSNE(n_components=n_components).fit_transform(np.vstack((X, X_submission)))
    return embedded[:n_train], embedded[n_train:]


def process_data(train_df, test_df, ylabel='target', standarization=False, transform=None):
    """Turn the raw frames into model-ready feature and label arrays.

    Args:
        train_df: Training DataFrame containing the feature columns and the
            label column ``ylabel``.
        test_df: Submission DataFrame containing only the feature columns.
        ylabel: Name of the label column in ``train_df``.
        standarization: If True, standardize the features with statistics
            fitted on the training rows before any ``transform`` is applied.
        transform: Optional feature transform:
            ``'log'``   - ``log(x + 1)`` followed by standardization;
            ``'sqrt'``  - ``sqrt(x + 3/8)``;
            ``'pca'``   - replace the features by 3 PCA components;
            ``'pca+'``  - append 3 PCA components to the features;
            ``'tsne'``  - replace the features by a 3-D t-SNE embedding;
            ``'tsne+'`` - append a 3-D t-SNE embedding to the features.
            Any other value leaves the features unchanged.

    Returns:
        tuple: ``(X, y, X_submission)`` as NumPy arrays.

    Note:
        ``'log'`` and ``'sqrt'`` are applied after the optional
        standardization; standardized features can be negative, in which case
        these transforms yield NaN for values below -1 or -3/8 respectively.
    """
    X = train_df.drop(ylabel, axis=1).values
    y = train_df[ylabel].values
    X_submission = test_df.values

    if standarization:
        X, X_submission = _scale_with_train_stats(X, X_submission)

    if transform == 'log':
        X, X_submission = _scale_with_train_stats(np.log(X + 1), np.log(X_submission + 1))
    elif transform == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif transform in ('pca', 'pca+'):
        pca = PCA(n_components=3).fit(X)
        X_pca, submission_pca = pca.transform(X), pca.transform(X_submission)
        if transform == 'pca':
            X, X_submission = X_pca, submission_pca
        else:
            # Components of the submission rows go next to the submission
            # features (the training matrix must not be reused here).
            X = np.hstack((X, X_pca))
            X_submission = np.hstack((X_submission, submission_pca))
    elif transform in ('tsne', 'tsne+'):
        X_tsne, submission_tsne = _tsne_embed_jointly(X, X_submission)
        if transform == 'tsne':
            X, X_submission = X_tsne, submission_tsne
        else:
            X = np.hstack((X, X_tsne))
            X_submission = np.hstack((X_submission, submission_tsne))
    return X, y, X_submission

In [0]:
def evaluate(y, y_pred):
    """Score predictions with ``log_loss``.

    Args:
        y: True labels.
        y_pred: Predicted values, passed to ``log_loss`` unchanged.

    Returns:
        Whatever ``log_loss(y, y_pred)`` returns.
    """
    return log_loss(y, y_pred)

In [0]:
def models_Split_train(models, X, y, X_submission, n_classes, n_folds=5):
    """Fit each model on one stratified 80/20 hold-out split and print its log loss.

    Args:
        models: Non-empty iterable of estimators exposing ``fit(X, y)`` and
            ``predict_proba(X)``.
        X: Feature array, indexable by integer index arrays.
        y: Label array aligned with ``X``.
        X_submission: Unused by this function.
        n_classes: Unused by this function.
        n_folds: Unused by this function.

    Returns:
        The validation log loss of the last model in ``models``.

    Raises:
        ValueError: If ``models`` is empty (there would be no loss to return).
    """
    models = list(models)
    if not models:
        raise ValueError("models must contain at least one estimator")

    # n_splits=1: the splitter yields exactly one (train, validation) pair.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
    train_index, test_index = next(sss.split(X, y))
    X_train, X_val = X[train_index], X[test_index]
    y_train, y_val = y[train_index], y[test_index]

    logloss = None
    for i, model in enumerate(models):
        print ("Model %d:" % i, model)
        model.fit(X_train, y_train)
        y_test_pred = model.predict_proba(X_val)
        logloss = evaluate(y_val, y_test_pred)
        print ("  logloss: %f" % logloss)
    return logloss

In [0]:
def models_CV_train(models, X, y, X_submission, n_classes, n_folds=5):
    """Cross-validate each model and collect its stacked class probabilities.

    Every model is fitted on each fold's training rows, wrapped in a ``Pool``.
    Its probabilities for the held-out rows fill the out-of-fold training
    matrix, and its probabilities for ``X_submission`` are averaged over the
    folds.

    Args:
        models: Sequence of estimators whose ``fit`` accepts a ``Pool`` and
            which expose ``predict_proba``.
        X: Training feature array, indexable by integer index arrays.
        y: Training label array aligned with ``X``.
        X_submission: Feature array to predict for the submission.
        n_classes: Number of probability columns each model returns.
        n_folds: Number of stratified folds (shuffled, ``random_state=0``).

    Returns:
        tuple: ``(train_stack, test_stack, summary)``. The two stacks have
        shapes ``(len(X), len(models) * n_classes)`` and
        ``(len(X_submission), len(models) * n_classes)``, with the columns
        grouped by model. ``summary`` maps each model index to its log loss
        summed over the folds and divided by ``n_folds``.
    """
    n_train = X.shape[0]
    n_submission = X_submission.shape[0]
    n_models = len(models)

    folds = list(StratifiedKFold(n_folds, shuffle=True, random_state=0).split(X, y))

    summary = {}
    stack_train = np.zeros((n_train, n_classes, n_models))
    stack_test = np.zeros((n_submission, n_classes, n_models))

    for model_no, model in enumerate(models):
        print ("Model %d:" % model_no, model)

        fold_losses = []
        submission_by_fold = np.zeros((n_submission, n_classes, len(folds)))

        for fold_no, (fit_rows, held_rows) in enumerate(folds):
            print ("  Fold %d" % fold_no)
            y_held = y[held_rows]

            fit_pool = Pool(data=X[fit_rows], label=y[fit_rows])
            held_pool = Pool(data=X[held_rows], label=y_held)

            model.fit(fit_pool)

            # Out-of-fold predictions: each training row is predicted by the
            # fit that did not see it.
            held_proba = model.predict_proba(held_pool)
            stack_train[held_rows, :, model_no] = held_proba

            fold_loss = evaluate(y_held, held_proba)
            fold_losses.append(fold_loss)
            print ("  logloss: %f" % fold_loss)

            submission_by_fold[:, :, fold_no] = model.predict_proba(X_submission)

        mean_loss = sum(fold_losses) / n_folds
        print ("model average logloss: %f" % mean_loss)
        summary[model_no] = mean_loss

        # Average the per-fold submission predictions of this model.
        stack_test[:, :, model_no] = submission_by_fold.mean(axis=2)

    # (rows, classes, models) -> (rows, models, classes) -> (rows, models * classes)
    flat_train = np.swapaxes(stack_train, 1, 2).reshape((n_train, -1))
    flat_test = np.swapaxes(stack_test, 1, 2).reshape((n_submission, -1))
    return flat_train, flat_test, summary

In [0]:
# start_time = time.time()

# Send timestamped DEBUG-and-above records to stdout.
# ``filemode`` was dropped: basicConfig only uses it together with ``filename``,
# so it had no effect on a stream handler.
logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s]: %(message)s ',
    datefmt='%Y-%m-%d %H:%M:%S',
    stream=sys.stdout,
)

In [9]:
# Read the CSVs from their default paths, then build standardized feature
# arrays for training (X, y) and for the submission set (X_submission).
logging.info('Load data')
train_df, test_df = load_data()
X, y, X_submission = process_data(
    train_df=train_df,
    test_df=test_df,
    standarization=True,
)


[2020-03-02 16:39:49]: Load data 
[2020-03-02 16:39:49]: NumExpr defaulting to 8 threads. 


In [0]:
# train_models_pred, test_models_pred, summary = models_Split_train(models, X, y, X_submission, n_classes=9, n_folds=5)
# Single stratified 80/20 hold-out split; with n_splits=1 the loop body runs
# once and leaves the train/validation arrays defined at cell level.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
for train_index, test_index in sss.split(X, y):
    X_train = X[train_index]
    X_val = X[test_index]
    y_train = y[train_index]
    y_val = y[test_index] 

# Candidate hyperparameter values. None of these four lists is read anywhere
# in this cell.
# NOTE(review): presumably left over from a manual parameter search - confirm
# before deleting.
iteration_list=[2700,2200,1700,1200,1000]
depth_list=[4,5,6,7,8,9,10]
learning_rate_list=[0.1,0.15,0.2,0.25,0.3]
l2_leaf_reg_list=[1,2,3,4,5]
num=1
# Collects the hold-out log loss of every fit performed below.
loss=[]

# CatBoost containers for the training and validation parts of the split.
train_dataset = Pool(data=X_train, label=y_train)
eval_dataset = Pool(data=X_val, label=y_val)

# All three loops execute exactly once (range(1), range(num) with num=1), and
# i, j, k are not used in the body, so a single fixed configuration is fitted.
for i in range(1):
  for j in range(num):
    for k in range(1):
      # model = CatBoostClassifier(iterations=200, depth=6, learning_rate=0.5, loss_function='MultiClass',verbose=False) 
      # model.fit(train_dataset)
      model = CatBoostClassifier(iterations=1300,depth=7,learning_rate=0.2,l2_leaf_reg=3,loss_function='MultiClass')
      # model.fit(train_dataset, use_best_model=True, eval_set=eval_dataset)
      model.fit(train_dataset)
      # Score the fitted model on the held-out 20%.
      y_test_pred = model.predict_proba(eval_dataset)
      logloss = evaluate(y_val, y_test_pred)
      print ("  logloss: %f" % logloss)
      loss.append(logloss)

print(loss)

In [24]:

# One CatBoost configuration, evaluated with 5-fold cross-validation; the
# returned arrays hold its stacked class probabilities for train and submission rows.
models = [
    CatBoostClassifier(
        iterations=1300,
        depth=7,
        learning_rate=0.2,
        l2_leaf_reg=3,
        loss_function='MultiClass',
        verbose=False,
    )
]
train_models_pred, test_models_pred, summary = models_CV_train(
    models,
    X,
    y,
    X_submission,
    n_classes=9,
    n_folds=5,
)

Model 0: <catboost.core.CatBoostClassifier object at 0x0000021B9C26FFC8>
  Fold 0
  logloss: 0.475385
  Fold 1
  logloss: 0.470944
  Fold 2
  logloss: 0.478704
  Fold 3
  logloss: 0.484034
  Fold 4
  logloss: 0.480354
model average logloss: 0.477884


In [0]:
# Probability columns Class_1 .. Class_9; row ids start at 1 in the CSV.
columns = ['Class_%d' % class_no for class_no in range(1, 10)]
submission_df = pd.DataFrame(test_models_pred, columns=columns)
submission_df.index += 1
submission_df.to_csv('Catboost.csv', sep=',', index_label='id')

# Raw stacked predictions, written without header or index.
np.savetxt("model34_test.csv", test_models_pred, delimiter=",")
np.savetxt("model34_train.csv", train_models_pred, delimiter=",")