In [2]:
import sys
import logging
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
import itertools

from xgboost import XGBClassifier

def elapsed_time(start_time, end_time):
    """Format the span between two timestamps (in seconds) as ``H:MM:SS``.

    Fractions of a second are truncated. Hours are not zero-padded; minutes
    and seconds are padded to two digits.
    """
    seconds_per_minute = 60
    seconds_per_hour = 60 * 60

    duration = end_time - start_time
    hours = int(duration / seconds_per_hour)
    minutes = int((duration % seconds_per_hour) / seconds_per_minute)
    seconds = int(duration % seconds_per_minute)
    return f"{hours}:{minutes:>02}:{seconds:>02}"

def evaluate(y, y_pred):
    """Score predictions `y_pred` against the true labels `y` with log loss."""
    return log_loss(y, y_pred)

def load_data(train_data_path='data/train.csv', test_data_path='data/test.csv'):
    """Read the train and test CSV files and encode the training target.

    Both files are read with their first column as the index and their first
    row as the header. The last character of each ``target`` string is taken
    as a digit and shifted down by one, giving a zero-based integer label.

    Returns:
        A ``(train_df, test_df)`` pair of DataFrames.
    """
    read_options = dict(sep=',', index_col=0, header=0)
    train_df, test_df = (
        pd.read_csv(path, **read_options)
        for path in (train_data_path, test_data_path)
    )

    # Only the final character of the label is used as the class number.
    class_digit = train_df['target'].str[-1]
    train_df['target'] = class_digit.astype(int) - 1

    return train_df, test_df

def model_CV_train(model, X, y, X_submission, n_classes, n_folds=5):
    """Cross-validate `model` and collect its predictions for stacking.

    For each stratified fold the model is refit on the training part. Its
    class probabilities for the held-out part are stored as out-of-fold
    predictions, and its class probabilities for `X_submission` are stored
    per fold and averaged at the end.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
            The same object is refit on every fold.
        X: training features; rows are selected with integer index arrays.
        y: training labels, indexed the same way as `X`.
        X_submission: features of the rows to predict for submission.
        n_classes: number of probability columns `predict_proba` returns.
        n_folds: number of stratified folds.

    Returns:
        A ``(stack_train, stack_test, summary)`` tuple:
        ``stack_train`` has shape ``(X.shape[0], n_classes)`` and holds the
        out-of-fold probabilities, ``stack_test`` has shape
        ``(X_submission.shape[0], n_classes)`` and holds the fold-averaged
        submission probabilities, and ``summary`` is the mean log loss over
        the folds.
    """
    # No random_state here: it has no effect unless shuffle=True, and recent
    # scikit-learn releases raise ValueError for that combination. The folds
    # are therefore the same deterministic, unshuffled splits as before.
    folds = list(StratifiedKFold(n_splits=n_folds).split(X, y))

    stack_train = np.zeros((X.shape[0], n_classes))
    stack_test_per_fold = np.zeros((X_submission.shape[0], n_classes, len(folds)))

    total_logloss = 0
    for fold_no, (train_idx, test_idx) in enumerate(folds):
        print ("  Fold %d" % fold_no)

        model.fit(X[train_idx], y[train_idx])

        # Out-of-fold predictions: each training row is predicted exactly
        # once, by the model that did not see it.
        held_out_pred = model.predict_proba(X[test_idx])
        stack_train[test_idx, :] = held_out_pred

        logloss = evaluate(y[test_idx], held_out_pred)
        total_logloss += logloss
        print ("  logloss: %f" % logloss)

        stack_test_per_fold[:, :, fold_no] = model.predict_proba(X_submission)

    avg_logloss = total_logloss / len(folds)
    print ("model average logloss: %f" % avg_logloss)

    # Average the submission predictions over the fold axis.
    stack_test = stack_test_per_fold.mean(axis=2)

    return stack_train, stack_test, avg_logloss

def _tsne_embed(X, X_submission, n_components=3):
    """Embed train and submission rows jointly with t-SNE and split them back.

    scikit-learn's TSNE offers only ``fit_transform`` (it cannot project rows
    it was not fitted on), so both matrices are stacked, embedded together,
    and separated again by row count.
    """
    n_train = X.shape[0]
    embedded = TSNE(n_components=n_components).fit_transform(
        np.vstack((X, X_submission)))
    return embedded[:n_train], embedded[n_train:]


def process_data(X, y, X_submission, ylabel='target', transform=None):
    """Apply a named feature transform to the train and submission matrices.

    `transform` is a string of the form ``"<name>"`` or ``"<name> <k>"``.
    Recognised names:

    * ``standarization`` (``standardization`` is accepted as an alias):
      StandardScaler fitted on `X`.
    * ``log``: ``np.log1p(X + 1)``.
    * ``sqrt``: ``np.sqrt(X + 3/8)``.
    * ``pca`` / ``pca+``: 3-component PCA fitted on `X`; the ``+`` form
      appends the components to the input columns instead of replacing them.
    * ``tsne`` / ``tsne+``: 3-component t-SNE embedding computed jointly on
      both matrices; the ``+`` form appends instead of replacing.
    * ``kmeans <k>``: KMeans with ``k`` clusters fitted on `X`; both outputs
      become 1-D arrays of cluster labels.

    ``None``, an empty string, or an unrecognised name leaves the data as is.

    Args:
        X: training feature matrix.
        y: training labels; passed through untouched.
        X_submission: submission feature matrix, with the same columns as `X`.
        ylabel: unused; kept for backward compatibility.
        transform: transform specification as described above.

    Returns:
        The ``(X, y, X_submission)`` tuple after the transform.

    Raises:
        ValueError: if ``kmeans`` is requested without an integer ``k``.
    """
    parts = transform.split() if transform else []
    if not parts:
        # No transform requested (this is the default): nothing to do.
        return X, y, X_submission
    name = parts[0]

    if name in ('standarization', 'standardization'):
        scaler = StandardScaler().fit(X)
        X = scaler.transform(X)
        X_submission = scaler.transform(X_submission)
    elif name == 'log':
        # NOTE(review): log1p(X + 1) equals log(X + 2). If log(X + 1) was
        # intended this should be np.log1p(X) — confirm before changing.
        X = np.log1p(X + 1)
        X_submission = np.log1p(X_submission + 1)
    elif name == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif name in ('pca', 'pca+'):
        pca = PCA(n_components=3).fit(X)
        X_pca = pca.transform(X)
        submission_pca = pca.transform(X_submission)
        if name == 'pca':
            X, X_submission = X_pca, submission_pca
        else:
            X = np.hstack((X, X_pca))
            X_submission = np.hstack((X_submission, submission_pca))
    elif name in ('tsne', 'tsne+'):
        X_tsne, submission_tsne = _tsne_embed(X, X_submission)
        if name == 'tsne':
            X, X_submission = X_tsne, submission_tsne
        else:
            X = np.hstack((X, X_tsne))
            X_submission = np.hstack((X_submission, submission_tsne))
    elif name == 'kmeans':
        if len(parts) < 2:
            raise ValueError(
                "transform 'kmeans' needs a cluster count, e.g. 'kmeans 8'")
        k = int(parts[1])
        kmeans = KMeans(n_clusters=k).fit(X)
        X = kmeans.labels_
        X_submission = kmeans.predict(X_submission)

    return X, y, X_submission

def _add_engineered_features(X, X_submission,
                             cluster_counts=(6, 7, 8, 9, 10, 11, 12)):
    """Append cluster-label and row-count columns to both feature matrices.

    Columns are appended in this order: one KMeans label column per entry of
    `cluster_counts`, the per-row count of zero entries, then the per-row
    counts of standardized entries above 0.5 and below -0.5. Every derived
    column is computed from the original columns only.
    """
    train_blocks = [X]
    submission_blocks = [X_submission]

    for k in cluster_counts:
        labels, _, submission_labels = process_data(
            np.copy(X), None, np.copy(X_submission),
            transform='kmeans ' + str(k))
        train_blocks.append(labels.reshape(-1, 1))
        submission_blocks.append(submission_labels.reshape(-1, 1))

    # rowSums(X == 0)
    train_blocks.append((X == 0).sum(1).reshape(-1, 1))
    submission_blocks.append((X_submission == 0).sum(1).reshape(-1, 1))

    scaled, _, scaled_submission = process_data(
        np.copy(X), None, np.copy(X_submission), transform='standarization')
    # rowSums(Scale(X) > 0.5)
    train_blocks.append((scaled > 0.5).sum(1).reshape(-1, 1))
    submission_blocks.append((scaled_submission > 0.5).sum(1).reshape(-1, 1))
    # rowSums(Scale(X) < -0.5)
    train_blocks.append((scaled < -0.5).sum(1).reshape(-1, 1))
    submission_blocks.append((scaled_submission < -0.5).sum(1).reshape(-1, 1))

    return np.hstack(train_blocks), np.hstack(submission_blocks)


def _format_params(params):
    """Render the six tuned hyper-parameters in the fixed report format."""
    return ("learning_rate: {learning_rate}, n_estimators: {n_estimators}, "
            "max_depth: {max_depth}, gamma: {gamma}, subsample: {subsample}, "
            "colsample_bytree: {colsample_bytree} ").format(**params)


def _grid_search_xgb(param_grid, X, y, X_submission, n_classes, n_folds=5):
    """Cross-validate every combination in `param_grid`.

    Returns a ``(best_params, best_score)`` pair for the combination with the
    lowest mean log loss; ties go to the first one tried.
    """
    names = list(param_grid)
    candidates = []
    scores = []
    for values in itertools.product(*(param_grid[name] for name in names)):
        params = dict(zip(names, values))
        model = XGBClassifier(objective='multi:softprob', **params)
        print(_format_params(params))
        _, _, score = model_CV_train(model, X, y, X_submission,
                                     n_classes=n_classes, n_folds=n_folds)
        print()
        candidates.append(params)
        scores.append(score)

    best = int(np.argmin(scores))
    return candidates[best], scores[best]


def main():
    """Run the pipeline: load data, add features, tune XGBoost, export.

    Reads ``train.csv`` and ``test.csv`` from the working directory, selects
    the hyper-parameters with the lowest cross-validated log loss, retrains
    with them, and writes the out-of-fold and submission probabilities to
    ``model15_train.csv`` and ``model15_test.csv``.
    """
    start_time = time.time()

    # filemode only applies together with a filename, so it is not passed
    # when logging to a stream.
    logging.basicConfig(level=logging.DEBUG,
                        format='[%(asctime)s]: %(message)s ',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        stream=sys.stdout,
                        )

    # load data
    logging.info('Load data')
    train_df, test_df = load_data(train_data_path='train.csv', test_data_path='test.csv')

    # Process data
    X = train_df.drop('target', axis=1).to_numpy()
    y = train_df['target'].to_numpy()
    X_submission = test_df.to_numpy()
    X, X_submission = _add_engineered_features(X, X_submission)

    # Derive the number of probability columns from the labels.
    n_classes = len(np.unique(y))

    # Tune parameters. Keys are XGBClassifier keyword arguments, so every
    # entry, including n_jobs, is actually passed to the model.
    param_grid = {
        'learning_rate': [0.1, 0.3],
        'n_estimators': [200, 400, 600],
        'max_depth': [9, 12],
        'gamma': [1],
        'subsample': [0.5],
        'colsample_bytree': [1],
        'n_jobs': [-1],
    }
    best_params, best_score = _grid_search_xgb(
        param_grid, X, y, X_submission, n_classes=n_classes, n_folds=5)
    print("Best parameters: " + _format_params(best_params)
          + "\n mean_test_score: {} ".format(best_score))

    # Train model using best parameters
    best_model = XGBClassifier(objective='multi:softprob', **best_params)
    train_models_pred, test_models_pred, _ = model_CV_train(
        best_model, X, y, X_submission, n_classes=n_classes, n_folds=5)

    # Export predictions
    np.savetxt("model15_train.csv", train_models_pred, delimiter=",")
    np.savetxt("model15_test.csv", test_models_pred, delimiter=",")

    end_time = time.time()
    logging.info("Run complete: %s elapsed", elapsed_time(start_time, end_time))
    
# Run the full pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

[2020-03-04 00:27:46]: Load data 
learning_rate: 0.1, n_estimators: 200, max_depth: 9, gamma: 1, subsample: 0.5, colsample_bytree: 1 

learning_rate: 0.1, n_estimators: 200, max_depth: 12, gamma: 1, subsample: 0.5, colsample_bytree: 1 

learning_rate: 0.1, n_estimators: 400, max_depth: 9, gamma: 1, subsample: 0.5, colsample_bytree: 1 

learning_rate: 0.1, n_estimators: 400, max_depth: 12, gamma: 1, subsample: 0.5, colsample_bytree: 1 

learning_rate: 0.1, n_estimators: 600, max_depth: 9, gamma: 1, subsample: 0.5, colsample_bytree: 1 

learning_rate: 0.1, n_estimators: 600, max_depth: 12, gamma: 1, subsample: 0.5, colsample_bytree: 1 

learning_rate: 0.3, n_estimators: 200, max_depth: 9, gamma: 1, subsample: 0.5, colsample_bytree: 1 

learning_rate: 0.3, n_estimators: 200, max_depth: 12, gamma: 1, subsample: 0.5, colsample_bytree: 1 

learning_rate: 0.3, n_estimators: 400, max_depth: 9, gamma: 1, subsample: 0.5, colsample_bytree: 1 

learning_rate: 0.3, n_estimators: 400, max_depth: 12,



KeyboardInterrupt: 