In [3]:
import sys
import logging
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

from xgboost import XGBClassifier

def elapsed_time(start_time, end_time):
    """Format the span between two timestamps as ``H:MM:SS``.

    Args:
        start_time: Start timestamp in seconds (e.g. from ``time.time()``).
        end_time: End timestamp in seconds.

    Returns:
        A string with unpadded hours and two-digit, zero-padded minutes and
        seconds. Fractional seconds are truncated.
    """
    total = end_time - start_time
    hours = int(total / 3600)
    minutes = int((total % 3600) / 60)
    seconds = int(total % 60)
    return "%d:%02d:%02d" % (hours, minutes, seconds)

def evaluate(y, y_pred):
    """Score predictions against the true labels with log loss.

    Args:
        y: True labels.
        y_pred: Predictions, forwarded unchanged to ``log_loss``.

    Returns:
        Whatever ``log_loss(y, y_pred)`` returns.
    """
    return log_loss(y, y_pred)

def load_data(train_data_path='data/train.csv', test_data_path = 'data/test.csv'):
    """Read the train and test CSV files and encode the training labels.

    Both files are read with their first column as the index and their first
    row as the header. The training file must contain a string ``target``
    column whose values end in a 1-based class number (e.g. ``Class_3``);
    it is replaced by the 0-based integer class id.

    Args:
        train_data_path: Path to the training CSV.
        test_data_path: Path to the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.

    Raises:
        ValueError: If a ``target`` value does not end in digits.
    """
    train_df = pd.read_csv(train_data_path, sep=',', index_col=0, header=0)
    test_df = pd.read_csv(test_data_path, sep=',', index_col=0, header=0)

    # Take the whole trailing number rather than only the last character, so
    # labels such as "Class_10" map to 9 instead of -1.
    class_number = train_df['target'].str.extract(r'(\d+)$', expand=False)
    train_df['target'] = class_number.astype(int) - 1

    return train_df, test_df

def model_CV_train(model, X, y, X_submission, n_classes, n_folds=5):
    """Produce out-of-fold and fold-averaged predictions for stacking.

    The data is split with a shuffled ``StratifiedKFold`` (``random_state=0``).
    For each fold ``model`` is re-fitted on the training part, its
    ``predict_proba`` output on the held-out part is stored in the matching
    rows of the out-of-fold matrix, and its ``predict_proba`` output on
    ``X_submission`` is kept for averaging. Per-fold and average log loss
    are printed.

    Args:
        model: Estimator exposing ``fit`` and ``predict_proba``; it is
            re-fitted in place on every fold.
        X: Training features, indexable by an array of row indices.
        y: Training labels, indexable the same way as ``X``.
        X_submission: Features to predict for with every fold's model.
        n_classes: Number of probability columns ``predict_proba`` yields.
        n_folds: Number of cross-validation folds.

    Returns:
        A ``(stack_train, stack_test, summary)`` tuple: the out-of-fold
        predictions of shape ``(len(X), n_classes)``, the fold-averaged
        submission predictions of shape ``(len(X_submission), n_classes)``,
        and the log loss averaged over the folds.
    """
    splitter = StratifiedKFold(n_folds, random_state=0, shuffle=True)
    folds = list(splitter.split(X, y))

    n_train = X.shape[0]
    n_submission = X_submission.shape[0]
    stack_train = np.zeros((n_train, n_classes))
    per_fold_test = np.zeros((n_submission, n_classes, len(folds)))

    fold_losses = []
    for fold_no, (fit_rows, holdout_rows) in enumerate(folds):
        print("  Fold %d" % fold_no)

        model.fit(X[fit_rows], y[fit_rows])

        # Out-of-fold predictions fill only the rows held out in this fold.
        holdout_pred = model.predict_proba(X[holdout_rows])
        stack_train[holdout_rows, :] = holdout_pred

        fold_loss = evaluate(y[holdout_rows], holdout_pred)
        fold_losses.append(fold_loss)
        print("  logloss: %f" % fold_loss)

        per_fold_test[:, :, fold_no] = model.predict_proba(X_submission)

    summary = sum(fold_losses) / n_folds
    print("model average logloss: %f" % summary)

    # Submission predictions are the mean over the per-fold models.
    stack_test = np.zeros((n_submission, n_classes))
    stack_test[:, :] = per_fold_test.mean(axis=2)

    return stack_train, stack_test, summary

def main():
    """Tune, cross-validate and export a one-vs-rest XGBoost model.

    Steps:
        1. Load ``train.csv`` / ``test.csv`` from the working directory.
        2. Add a ``sum`` feature (row total of the feature columns).
        3. Grid-search the XGBoost hyper-parameters with 5-fold CV scored
           by negative log loss, and print the results.
        4. Re-run cross-validated training with the best parameters via
           ``model_CV_train``.
        5. Write the out-of-fold and averaged test predictions to
           ``model14_train.csv`` and ``model14_test.csv``.
    """
    start_time = time.time()

    logging.basicConfig(level=logging.DEBUG,
                        format='[%(asctime)s]: %(message)s ',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        stream=sys.stdout,
                        )

    # load data
    logging.info('Load data')
    train_df, test_df = load_data(train_data_path='train.csv', test_data_path='test.csv')

    # Process data
    # The row total must be taken over the feature columns only: summing the
    # whole training frame would add the integer label into the feature and
    # make it inconsistent with the same feature on the test set.
    feature_cols = train_df.columns.drop('target')
    train_df['sum'] = train_df[feature_cols].sum(axis=1)
    test_df['sum'] = test_df.sum(axis=1)

    X = train_df.drop('target', axis=1).to_numpy()
    y = train_df['target'].to_numpy()
    X_submission = test_df.to_numpy()

    # Tune parameters
    model = OneVsRestClassifier(XGBClassifier(objective='binary:logistic'))
    parameters = {
        'estimator__learning_rate': [0.1, 0.3],
        'estimator__n_estimators': [200, 400, 600],
        'estimator__max_depth': [9, 12],
        'estimator__gamma': [1],
        'estimator__subsample': [0.5],
        'estimator__colsample_bytree': [1],
        'estimator__n_jobs': [-1],
        }

    classifier = GridSearchCV(model, parameters, scoring='neg_log_loss', verbose=1, n_jobs=-1, cv=5)
    classifier.fit(X, y)

    print(classifier.cv_results_.keys())
    print('best_params: ',classifier.best_params_)
    print('best_score: ',classifier.best_score_)
    for params, mean_score in zip(classifier.cv_results_['params'],
                                  classifier.cv_results_['mean_test_score']):
        print('{}, {}'.format(params, mean_score))

    # Train model using best parameters
    # best_params_ uses the same keys as the search grid, i.e. with the
    # 'estimator__' prefix of the wrapped XGBClassifier.
    best = classifier.best_params_
    model = OneVsRestClassifier(XGBClassifier(
        objective='binary:logistic',
        n_estimators=best['estimator__n_estimators'],
        max_depth=best['estimator__max_depth'],
        learning_rate=best['estimator__learning_rate'],
        subsample=best['estimator__subsample'],
        colsample_bytree=best['estimator__colsample_bytree'],
        gamma=best['estimator__gamma']))

    # Size the prediction matrices from the labels actually present.
    n_classes = np.unique(y).size
    train_models_pred, test_models_pred, summary = model_CV_train(
        model, X, y, X_submission, n_classes=n_classes, n_folds=5)

    # Export predictions
    np.savetxt("model14_train.csv", train_models_pred, delimiter=",")
    np.savetxt("model14_test.csv", test_models_pred, delimiter=",")

    end_time = time.time()
    logging.info("Run complete: %s elapsed", elapsed_time(start_time, end_time))
    
# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()

[2020-03-02 01:26:46]: Load data 
94
[2020-03-02 01:26:57]: Run complete: 0:00:10 elapsed 
