In [0]:
import sys
import logging
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

from xgboost import XGBClassifier

def elapsed_time(start_time, end_time):
    elapsed_sec = end_time - start_time
    h = int(elapsed_sec / (60 * 60))
    m = int((elapsed_sec % (60 * 60)) / 60)
    s = int(elapsed_sec % 60)
    return "{}:{:>02}:{:>02}".format(h, m, s)

def evaluate(y, y_pred):
    logloss = log_loss(y, y_pred)
    return logloss

def load_data(train_data_path='Desktop/otto/train.csv', test_data_path = 'Desktop/otto/test.csv'):
    train_df = pd.read_csv(train_data_path, sep=',', index_col=0, header=0)
    test_df = pd.read_csv(test_data_path, sep=',', index_col=0, header=0)
    #header: 指定第几行作为表头，默认为0,即第一行
    #index_col：指定哪一列数据作为行索引
    train_df['target'] = train_df['target'].str[-1].astype(int) - 1
        
    return train_df, test_df

def model_CV_train(model, X, y, X_submission, n_classes, n_folds=2):
    summary = {}

    skf = list(StratifiedKFold(n_folds, random_state=0).split(X, y))
    
    stack_train = np.zeros((X.shape[0], n_classes))
    stack_test = np.zeros((X_submission.shape[0], n_classes))
    
#     print("Model :" model)

    avg_logloss = 0

    stack_test_model = np.zeros((X_submission.shape[0], n_classes, len(skf)))
    for j, (train_idx, test_idx) in enumerate(skf):
        print ("  Fold %d" % j)
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_test = X[test_idx]
        y_test = y[test_idx]

        model.fit(X_train, y_train)

        y_test_pred = model.predict_proba(X_test)          
        stack_train[test_idx, :] = y_test_pred

        logloss = evaluate(y_test, y_test_pred)
        avg_logloss += logloss
        print ("  logloss: %f" % logloss)

        y_submission_pred = model.predict_proba(X_submission)           
        stack_test_model[:, :, j] = y_submission_pred

    avg_logloss = avg_logloss / n_folds
    print ("model average logloss: %f" % avg_logloss)
    summary = avg_logloss

    stack_test[:, :] = stack_test_model.mean(axis=2)

    return stack_train, stack_test, summary

def process_data(X, y, X_submission, ylabel='target', transform=None):
#     X = train_df.drop(ylabel, axis=1).to_numpy()
#     y = train_df[ylabel].to_numpy()
#     X_submission = test_df.to_numpy()
    
    if len(transform.split()) == 1:
        transform = transform.split()[0]
    else:
        k = int(transform.split()[1])
        transform = transform.split()[0] 
#     print(transform, k)
    
    if transform == 'standarization':
        scaler = StandardScaler()
        scaler.fit(X)
        X = scaler.transform(X)
        X_submission = scaler.transform(X_submission)
    elif transform == 'log':
        X = np.log1p(X)
        X_submission = np.log1p(X_submission)
    elif transform == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif transform == 'pca':
        pca = PCA(n_components=3).fit(X)
        X = pca.transform(X)
        X_submission = pca.transform(X_submission)
    elif transform == 'tsne':
        tsne = TSNE(n_components=3).fit(X)
        X = tsne.transform(X)
        X_submission = tsne.transform(X_submission)
    elif transform == 'kmeans':
        kmeans = KMeans(n_clusters = k).fit(X)
        X = kmeans.labels_
        X_submission = kmeans.predict(X_submission)
    elif transform == 'pca+':
        pca = PCA(n_components=3).fit(X)
        X = np.hstack((X, pca.transform(X)))
        X_submission = np.hstack((X, pca.transform(X)))
    elif transform == 'tsne+':
#         tsne = TSNE(n_components=3).fit(X)
        X = np.hstack((X, TSNE(n_components=3).fit_transform(X)))
        X_submission = np.hstack((X_submission, TSNE(n_components=3).fit_transform(X_submission)))    
#     print(X.shape)
    return X, y, X_submission




In [0]:
  start_time = time.time()

  logging.basicConfig(level=logging.DEBUG,
                      format='[%(asctime)s]: %(message)s ',
                      datefmt='%Y-%m-%d %H:%M:%S',
                      stream=sys.stdout,
                      filemode="w"
                      )

  # load data
  logging.info('Load data')
  train_df, test_df = load_data(train_data_path='Desktop/otto/train.csv', test_data_path='Desktop/otto/test.csv')

[2020-03-03 23:34:28]: Load data 


In [0]:
# Process data
X = train_df.drop('target', axis=1).to_numpy()
y = train_df['target'].to_numpy()
X_submission = test_df.to_numpy()

Xo_ = X
yo_ = y
Xo_submission_ = X_submission

X, y, X_submission = process_data(X, y, X_submission, transform = 'tsne+')
Xc_, yc_, Xc_submission_ = process_data(Xo_, yo_, Xo_submission_, transform = 'log')
Xc_, yc_, Xc_submission_ = process_data(Xc_, yc_, Xc_submission_, transform = 'kmeans 9')
#         print(X.shape)
#         print(X_submission.shape)
X = np.hstack((X, Xc_.reshape(-1, 1)))
X_submission = np.hstack((X_submission, Xc_submission_.reshape(-1, 1)))

print(X.shape)
print(X_submission.shape)

(61878, 97)
(144368, 97)


In [0]:
parameters = {
    'estimator__learning_rate': [0.03],
    'estimator__n_estimators': [500], 
    'estimator__max_depth': [28],
    'estimator__gamma' :[0],       
    'estimator__subsample': [0.5],
    'estimator__colsample_bytree': [0.6],
    'estimator__n_jobs': [-1]
    }

# Train model using best parameters
estimator__learning_rate= []
estimator__n_estimators = []
estimator__max_depth=[]
mean_test_score = []
for i in range(len(parameters['estimator__learning_rate'])):
    for k in range(len(parameters['estimator__n_estimators'])):
      for j in range (len(parameters['estimator__max_depth'])):
          model = XGBClassifier(objective='multi:softprob', 
                          n_jobs=-1, 
                          gamma = 1,
                          learning_rate=parameters['estimator__learning_rate'][i],
                          colsample_bytree = 1,
                          n_estimators=parameters['estimator__n_estimators'][k],
                          max_depth=parameters['estimator__max_depth'][j],
                          
                          subsample=0.7)
          print("n_estimators: {}, estimator__max_depth: {},learning_rate: {}".format(parameters['estimator__n_estimators'][k],parameters['estimator__max_depth'][j], parameters['estimator__learning_rate'][i]))
          train_models_pred, test_models_pred, avg_logloss = model_CV_train(model, X, y, X_submission, n_classes=9, n_folds=2)
          estimator__learning_rate.append(parameters['estimator__learning_rate'][i])
          estimator__max_depth.append(parameters['estimator__max_depth'][j])
          estimator__n_estimators.append(parameters['estimator__n_estimators'][k])
          mean_test_score.append(avg_logloss)
          end_time = time.time()

n_estimators: 500, estimator__max_depth: 28,learning_rate: 0.03
  Fold 0
  logloss: 0.491452
  Fold 1
  logloss: 0.481934
model average logloss: 0.486693


In [0]:
best_model = XGBClassifier(
        objective = 'multi:softprob',
        n_estimators = 400,
        max_depth = 8,
        learning_rate = 0.1,
        subsample = 0.7,
        colsample_bytree = 1,
        gamma = 1)

# Train model using best parameters
train_models_pred, test_models_pred, summary = model_CV_train(best_model, X, y, X_submission, n_classes=9, n_folds=5)

# Export predictions
np.savetxt("model17_train.csv", train_models_pred, delimiter=",")
np.savetxt("model17_test.csv", test_models_pred, delimiter=",")

  Fold 0
  logloss: 0.458139
  Fold 1
  logloss: 0.466149
  Fold 2
  logloss: 0.454534
  Fold 3
  logloss: 0.442802
  Fold 4
  logloss: 0.456830
model average logloss: 0.455691


In [0]:

# Tune parameters
model = XGBClassifier(objective='multi:softprob');
parameters = {
    'estimator__learning_rate': [0.1,0.2,0.3],
    'estimator__n_estimators': [200,400,600], 
    'estimator__max_depth': [8,10,12],
    'estimator__gamma' :[1],       
    'estimator__subsample': [0.5],
    'estimator__colsample_bytree': [1],
    'estimator__n_jobs': [-1]
    }

classifier=GridSearchCV(model, parameters, scoring='neg_log_loss', verbose=1, n_jobs=-1, cv=5);                               
classifier.fit(X, y);



Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 53.5min
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed: 157.0min finished


In [0]:
print(classifier.cv_results_.keys())
print('best_params:',classifier.best_params_)
print('best_score:',classifier.best_score_)
for i in range(len(classifier.ca_results_['params'])):
  print('{},{}'.format(classifier.cv_results_['params'][i],classifier.cv_results_['mean_test_score'][i]))

NameError: ignored

In [0]:


# Train model using best parameters
train_models_pred, test_models_pred, summary = model_CV_train(model, X, y, X_submission, n_classes=9, n_folds=5)

#     print(X)
#     print(y)

end_time = time.time()
logging.info("Run complete: %s elapsed" % elapsed_time(start_time, end_time))