In [1]:
import sys
import logging
import time
import numpy as np
import pandas as pd
import os.path

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import BaggingClassifier

from xgboost import XGBClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.regularizers import l1_l2
import pickle

Using TensorFlow backend.


In [0]:
def elapsed_time(start_time, end_time):
    """Format the span between two timestamps (in seconds) as ``H:MM:SS``.

    Args:
        start_time: Start of the interval, in seconds (e.g. from ``time.time()``).
        end_time: End of the interval, in the same units.

    Returns:
        A string with unpadded hours and zero-padded two-digit minutes and
        seconds; fractional seconds are truncated.
    """
    secs_per_minute = 60
    secs_per_hour = 60 * 60

    duration = end_time - start_time
    hours = int(duration / secs_per_hour)
    minutes = int((duration % secs_per_hour) / secs_per_minute)
    seconds = int(duration % secs_per_minute)
    return "{}:{:>02}:{:>02}".format(hours, minutes, seconds)

In [0]:
def evaluate(y, y_pred):
    """Score predictions with ``log_loss``.

    Args:
        y: True labels.
        y_pred: Predictions, passed to ``log_loss`` unchanged as its second argument.

    Returns:
        Whatever ``log_loss(y, y_pred)`` returns.
    """
    return log_loss(y, y_pred)

In [0]:
def load_data(train_data_path='data/train.csv', test_data_path = 'data/test.csv'):
    """Read the train/test CSV files and convert the label column to 0-based ints.

    Both files are read with their first column as the index and the first row
    as the header. The train file must contain a string ``target`` column whose
    values end in the 1-based class number (e.g. ``Class_3``).

    Args:
        train_data_path: Path of the training CSV.
        test_data_path: Path of the test CSV.

    Returns:
        ``(train_df, test_df)``; ``train_df['target']`` holds integers starting
        at 0 (class number minus one).

    Raises:
        ValueError: If any target value does not end in an integer.
    """
    train_df = pd.read_csv(train_data_path, sep=',', index_col=0, header=0)
    test_df = pd.read_csv(test_data_path, sep=',', index_col=0, header=0)

    # Parse the whole trailing number rather than only the last character, so
    # labels with two or more digits (e.g. "Class_10") are decoded correctly.
    class_numbers = train_df['target'].str.extract(r'(\d+)$', expand=False)
    if class_numbers.isna().any():
        raise ValueError("every 'target' value must end with its integer class number")
    train_df['target'] = class_numbers.astype(int) - 1

    return train_df, test_df

In [0]:
def process_data(train_df, test_df, ylabel='target', standarization=False, discretization=False, transform=None):
    """Turn the train/test frames into feature matrices, with an optional transform.

    Args:
        train_df: Training frame containing the feature columns plus ``ylabel``.
        test_df: Test frame containing the feature columns only.
        ylabel: Name of the label column in ``train_df``.
        standarization: If true, call ``standarize_feature`` on both frames
            (helper defined outside this block).
        discretization: If true, call ``discretize_feature`` on both frames with
            10 equal-frequency bins (helper defined outside this block).
        transform: One of
            ``'log'``   - ``log1p`` of every feature;
            ``'sqrt'``  - square root of every feature shifted by 3/8;
            ``'pca'``   - replace features with 3 PCA components fitted on train;
            ``'tsne'``  - replace features with a 3-D t-SNE embedding;
            ``'pca+'``  - original features with the 3 PCA components appended;
            ``'tsne+'`` - original features with the 3-D t-SNE embedding appended.
            Any other value (including ``None``) leaves the features unchanged.

    Returns:
        ``(X, y, X_submission)``: train features, train labels, test features.
    """
    # The label is not a feature: it is absent from test_df and must not be
    # rescaled or binned, so keep it out of the list handed to the helpers.
    numerical_features = train_df.columns[train_df.columns != ylabel]

    if standarization:
        standarize_feature(train_df, test_df, numerical_features)

    if discretization:
        discretize_feature(train_df, test_df, numerical_features, num_bins=10, how='equal_freq')

    X = train_df.drop(ylabel, axis=1).values
    y = train_df[ylabel].values
    X_submission = test_df.values

    if transform == 'log':
        X = np.log1p(X)
        X_submission = np.log1p(X_submission)
    elif transform == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif transform in ('pca', 'pca+'):
        pca = PCA(n_components=3).fit(X)
        # Project both matrices before either is overwritten, so the test
        # projection is always computed from the test rows.
        X_components = pca.transform(X)
        submission_components = pca.transform(X_submission)
        if transform == 'pca':
            X, X_submission = X_components, submission_components
        else:
            X = np.hstack((X, X_components))
            X_submission = np.hstack((X_submission, submission_components))
    elif transform in ('tsne', 'tsne+'):
        # scikit-learn's TSNE cannot project unseen rows (it has no transform()),
        # so embed train and test rows together and split the result afterwards.
        n_train = X.shape[0]
        embedding = TSNE(n_components=3).fit_transform(np.vstack((X, X_submission)))
        X_embedded = embedding[:n_train]
        submission_embedded = embedding[n_train:]
        if transform == 'tsne':
            X, X_submission = X_embedded, submission_embedded
        else:
            X = np.hstack((X, X_embedded))
            X_submission = np.hstack((X_submission, submission_embedded))
    return X, y, X_submission

In [0]:
def model_CV_train(model, X, y, X_submission, n_classes, n_folds=5):
    """Fit ``model`` with stratified K-fold CV and collect stacking predictions.

    For every fold the model is refitted on the training part, scored on the
    held-out part with ``evaluate``, and used to predict ``X_submission``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
        X: Training feature matrix, indexable by integer index arrays.
        y: Training labels aligned with ``X``.
        X_submission: Feature matrix to predict with each fold's model.
        n_classes: Number of probability columns returned by ``predict_proba``.
        n_folds: Number of stratified folds.

    Returns:
        ``(stack_train, stack_test, summary)`` where ``stack_train`` holds the
        out-of-fold probabilities for every training row, ``stack_test`` is the
        per-fold average of the ``X_submission`` probabilities, and ``summary``
        is the mean of the per-fold ``evaluate`` scores.
    """
    # No random_state here: without shuffle=True the split is already
    # deterministic, and current scikit-learn raises if random_state is set
    # while shuffling is off.
    folds = list(StratifiedKFold(n_splits=n_folds).split(X, y))

    stack_train = np.zeros((X.shape[0], n_classes))
    # One slice of submission predictions per fold, averaged after the loop.
    stack_test_model = np.zeros((X_submission.shape[0], n_classes, len(folds)))
    fold_losses = []

    for j, (train_idx, test_idx) in enumerate(folds):
        print ("  Fold %d" % j)
        model.fit(X[train_idx], y[train_idx])

        # Each training row is predicted exactly once, by the model that did not see it.
        y_test_pred = model.predict_proba(X[test_idx])
        stack_train[test_idx, :] = y_test_pred

        logloss = evaluate(y[test_idx], y_test_pred)
        fold_losses.append(logloss)
        print ("  logloss: %f" % logloss)

        stack_test_model[:, :, j] = model.predict_proba(X_submission)

    avg_logloss = sum(fold_losses) / len(folds)
    print ("model average logloss: %f" % avg_logloss)

    stack_test = stack_test_model.mean(axis=2)

    return stack_train, stack_test, avg_logloss

In [7]:
# Start the wall-clock timer for the run; it is read again after training.
start_time = time.time()

# Send DEBUG-and-above records to stdout with a timestamp prefix.
# filemode only applies when a filename is given, so it has no effect here.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format='[%(asctime)s]: %(message)s ',
    datefmt='%Y-%m-%d %H:%M:%S',
    filemode="w",
)

# Read both CSV files and build log1p-transformed feature matrices.
logging.info('Load data')
train_df, test_df = load_data(
    train_data_path='applied/otto/train.csv',
    test_data_path='applied/otto/test.csv',
)
X, y, X_submission = process_data(train_df, test_df, transform='log')

[2020-02-27 02:56:04]: Load data 
[2020-02-27 02:56:05]: NumExpr defaulting to 8 threads. 


In [0]:
# Tune parameters: search the regularisation parameter C of an L2-penalised
# logistic regression, scored by negative log loss with 5-fold CV.
model = LogisticRegression(penalty='l2', random_state=42)

# 200 evenly spaced C values between 3.0 and 5.0.
# Alternative coarse grid kept for reference:
#   [5.0, 4.0, 3.0, 2.0, 1.0, 0.1, 0.15, 0.01, 0.015, 0.001]
parameters = {'C': np.linspace(3.0, 5.0, num=200)}

classifier = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='neg_log_loss',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [0]:
# Run the grid search (5 folds x 200 candidates = 1000 fits); the trailing ';'
# keeps the notebook from echoing the fitted estimator's repr.
classifier.fit(X, y);

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [0]:
# Report the grid search: available result fields, the winning setting,
# then every candidate with its mean cross-validated score.
print(classifier.cv_results_.keys())
print('best_params: ',classifier.best_params_)
print('best_score: ',classifier.best_score_)
for candidate, score in zip(classifier.cv_results_['params'],
                            classifier.cv_results_['mean_test_score']):
    print('{}, {}'.format(candidate, score))

In [0]:
# Fresh, unfitted classifier using the C value selected by the grid search.
best_model = LogisticRegression(
    C=classifier.best_params_['C'],
    penalty='l2',
    random_state=42,
    n_jobs=-1,
)

In [0]:
# Seed the history lists with any grid-search results saved by an earlier run.
model_2_path = "applied/otto/model_2.csv"
C, mean_test_score = [], []
if os.path.exists(model_2_path):
    previous_gridsearch = pd.read_csv(model_2_path, sep=',', header=0)
    # Columns are read by position: column 0 goes to C, column 1 to mean_test_score.
    for row in previous_gridsearch.values:
        C.append(row[0])
        mean_test_score.append(row[1])

In [0]:
# Append this run's candidates to the history lists, then write the combined table.
# NOTE(review): C and mean_test_score are extended in place, so re-running this
# cell without re-running the loading cell stores the same candidates twice.
for candidate, score in zip(classifier.cv_results_['params'],
                            classifier.cv_results_['mean_test_score']):
    C.append(candidate['C'])
    mean_test_score.append(score)

submit = pd.DataFrame({"C": C, "mean_test_score": mean_test_score})
submit.to_csv("applied/otto/model_2.csv", index = False)

In [0]:
# Train model using best parameters: 5-fold CV producing out-of-fold train
# predictions, averaged submission predictions and the mean log loss.
train_models_pred, test_models_pred, summary = model_CV_train(
    best_model, X, y, X_submission, n_classes=9, n_folds=5
)

# Log the wall-clock time since start_time was taken.
end_time = time.time()
logging.info("Run complete: %s elapsed", elapsed_time(start_time, end_time))

In [0]:
# Pickle the out-of-fold train predictions, the averaged test predictions and
# the CV summary, each to its own file. A saved object can be read back with
# pickle.load on the same path opened in 'rb' mode.
for pkl_filename, payload in (
    ("applied/otto/train_models_pred_model_2.pkl", train_models_pred),
    ("applied/otto/test_models_pred_model_2.pkl", test_models_pred),
    ("applied/otto/summary_model_2.pkl", summary),
):
    with open(pkl_filename, 'wb') as file:
        pickle.dump(payload, file)