In [0]:
import sys
import logging
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

from xgboost import XGBClassifier

def elapsed_time(start_time, end_time):
    """Format the span between two timestamps (in seconds) as ``H:MM:SS``.

    Hours are not padded; minutes and seconds are zero-padded to two digits.
    Fractions of a second are truncated.
    """
    seconds_per_minute = 60
    seconds_per_hour = 60 * 60

    span = end_time - start_time
    hours = int(span / seconds_per_hour)
    minutes = int((span % seconds_per_hour) / seconds_per_minute)
    seconds = int(span % seconds_per_minute)

    return "{}:{:>02}:{:>02}".format(hours, minutes, seconds)

def evaluate(y, y_pred):
    """Return the log loss of the predictions `y_pred` against the labels `y`.

    Both arguments are passed straight to ``sklearn.metrics.log_loss``.
    """
    return log_loss(y, y_pred)

def load_data(train_data_path='Desktop/otto/train.csv', test_data_path='Desktop/otto/test.csv'):
    """Read the train and test CSV files and encode the training labels.

    Both files are read with their first column as the index and their first
    row as the header. The ``target`` column of the training frame is expected
    to hold string labels that end in a 1-based class number (for example
    ``Class_3``); it is replaced by the 0-based integer class id.

    Parameters
    ----------
    train_data_path, test_data_path : str, path object or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    train_df, test_df : pandas.DataFrame
        The training frame (with the integer ``target`` column) and the
        test frame as read from disk.

    Raises
    ------
    ValueError
        If a label does not end in an integer.
    """
    train_df = pd.read_csv(train_data_path, sep=',', index_col=0, header=0)
    test_df = pd.read_csv(test_data_path, sep=',', index_col=0, header=0)

    # Parse the whole trailing number instead of only the last character, so a
    # label with two or more digits ("Class_10") maps to 9 rather than to -1.
    class_number = train_df['target'].str.extract(r'(\d+)$', expand=False)
    train_df['target'] = class_number.astype(int) - 1

    return train_df, test_df

def model_CV_train(model, X, y, X_submission, n_classes, n_folds=5):
    """Fit `model` with stratified K-fold CV and build stacking features.

    For every fold the model is refit on the training part of the fold. Its
    ``predict_proba`` output on the held-out part is written into the
    out-of-fold matrix, and its ``predict_proba`` output on `X_submission` is
    kept; the per-fold submission predictions are averaged after the loop.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
    X, y : numpy.ndarray
        Training features and labels; both are indexed with integer index
        arrays, so they must support that kind of indexing.
    X_submission : numpy.ndarray
        Rows to predict with each fold's model.
    n_classes : int
        Number of probability columns; must match what ``predict_proba``
        returns.
    n_folds : int, default 5
        Number of stratified folds.

    Returns
    -------
    stack_train : numpy.ndarray, shape (X.shape[0], n_classes)
        Out-of-fold predicted probabilities.
    stack_test : numpy.ndarray, shape (X_submission.shape[0], n_classes)
        Submission probabilities averaged over the folds.
    summary : float
        Mean log loss over the folds.
    """
    # random_state is intentionally not passed: without shuffle=True it never
    # influenced the (deterministic) splits, and current scikit-learn raises a
    # ValueError when a seed is given while shuffle is False.
    folds = list(StratifiedKFold(n_splits=n_folds).split(X, y))

    stack_train = np.zeros((X.shape[0], n_classes))
    # One slab of submission predictions per fold, averaged after the loop.
    stack_test_model = np.zeros((X_submission.shape[0], n_classes, len(folds)))

    fold_loglosses = []
    for j, (train_idx, test_idx) in enumerate(folds):
        print ("  Fold %d" % j)

        model.fit(X[train_idx], y[train_idx])

        y_test_pred = model.predict_proba(X[test_idx])
        stack_train[test_idx, :] = y_test_pred

        logloss = evaluate(y[test_idx], y_test_pred)
        fold_loglosses.append(logloss)
        print ("  logloss: %f" % logloss)

        stack_test_model[:, :, j] = model.predict_proba(X_submission)

    avg_logloss = sum(fold_loglosses) / len(folds)
    print ("model average logloss: %f" % avg_logloss)

    stack_test = stack_test_model.mean(axis=2)

    return stack_train, stack_test, avg_logloss

def _joint_tsne_embedding(X, X_submission, n_components=3):
    """Embed train and submission rows with a single t-SNE fit and split them back."""
    n_train = X.shape[0]
    embedded = TSNE(n_components=n_components).fit_transform(np.vstack((X, X_submission)))
    return embedded[:n_train, :], embedded[n_train:, :]


def process_data(X, y, X_submission, ylabel='target', transform=None):
    """Apply one named feature transform to the train and submission matrices.

    `transform` is a string whose first whitespace-separated token names the
    transform and whose optional second token is an integer argument:

    - ``None`` or empty: data is returned unchanged.
    - ``'standarization'`` (spelling kept for existing callers;
      ``'standardization'`` is accepted too): ``StandardScaler`` fitted on `X`.
    - ``'log'``: ``np.log1p``.
    - ``'sqrt'``: ``np.sqrt(x + 3/8)``.
    - ``'pca'``: 3-component PCA fitted on `X`.
    - ``'tsne'``: 3-component t-SNE fitted once on train and submission rows
      stacked together.
    - ``'kmeans K'``: K-means with ``K`` clusters fitted on `X`; `X` becomes the
      training cluster labels and `X_submission` the predicted labels.
    - ``'pca+'`` / ``'tsne+'``: the original columns with the 3 PCA / t-SNE
      columns appended.
    - any other name: data is returned unchanged.

    Parameters
    ----------
    X, X_submission : numpy.ndarray
        Training and submission feature matrices.
    y : array-like
        Returned untouched.
    ylabel : str
        Unused; kept for backward compatibility.
    transform : str or None
        Transform specification as described above.

    Returns
    -------
    X, y, X_submission, kmeans
        The transformed matrices, `y`, and the fitted ``KMeans`` object for the
        ``'kmeans'`` transform (``0`` for every other transform).

    Raises
    ------
    ValueError
        If ``'kmeans'`` is requested without a cluster count, or the second
        token is not an integer.
    """
    tokens = transform.split() if transform else []
    name = tokens[0] if tokens else None
    k = int(tokens[1]) if len(tokens) > 1 else None

    kmeans = 0
    if name in ('standarization', 'standardization'):
        scaler = StandardScaler().fit(X)
        X = scaler.transform(X)
        X_submission = scaler.transform(X_submission)
    elif name == 'log':
        X = np.log1p(X)
        X_submission = np.log1p(X_submission)
    elif name == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif name == 'pca':
        pca = PCA(n_components=3).fit(X)
        X = pca.transform(X)
        X_submission = pca.transform(X_submission)
    elif name == 'tsne':
        # t-SNE has no transform() for unseen rows, so both sets are embedded
        # in one fit to share a coordinate system.
        X, X_submission = _joint_tsne_embedding(X, X_submission)
    elif name == 'kmeans':
        if k is None:
            raise ValueError("transform 'kmeans' needs a cluster count, e.g. 'kmeans 8'")
        kmeans = KMeans(n_clusters=k).fit(X)
        X = kmeans.labels_
        X_submission = kmeans.predict(X_submission)
    elif name == 'pca+':
        pca = PCA(n_components=3).fit(X)
        # Project both matrices before widening either one: the submission
        # columns must come from X_submission, and pca only accepts the
        # original number of columns.
        X_pca = pca.transform(X)
        X_submission_pca = pca.transform(X_submission)
        X = np.hstack((X, X_pca))
        X_submission = np.hstack((X_submission, X_submission_pca))
    elif name == 'tsne+':
        # Same joint fit as 'tsne': two independent t-SNE runs would give
        # train and submission columns that are not comparable.
        X_tsne, X_submission_tsne = _joint_tsne_embedding(X, X_submission)
        X = np.hstack((X, X_tsne))
        X_submission = np.hstack((X_submission, X_submission_tsne))

    return X, y, X_submission, kmeans



In [0]:
# Wall-clock reference for the "Run complete" log line emitted at the end of the run.
start_time = time.time()

# Route timestamped log records to stdout so they appear in the cell output.
# (filemode is only meaningful together with a filename; it is kept as-is.)
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format='[%(asctime)s]: %(message)s ',
    datefmt='%Y-%m-%d %H:%M:%S',
    filemode="w",
)

# Load the raw train / test frames.
logging.info('Load data')
train_df, test_df = load_data(
    train_data_path='Desktop/otto/train.csv',
    test_data_path='Desktop/otto/test.csv',
)

# Split the training frame into features and labels, then log-transform the
# feature matrices (labels pass through process_data untouched).
y = train_df['target'].to_numpy()
X = train_df.drop('target', axis=1).to_numpy()
X_submission = test_df.to_numpy()
X, y, X_submission, _ = process_data(
    X,
    y,
    X_submission,
    transform='log',
)


[2020-03-13 06:28:36]: Load data 


In [0]:
# Replace the current feature matrices with their 3-D t-SNE embedding.
# NOTE: X and X_submission are overwritten in place, so re-running this cell
# alone would embed the previous embedding; re-run the preceding cell first.
X, y, X_submission, _ = process_data(
    X,
    y,
    X_submission,
    transform='tsne',
)

In [0]:


# Keep an untouched snapshot of the current features: every k-means run below
# clusters this snapshot, not the progressively widened X.
Xo, Xo_submission = np.copy(X), np.copy(X_submission)

for i in range(2):
    # Fresh copies per pass; the transform string is 'kmeans 8', then 'kmeans 9'.
    Xc, Xc_submission = np.copy(Xo), np.copy(Xo_submission)
    Xc, y, Xc_submission, kmeans = process_data(
        Xc,
        y,
        Xc_submission,
        transform='kmeans ' + str(8 + i),
    )

    # process_data returned cluster labels; look up each label's centroid so
    # the appended columns are centroid coordinates rather than label ids.
    Xc, Xc_submission = (
        kmeans.cluster_centers_[Xc],
        kmeans.cluster_centers_[Xc_submission],
    )

    X = np.hstack((X, Xc))
    X_submission = np.hstack((X_submission, Xc_submission))

# Export the augmented feature matrices.
np.savetxt("model10_train.csv", X, delimiter=",")
np.savetxt("model10_test.csv", X_submission, delimiter=",")

# Report the wall-clock time since start_time was taken.
end_time = time.time()
logging.info("Run complete: %s elapsed", elapsed_time(start_time, end_time))

[2020-03-13 16:07:13]: Run complete: 9:38:36 elapsed 


In [0]:
# NOTE(review): `test_models_pred` is not defined in any visible cell, and the
# recorded output of this cell is a NameError -- confirm where the predictions
# are supposed to come from before relying on this export.

# Column headers 'Class_1' ... 'Class_9'.
columns = ['Class_' + str(class_no) for class_no in range(1, 10)]

submission_df = pd.DataFrame(test_models_pred, columns=columns)
# Shift the row index by one (presumably so exported ids start at 1 -- verify
# against the index test_models_pred carries).
submission_df.index = submission_df.index + 1
submission_df.to_csv(
    'Desktop/otto/10_sub.csv',
    sep=',',
    index_label='id',
)

NameError: ignored