In [18]:
import sys
import logging
import time
import numpy as np
import pandas as pd

# from svmlight_loader import dump_svmlight_file
from sklearn.datasets import dump_svmlight_file
import subprocess
from subprocess import call, check_output
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
import pickle
from itertools import combinations
import matplotlib.pyplot as plt
import random
# import pegasos


def elapsed_time(start_time, end_time):
    """Render the span between two timestamps as ``H:MM:SS``.

    Both arguments are numbers of seconds (for example values returned by
    ``time.time()``). Hours are not padded; minutes and seconds are
    zero-padded to two digits. Fractions of a second are truncated.
    """
    seconds_per_minute = 60
    seconds_per_hour = 60 * 60

    span = end_time - start_time
    parts = (
        int(span / seconds_per_hour),
        int((span % seconds_per_hour) / seconds_per_minute),
        int(span % seconds_per_minute),
    )
    return "{}:{:>02}:{:>02}".format(*parts)

def evaluate(y, y_pred):
    """Return the logarithmic loss of ``y_pred`` against the true labels ``y``.

    This is a direct pass-through to ``log_loss``; the arguments are forwarded
    unchanged and the resulting score is returned as-is.
    """
    return log_loss(y, y_pred)

def rSubset(arr, r):
    """List every length-``r`` combination of the items in ``arr``.

    Combinations are produced as tuples in the order ``itertools.combinations``
    yields them. Duplicate items in ``arr`` are treated as distinct positions,
    so repeated tuples are possible; wrap the result in ``set(...)`` if unique
    subsets are needed.
    """
    return [subset for subset in combinations(arr, r)]

def load_data(train_data_path='data/train.csv', test_data_path = 'data/test.csv'):
    """Read the training and test CSV files and encode the target as integers.

    Both files are read with their first column as the index and the first
    row as the header. The training frame must contain a ``target`` column
    whose values end in a 1-based class number (e.g. ``Class_3``); it is
    replaced by the 0-based integer class id.

    Parameters
    ----------
    train_data_path : str
        Path of the training CSV.
    test_data_path : str
        Path of the test CSV.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame

    Raises
    ------
    ValueError
        If some target value does not end in a number.
    """
    train_df = pd.read_csv(train_data_path, sep=',', index_col=0, header=0)
    test_df = pd.read_csv(test_data_path, sep=',', index_col=0, header=0)

    # Take the whole trailing number, not just the last character, so that
    # labels with two or more digits ("Class_10") are decoded correctly.
    class_number = train_df['target'].astype(str).str.extract(r'(\d+)$', expand=False)
    if class_number.isna().any():
        bad_label = train_df['target'][class_number.isna()].iloc[0]
        raise ValueError("target value %r does not end in a class number" % (bad_label,))
    train_df['target'] = class_number.astype(int) - 1

    return train_df, test_df

def export_CV_data(X, y, X_submission, n_classes, n_folds=5,
                   file_prefix="C:/Users/JasonLzF/sofia/"):
    """Write one-vs-rest svmlight files for every CV fold and class.

    The rows are split with a shuffled ``StratifiedKFold`` (``random_state=0``).
    For each fold ``i`` and class ``j`` two files are written under
    ``file_prefix``:

    * ``train_data{i}_clf{j}.dat`` - the fold's training rows with label ``1``
      where ``y == j`` and ``-1`` elsewhere;
    * ``test_data{i}_clf{j}.dat`` - the fold's held-out rows with an all-zero
      placeholder label.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
    y : array of shape (n_samples,)
        Integer class ids; compared against ``range(n_classes)``.
    X_submission : unused
        Accepted for signature compatibility; nothing is written for it here.
    n_classes : int
    n_folds : int, default 5
    file_prefix : str
        Directory prefix (including the trailing separator) prepended to every
        file name. Defaults to the location previously hard-coded here.
    """
    skf = list(StratifiedKFold(n_folds, random_state=0, shuffle=True).split(X, y))

    for i, (train_idx, test_idx) in enumerate(skf):
        print ("  Fold %d" % i)
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_test = X[test_idx]
        # The held-out labels are deliberately not exported; zeros stand in for them.
        placeholder_labels = np.zeros((X_test.shape[0],))

        for j in range(0, n_classes):
            # note that for sofia-ml (and vowpal wabbit), labels need to be {-1,1}, not {0,1}, so we change them
            print((y_train != j).sum(0))
            tmp_y_train = np.where(y_train == j, 1, -1)
            print((tmp_y_train == -1).sum(0))

            fold_class = str(i) + "_clf" + str(j)
            model_file = file_prefix + "model" + fold_class + ".model"
            training_file = file_prefix + "train_data" + fold_class + ".dat"
            test_file = file_prefix + "test_data" + fold_class + ".dat"
            pred_file = file_prefix + "pred" + fold_class + ".csv"
            # model_file / pred_file are only reported here; nothing is written to them.
            print("model_file: " + model_file)
            print("training_file: " + training_file)
            print("test_file: " + test_file)
            print("pred_file: " + pred_file)

            # export data
            dump_svmlight_file(X_train, tmp_y_train, training_file, zero_based=False)
            dump_svmlight_file(X_test, placeholder_labels, test_file, zero_based=False)
            
def export_data(X_submission, file_prefix="C:/Users/JasonLzF/sofia/"):
    """Write the submission features to ``<file_prefix>test_data.dat`` in svmlight format.

    Parameters
    ----------
    X_submission : array of shape (n_samples, n_features)
    file_prefix : str
        Directory prefix (including the trailing separator) of the output
        file. Defaults to the location previously hard-coded here.

    Every row is written with a placeholder label of zero and 1-based feature
    indices (``zero_based=False``).
    """
    test_file = file_prefix + "test_data.dat"
    placeholder_labels = np.zeros((X_submission.shape[0],))

    # export data
    dump_svmlight_file(X_submission, placeholder_labels, test_file, zero_based=False)

def model_CV_train(X, y, X_submission, loop_type, n_classes, n_folds=5,
                   file_prefix="C:/Users/JasonLzF/sofia/"):
    """Train one-vs-rest sofia-ml models per CV fold and collect their predictions.

    For every fold ``i`` and class ``j`` this runs ``sofia-ml.exe`` three times
    through the shell: once to train on ``train_data{i}_clf{j}.dat``, once to
    score ``test_data{i}_clf{j}.dat`` (the fold's held-out rows) and once to
    score ``test_data.dat`` (the submission rows). The result files are read
    back and passed through a logistic function.

    The feature values themselves come from those files, not from ``X``:
    ``X`` and ``X_submission`` only provide the row counts, and ``X``/``y``
    drive the fold split. The split uses the same ``StratifiedKFold`` settings
    (``random_state=0``, ``shuffle=True``) as ``export_CV_data``, so the files
    are expected to have been written with the same ``y`` and ``n_folds``.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
    y : array of shape (n_samples,)
    X_submission : array of shape (n_submission, n_features)
    loop_type : str
        Value passed to sofia-ml's ``--loop_type`` option.
    n_classes : int
    n_folds : int, default 5
    file_prefix : str
        Directory prefix (including the trailing separator) holding the data
        files; model and prediction files are written there too. Defaults to
        the location previously hard-coded here.

    Returns
    -------
    stack_train : ndarray of shape (n_samples, n_classes)
        Out-of-fold per-class probabilities (rows are not normalised).
    stack_test : ndarray of shape (n_submission, n_classes)
        Per-class submission probabilities averaged over the folds.
    summary : float
        Log loss averaged over the folds.

    Raises
    ------
    subprocess.CalledProcessError
        If a sofia-ml invocation exits with a non-zero status.
    subprocess.TimeoutExpired
        If scoring the submission file takes longer than 100 seconds.
    """
    def shell_quote(path):
        # Each command is handed to the shell as a single string, so paths are
        # wrapped in double quotes.
        return "\"" + path + "\""

    def read_probabilities(path):
        # First tab-separated column of the results file, squashed to (0, 1).
        scores = pd.read_csv(path, sep="\t", names=["pred", "true"])['pred'].to_numpy()
        return 1. / (1. + np.exp(-scores))

    submission_file = file_prefix + "test_data.dat"
    skf = list(StratifiedKFold(n_folds, random_state=0, shuffle=True).split(X, y))

    stack_train = np.zeros((X.shape[0], n_classes))
    stack_test_model = np.zeros((X_submission.shape[0], n_classes, len(skf)))

    avg_logloss = 0
    for i, (train_idx, test_idx) in enumerate(skf):
        print ("  Fold %d" % i)
        y_test = y[test_idx]

        for j in range(0, n_classes):
            print ("  Clf %d" % j)
            fold_class = str(i) + "_clf" + str(j)
            model_file = file_prefix + "model" + fold_class + ".model"
            training_file = file_prefix + "train_data" + fold_class + ".dat"
            test_file = file_prefix + "test_data" + fold_class + ".dat"
            pred_file = file_prefix + "pred" + fold_class + ".csv"
            pred_file_test = file_prefix + "pred_test" + fold_class + ".csv"

            # train via subprocess call
            check_output("sofia-ml.exe --rank_step_probability 0.8 --lambda 0.001 --passive_aggressive_c 0.1 --passive_aggressive_lambda 0.001 --iterations 300000 --learner_type logreg-pegasos --loop_type "+loop_type+" --prediction_type logistic --training_file "+shell_quote(training_file)+" --model_out "+shell_quote(model_file), shell = True)
            # score the fold's held-out rows
            check_output("sofia-ml.exe --model_in "+shell_quote(model_file)+" --test_file "+shell_quote(test_file)+" --results_file "+shell_quote(pred_file), shell = True)
            # score the submission rows with the same fold model
            check_output("sofia-ml.exe --model_in "+shell_quote(model_file)+" --test_file "+shell_quote(submission_file)+" --results_file "+shell_quote(pred_file_test), timeout = 100, shell = True)

            stack_train[test_idx, j] = read_probabilities(pred_file)
            stack_test_model[:, j, i] = read_probabilities(pred_file_test)

        # The per-class columns come from independent binary models, so a row
        # does not sum to one. log_loss scores rows as class distributions, so
        # rescale a copy for scoring only; stack_train keeps the raw values.
        fold_pred = stack_train[test_idx, :]
        fold_pred = fold_pred / fold_pred.sum(axis=1, keepdims=True)
        logloss = evaluate(y_test, fold_pred)
        avg_logloss += logloss
        print ("  logloss: %f" % logloss)
    avg_logloss = avg_logloss / n_folds
    print ("model average logloss: %f" % avg_logloss)
    summary = avg_logloss
    stack_test = stack_test_model.mean(axis=2)

    return stack_train, stack_test, summary

def get_model_stack_test(n_classes=9, n_folds=5, file_prefix="C:/Users/JasonLzF/sofia/"):
    """Rebuild the fold-averaged submission predictions from saved result files.

    Reads ``<file_prefix>pred_test{i}_clf{j}.csv`` for every fold ``i`` and
    class ``j``, applies a logistic function to the first tab-separated column
    and averages the per-class probabilities over the folds.

    The number of rows is taken from the files themselves, so the function no
    longer depends on an ``X_submission`` variable existing at module level.

    Parameters
    ----------
    n_classes : int, default 9
    n_folds : int, default 5
    file_prefix : str
        Directory prefix (including the trailing separator) of the result
        files. Defaults to the location previously hard-coded here.

    Returns
    -------
    ndarray of shape (n_rows, n_classes)

    Raises
    ------
    ValueError
        If ``n_classes`` or ``n_folds`` is smaller than one, or if the result
        files do not all contain the same number of rows.
    """
    if n_classes < 1 or n_folds < 1:
        raise ValueError("n_classes and n_folds must both be at least 1")

    fold_predictions = []
    for i in range(n_folds):
        class_columns = []
        for j in range(n_classes):
            pred_file_test = file_prefix + "pred_test" + str(i) + "_clf" + str(j) + ".csv"
            scores = pd.read_csv(pred_file_test, sep="\t", names=["pred", "true"])['pred'].to_numpy(dtype=float)

            # do logistic transformation to get probabilities
            class_columns.append(1. / (1. + np.exp(-scores)))
        # column_stack raises ValueError when the files disagree on row count.
        fold_predictions.append(np.column_stack(class_columns))

    return np.mean(fold_predictions, axis=0)

def generate_sets():
    """Pick four 3-element combinations from a fixed list of integers.

    All length-3 combinations of the list are enumerated with ``rSubset`` and
    four of them are drawn with ``random.sample`` after seeding the global
    ``random`` generator with 30, so the same four are returned on every call.
    Progress is printed along the way.

    Returns
    -------
    list of 3-tuples of int
    """
    # 3 level interactions
    candidates = [33, 47, 15, 38, 61, 67, 59, 66, 21, 17, 13, 10, 42]
    triples = rSubset(candidates, 3)
    print('number of sets' + str(len(triples)))

    random.seed(30)
    picked_positions = random.sample(range(0, len(triples)), 4)

    chosen = []
    for position in picked_positions:
        print(position)
        chosen.append(triples[position])
        print(chosen)

    for triple in chosen:
        print(triple)

    return chosen

def interactions(sets, X, y, X_submission, Xcs, Xcs_submission, transform_):
    """Extend the feature matrices with extra columns and 3-way interaction terms.

    Steps, applied identically to the training and submission matrices:

    1. run ``process_data`` with ``transform_`` on the input matrix;
    2. append the columns of ``Xcs`` / ``Xcs_submission``;
    3. for every entry of ``sets``, append ``log1p`` of the product of the
       three *untransformed* input columns it names (its first three items);
    4. unless ``transform_`` is ``'log'``, run ``process_data`` with
       ``transform_`` once more on the widened matrix.

    Returns
    -------
    (X, y, X_submission) : the widened matrices and the labels as returned by
    ``process_data``.
    """
    raw_train = np.copy(X)
    raw_submission = np.copy(X_submission)

    def log_triple_product(matrix, combo):
        # One new column: log(1 + a * b * c) over the three named columns.
        product = matrix[:, combo[0]] * matrix[:, combo[1]] * matrix[:, combo[2]]
        return np.log1p(product).reshape(-1, 1)

    X, y, X_submission = process_data(X, y, X_submission, transform = transform_)

    train_blocks = [X, Xcs]
    submission_blocks = [X_submission, Xcs_submission]
    for combo in sets:
        train_blocks.append(log_triple_product(raw_train, combo))
        submission_blocks.append(log_triple_product(raw_submission, combo))

    X = np.hstack(train_blocks)
    X_submission = np.hstack(submission_blocks)

    if transform_ != 'log':
        print('not log')
        X, y, X_submission = process_data(X, y, X_submission, transform = transform_)

    return X, y, X_submission

def process_data(X, y, X_submission, ylabel='target', transform=None):
    """Apply a named feature transform to the training and submission matrices.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
    y : array
        Returned unchanged.
    X_submission : array of shape (n_submission, n_features)
    ylabel : str
        Unused; kept for signature compatibility.
    transform : str or None
        ``None`` or an empty string leaves the data untouched. Otherwise the
        first whitespace-separated token selects the transform and an optional
        second token is an integer argument (the cluster count for kmeans):

        * ``'standarization'`` - ``StandardScaler`` fitted on ``X``;
        * ``'log'`` - ``log1p``;
        * ``'sqrt'`` - ``sqrt(x + 3/8)``;
        * ``'pca'`` - 3 principal components fitted on ``X``;
        * ``'pca+'`` - original columns plus those 3 components;
        * ``'tsne'`` - 3-dimensional t-SNE embedding;
        * ``'tsne+'`` - original columns plus that embedding;
        * ``'kmeans K'`` - cluster index from ``KMeans`` with ``K`` clusters
          fitted on ``X`` (1-D outputs).

        Any other name leaves the data untouched.

    Returns
    -------
    (X, y, X_submission)

    Raises
    ------
    ValueError
        If the second token is not an integer, or ``'kmeans'`` is requested
        without a cluster count.
    """
    print(transform)

    tokens = transform.split() if transform else []
    if not tokens:
        # Nothing requested (this is also the default): return the inputs as they are.
        return X, y, X_submission

    name = tokens[0]
    k = int(tokens[1]) if len(tokens) > 1 else None

    if name == 'standarization':
        scaler = StandardScaler()
        scaler.fit(X)
        X = scaler.transform(X)
        X_submission = scaler.transform(X_submission)
    elif name == 'log':
        X = np.log1p(X)
        X_submission = np.log1p(X_submission)
    elif name == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif name == 'pca':
        pca = PCA(n_components=3).fit(X)
        X = pca.transform(X)
        X_submission = pca.transform(X_submission)
    elif name in ('tsne', 'tsne+'):
        # TSNE only offers fit_transform, and two separate runs give
        # coordinates that are not comparable with each other. Embed the
        # training and submission rows together, then split them again.
        n_train = X.shape[0]
        embedding = TSNE(n_components=3).fit_transform(np.vstack((X, X_submission)))
        if name == 'tsne':
            X = embedding[:n_train]
            X_submission = embedding[n_train:]
        else:
            X = np.hstack((X, embedding[:n_train]))
            X_submission = np.hstack((X_submission, embedding[n_train:]))
    elif name == 'kmeans':
        if k is None:
            raise ValueError("the 'kmeans' transform needs a cluster count, e.g. 'kmeans 8'")
        kmeans = KMeans(n_clusters = k).fit(X)
        X = kmeans.labels_
        X_submission = kmeans.predict(X_submission)
    elif name == 'pca+':
        pca = PCA(n_components=3).fit(X)
        # Project each matrix before widening it, and widen the submission
        # matrix with its own rows rather than the training rows.
        X_components = pca.transform(X)
        X_submission_components = pca.transform(X_submission)
        X = np.hstack((X, X_components))
        X_submission = np.hstack((X_submission, X_submission_components))
    return X, y, X_submission

def main():
    """Build, train and export the three sofia-ml models (11, 12 and 13).

    Reads ``train.csv`` / ``test.csv`` from the working directory and, for each
    model, prepares a feature matrix, exports the per-fold data files, trains
    through ``model_CV_train`` and saves the predictions to
    ``model<NN>_train.csv`` / ``model<NN>_test.csv``.

    * Model 11: standardised features, ``balanced-stochastic`` loop.
    * Model 12: standardised features plus t-SNE columns and interaction
      terms, ``balanced-stochastic`` loop. The t-SNE columns are also saved to
      ``tsne_raw_train.csv`` / ``tsne_raw_test.csv``.
    * Model 13: log features plus the saved t-SNE columns and interaction
      terms, ``combined-roc`` loop.
    """
    start_time = time.time()

    logging.basicConfig(level=logging.DEBUG,
                        format='[%(asctime)s]: %(message)s ',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        stream=sys.stdout,
                        )

    # load data
    logging.info('Load data')
    train_df, test_df = load_data(train_data_path='train.csv', test_data_path='test.csv')

    # Process data
    X = train_df.drop('target', axis=1).to_numpy()
    y = train_df['target'].to_numpy()
    X_submission = test_df.to_numpy()

    def train_and_save(model_name, labels, features, features_submission, loop_type):
        # Export the prepared features, train on the exported files, save the predictions.
        export_CV_data(features, labels, features_submission, 9)
        export_data(features_submission)
        # model_CV_train gets the raw matrices, as before: it takes row counts
        # and the fold split from them and reads the features from the files.
        train_models_pred, test_models_pred, _ = model_CV_train(X, labels, X_submission, loop_type, 9)
        np.savetxt(model_name + "_train.csv", train_models_pred, delimiter=",")
        np.savetxt(model_name + "_test.csv", test_models_pred, delimiter=",")

    # Model 11
    Xo, y, Xo_submission = process_data(np.copy(X), y, np.copy(X_submission), transform = 'standarization')
    train_and_save("model11", y, Xo, Xo_submission, "balanced-stochastic")

    # Model 12
    print('tsne starts')
    Xcs, y, Xcs_submission = process_data(np.copy(X), y, np.copy(X_submission), transform = 'tsne')
    print('tsne finishes')
    np.savetxt("tsne_raw_train.csv", Xcs, delimiter=",")
    np.savetxt("tsne_raw_test.csv", Xcs_submission, delimiter=",")
    Xo, y, Xo_submission = interactions(generate_sets(), np.copy(X), y, np.copy(X_submission), Xcs, Xcs_submission,'standarization')
    # The test predictions returned by model_CV_train are saved directly. The
    # earlier code replaced them with the function object get_model_stack_test
    # (never called), which np.savetxt cannot write.
    train_and_save("model12", y, Xo, Xo_submission, "balanced-stochastic")

    # Model 13
    Xcs = pd.read_csv("tsne_raw_train.csv", header=None).to_numpy()
    Xcs_submission = pd.read_csv("tsne_raw_test.csv", header=None).to_numpy()
    Xo, y, Xo_submission = interactions(generate_sets(), np.copy(X), y, np.copy(X_submission), Xcs, Xcs_submission, 'log')
    train_and_save("model13", y, Xo, Xo_submission, "combined-roc")

    end_time = time.time()
    logging.info("Run complete: %s elapsed" % elapsed_time(start_time, end_time))
    
# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

[2020-03-07 05:12:38]: Load data 
  Fold 0
47959
47959
model_file: C:/Users/JasonLzF/sofia/model0_clf0.model
training_file: C:/Users/JasonLzF/sofia/train_data0_clf0.dat
test_file: C:/Users/JasonLzF/sofia/test_data0_clf0.dat
pred_file: C:/Users/JasonLzF/sofia/pred0_clf0.csv
36605
36605
model_file: C:/Users/JasonLzF/sofia/model0_clf1.model
training_file: C:/Users/JasonLzF/sofia/train_data0_clf1.dat
test_file: C:/Users/JasonLzF/sofia/test_data0_clf1.dat
pred_file: C:/Users/JasonLzF/sofia/pred0_clf1.csv
43098
43098
model_file: C:/Users/JasonLzF/sofia/model0_clf2.model
training_file: C:/Users/JasonLzF/sofia/train_data0_clf2.dat
test_file: C:/Users/JasonLzF/sofia/test_data0_clf2.dat
pred_file: C:/Users/JasonLzF/sofia/pred0_clf2.csv
47350
47350
model_file: C:/Users/JasonLzF/sofia/model0_clf3.model
training_file: C:/Users/JasonLzF/sofia/train_data0_clf3.dat
test_file: C:/Users/JasonLzF/sofia/test_data0_clf3.dat
pred_file: C:/Users/JasonLzF/sofia/pred0_clf3.csv
47310
47310
model_file: C:/Users/

  Fold 4
47959
47959
model_file: C:/Users/JasonLzF/sofia/model4_clf0.model
training_file: C:/Users/JasonLzF/sofia/train_data4_clf0.dat
test_file: C:/Users/JasonLzF/sofia/test_data4_clf0.dat
pred_file: C:/Users/JasonLzF/sofia/pred4_clf0.csv
36606
36606
model_file: C:/Users/JasonLzF/sofia/model4_clf1.model
training_file: C:/Users/JasonLzF/sofia/train_data4_clf1.dat
test_file: C:/Users/JasonLzF/sofia/test_data4_clf1.dat
pred_file: C:/Users/JasonLzF/sofia/pred4_clf1.csv
43100
43100
model_file: C:/Users/JasonLzF/sofia/model4_clf2.model
training_file: C:/Users/JasonLzF/sofia/train_data4_clf2.dat
test_file: C:/Users/JasonLzF/sofia/test_data4_clf2.dat
pred_file: C:/Users/JasonLzF/sofia/pred4_clf2.csv
47350
47350
model_file: C:/Users/JasonLzF/sofia/model4_clf3.model
training_file: C:/Users/JasonLzF/sofia/train_data4_clf3.dat
test_file: C:/Users/JasonLzF/sofia/test_data4_clf3.dat
pred_file: C:/Users/JasonLzF/sofia/pred4_clf3.csv
47312
47312
model_file: C:/Users/JasonLzF/sofia/model4_clf4.model
t

In [14]:
# Scratch check of numpy's logarithm on a negative input; the recorded output below is nan.
np.log(-10)

  """Entry point for launching an IPython kernel.


nan