In [1]:
import xlearn as xl
import sys
import logging
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
import itertools

def elapsed_time(start_time, end_time):
    """Format the span between two timestamps as ``H:MM:SS``.

    Args:
        start_time: Start of the interval, in seconds.
        end_time: End of the interval, in seconds.

    Returns:
        A string with unpadded hours and zero-padded two-digit minutes and
        seconds; fractional seconds are truncated.
    """
    minute = 60
    hour = 60 * minute
    span = end_time - start_time

    hours = int(span / hour)
    minutes = int((span % hour) / minute)
    seconds = int(span % minute)
    return "%d:%02d:%02d" % (hours, minutes, seconds)

def evaluate(y, y_pred):
    """Return the log loss of the predictions ``y_pred`` against labels ``y``.

    Thin wrapper around ``sklearn.metrics.log_loss`` called with its default
    settings.
    """
    return log_loss(y, y_pred)

def load_data(train_data_path='data/train.csv', test_data_path = 'data/test.csv'):
    """Load the train and test CSV files and encode the training target.

    Both files are read with their first column as the index and their first
    row as the header.  The string ``target`` column of the training frame is
    replaced by a zero-based integer: the number at the end of the label
    minus one (e.g. ``"Class_3"`` becomes ``2``).

    Args:
        train_data_path: Path of the training CSV; must contain ``target``.
        test_data_path: Path of the test CSV.

    Returns:
        Tuple ``(train_df, test_df)`` of pandas DataFrames.

    Raises:
        ValueError: If a target label does not end in digits, because the
            integer cast then fails.
    """
    train_df = pd.read_csv(train_data_path, sep=',', index_col=0, header=0)
    test_df = pd.read_csv(test_data_path, sep=',', index_col=0, header=0)

    # Use every trailing digit rather than only the last character, so that
    # labels with two or more digits (e.g. "Class_10") are encoded correctly.
    class_number = train_df['target'].str.extract(r'(\d+)$', expand=False)
    train_df['target'] = class_number.astype(int) - 1

    return train_df, test_df

def model_CV_train(param, X, y, X_submission, train_df, test_df, n_classes, n_folds=5):
    """Cross-validate one-vs-rest xLearn FFM models and build stacking features.

    For every stratified fold and every class index ``j`` in
    ``range(n_classes)`` a binary FFM model (class ``j`` against the rest) is
    trained on the training part of the fold, then used to score both the
    held-out part of the fold and ``X_submission``.

    Args:
        param: Sequence ``(lr, lambda, k, init)`` forwarded to xLearn.
        X: Training feature matrix, indexed row-wise with the fold indices.
        y: Class index of every row of ``X``.
        X_submission: Feature matrix scored by every fold's models.
        train_df: Training frame; only its column names are used.  Names are
            assigned positionally, so ``'target'`` has to be its last column.
        test_df: Unused; kept so existing callers keep working.
        n_classes: Number of one-vs-rest models trained per fold.
        n_folds: Number of stratified folds.

    Returns:
        Tuple ``(stack_train, stack_test, summary)``:
        ``stack_train`` holds the out-of-fold predictions with shape
        ``(X.shape[0], n_classes)``; ``stack_test`` holds the submission
        predictions averaged over the folds with shape
        ``(X_submission.shape[0], n_classes)``; ``summary`` is the mean of
        the per-fold log losses.

    Side effects:
        Writes ``train_ffm.txt``, ``test_ffm.txt``, ``test1_ffm.txt``,
        ``model.out``, ``output.txt`` and ``output1.txt`` in the working
        directory and prints progress to stdout.
    """
    folds = list(StratifiedKFold(n_folds, random_state=0, shuffle=True).split(X, y))

    stack_train = np.zeros((X.shape[0], n_classes))
    stack_test_model = np.zeros((X_submission.shape[0], n_classes, len(folds)))

    param_ = {'task': 'binary',
              'lr': param[0],
              'lambda': param[1],
              'k': param[2],
              'init': param[3],
              'metric': 'acc'}

    # Every feature column is written as a numerical field; none is categorical.
    categories = {}
    features = train_df.drop('target', axis=1).columns

    # The submission rows never change, so they are converted a single time.
    # Their target column is only a placeholder required by the file format.
    X_submission_df = pd.DataFrame(data=X_submission)
    X_submission_df['target'] = 0
    X_submission_df.columns = train_df.columns
    convert_to_ffm(X_submission_df, 'test1', features, categories, features)

    avg_logloss = 0
    for i, (train_idx, test_idx) in enumerate(folds):
        print ("  Fold %d" % i)
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_test = X[test_idx]
        y_test = y[test_idx]

        # The held-out rows are identical for every class of this fold, so
        # the libffm file is written once here instead of once per class.
        X_test_df = pd.DataFrame(data=X_test)
        X_test_df['target'] = y_test
        X_test_df.columns = train_df.columns
        convert_to_ffm(X_test_df, 'test', features, categories, features)

        for j in range(n_classes):
            print ("  Clf %d" % j)

            # One-vs-rest labels: 1 for class j, 0 for every other class.
            X_train_df = pd.DataFrame(data=X_train)
            X_train_df['target'] = np.where(y_train == j, 1, 0)
            X_train_df.columns = train_df.columns
            convert_to_ffm(X_train_df, 'train', features, categories, features)

            ffm_model = xl.create_ffm()
            ffm_model.setTrain("train_ffm.txt")
            # The trained model is stored in model.out.
            ffm_model.fit(param_, './model.out')

            # Score the held-out part of the fold.
            ffm_model.setTest("test_ffm.txt")
            ffm_model.setSigmoid()  # Convert output to 0-1
            ffm_model.predict("./model.out", "./output.txt")

            # Score the submission rows with the same model.
            ffm_model.setTest("test1_ffm.txt")
            ffm_model.setSigmoid()  # Convert output to 0-1
            ffm_model.predict("./model.out", "./output1.txt")

            # Each output file is read back as a single row of scores.
            y_test_pred = pd.read_csv("./output.txt", header=None).to_numpy().reshape(1, -1)
            y_tests_pred = pd.read_csv("./output1.txt", header=None).to_numpy().reshape(1, -1)
            stack_train[test_idx, j] = y_test_pred
            stack_test_model[:, j, i] = y_tests_pred

        logloss = evaluate(y_test, stack_train[test_idx, :])
        avg_logloss += logloss
        print ("  logloss: %f" % logloss)

    avg_logloss = avg_logloss / n_folds
    print ("model average logloss: %f" % avg_logloss)
    summary = avg_logloss

    # Average the submission predictions over the folds.
    stack_test = stack_test_model.mean(axis=2)

    return stack_train, stack_test, summary

def convert_to_ffm(df,type,numerics,categories,features):
    """Write ``df`` to ``<type>_ffm.txt`` in libffm text format.

    Each row becomes one line: the integer ``target`` followed by
    space-separated ``field:feature:value`` entries.  The field index is the
    position of the column in the order numerics first, then categories.

    * Numerical columns are written as ``i:i:value`` and skipped when the
      value is zero.
    * Categorical columns are written as ``i:code:1``, where ``code`` is
      assigned the first time a (column, value) pair is seen, counting up
      from ``len(numerics) + 1``.

    Args:
        df: DataFrame holding a ``target`` column and every listed column.
        type: Prefix of the output file name (may include a directory).
        numerics: Iterable of numerical column names.
        categories: Iterable of categorical column names.  A name that is
            also listed in ``numerics`` is treated as categorical.
        features: Unused; kept so existing callers keep working.
    """
    next_code = len(numerics)
    is_categorical = {}
    codes_by_column = {}

    # Flag numerical (0) and categorical (1) fields; categories win on overlap.
    for column in numerics:
        is_categorical[column] = 0
    for column in categories:
        is_categorical[column] = 1

    with open(str(type) + "_ffm.txt", "w") as text_file:
        for position in range(df.shape[0]):
            row = df.iloc[position].to_dict()
            tokens = [str(int(row['target']))]
            zero_feature_count = 0

            for i, column in enumerate(is_categorical):
                value = row[column]

                if is_categorical[column] == 0:
                    # Sparse output: only non-zero numerical values are kept.
                    if value != 0:
                        tokens.append(str(i) + ":" + str(i) + ":" + str(value))
                    else:
                        zero_feature_count += 1
                    continue

                # Give every unseen (column, value) pair the next free code.
                column_codes = codes_by_column.setdefault(column, {})
                if value not in column_codes:
                    next_code += 1
                    column_codes[value] = next_code
                tokens.append(str(i) + ":" + str(int(column_codes[value])) + ":1")

            # NOTE(review): 93 is presumably the feature count of the data set
            # this was written for, so this fires for rows whose numerical
            # features are all zero and keeps the line from being empty by
            # appending three fixed dummy entries - confirm before reusing
            # with a different number of features.
            if zero_feature_count == 93:
                print('found')
                tokens.append("12:12:1 16:16:1 63:63:1")

            text_file.write(" ".join(tokens) + "\n")

def main():
    """Run the FFM pipeline end to end.

    Loads ``train.csv`` and ``test.csv``, cross-validates every combination
    of the parameter grid, reports the combination with the lowest mean log
    loss, then runs a final cross-validated fit and writes
    ``model5_train.csv``, ``model5_test.csv`` and ``FFM.csv``.
    """
    start_time = time.time()

    logging.basicConfig(
        level=logging.DEBUG,
        format='[%(asctime)s]: %(message)s ',
        datefmt='%Y-%m-%d %H:%M:%S',
        stream=sys.stdout,
        filemode="w",
    )

    # Load data
    logging.info('Load data')
    train_df, test_df = load_data(train_data_path='train.csv', test_data_path='test.csv')
    X = train_df.drop('target', axis=1).to_numpy()
    y = train_df['target'].to_numpy()
    X_submission = test_df.to_numpy()

    # Grid search: each candidate is an (lr, lambda, k, init) tuple.
    grid = {
        'lr': [0.01, 0.2, 0.5],
        'lambda': [0.002],
        'k': [4],
        'init': [0.80, 0.40, 0.10],
    }
    candidates = []
    mean_test_score = []
    for candidate in itertools.product(grid['lr'], grid['lambda'], grid['k'], grid['init']):
        lr, lambda_, k, init = candidate
        print("lr: {}, Lambda: {}, k: {}, init: {} ".format(lr, lambda_, k, init))
        _, _, score = model_CV_train(candidate, X, y, X_submission, train_df, test_df, 9, n_folds=5)
        candidates.append(candidate)
        mean_test_score.append(score)

    # Lower log loss is better.
    best = np.argmin(mean_test_score)
    best_lr, best_lambda, best_k, best_init = candidates[best]
    print("Best parameters: lr: {}, Lambda: {}, k: {}, init: {} \n mean_test_score: {} ".format(
        best_lr, best_lambda, best_k, best_init, mean_test_score[best]))

    # NOTE(review): the final fit uses these fixed values, not the winner of
    # the grid search above - confirm that this is intended.
    param = [0.5, 0.000002, 4, 0.66]
    train_models_pred, test_models_pred, _ = model_CV_train(
        param, X, y, X_submission, train_df, test_df, 9, n_folds=5)

    # Export the stacking features.
    np.savetxt("model5_train.csv", train_models_pred, delimiter=",")
    np.savetxt("model5_test.csv", test_models_pred, delimiter=",")

    # Build the submission: the raw files are read again to recover the
    # original class labels (column names) and the test ids.
    raw_train = pd.read_csv("train.csv")
    classes = np.unique(raw_train['target'])
    raw_test = pd.read_csv("test.csv")
    submission = pd.DataFrame(data=test_models_pred, columns=classes)
    submission.insert(0, 'id', raw_test['id'])
    submission.to_csv("FFM.csv", index=False)

    end_time = time.time()
    logging.info("Run complete: %s elapsed", elapsed_time(start_time, end_time))
    
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

[2020-03-07 00:33:22]: Load data 
found
  Fold 0
  Clf 0
  Clf 1
  Clf 2
  Clf 3
  Clf 4
  Clf 5
  Clf 6
  Clf 7
  Clf 8
  logloss: 0.528959
  Fold 1
  Clf 0
  Clf 1
  Clf 2
  Clf 3
  Clf 4
  Clf 5
  Clf 6
  Clf 7
  Clf 8
  logloss: 0.520290
  Fold 2
  Clf 0
  Clf 1
  Clf 2
  Clf 3
  Clf 4
  Clf 5
  Clf 6
  Clf 7
  Clf 8
  logloss: 0.529217
  Fold 3
  Clf 0
  Clf 1
  Clf 2
  Clf 3
  Clf 4
  Clf 5
  Clf 6
  Clf 7
  Clf 8
  logloss: 0.533642
  Fold 4
  Clf 0
  Clf 1
  Clf 2
  Clf 3
  Clf 4
  Clf 5
  Clf 6
  Clf 7
  Clf 8
  logloss: 0.522629
model average logloss: 0.526947
[2020-03-07 00:45:10]: Run complete: 0:11:48 elapsed 
