In [2]:
import pandas as pandas
import numpy as numpy
import ffm
import _pickle as pickle
#import _pickle as pickle

In [3]:
from fastFM.mcmc import FMClassification, FMRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.datasets import dump_svmlight_file

### Data fields
* Label - Target variable that indicates if an ad was clicked (1) or not (0).
* I1-I13 - A total of 13 columns of integer features (mostly count features).
* C1-C26 - A total of 26 columns of categorical features. The values of these features have been hashed onto 32 bits for anonymization purposes.

In [4]:
# Read the train and test CSV files into DataFrames (train first, then test).
train_data, test_data = (
    pandas.read_csv(csv_path)
    for csv_path in ('./train.tiny.csv', './test.tiny.csv')
)

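### 資料格式 (Data format): arguments of `convert_to_ffm`
* df = DataFrame to be converted to ffm format (must contain the `Label` column)
* type = prefix of the output file name (Train/Test/Val), e.g. `Train` writes `Train_ffm.txt`
* numerics = list of all numeric fields
* categories = list of all categorical fields
* features = list of all features except the Label and Id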

In [5]:
# Based on Kaggle kernel by Scirpus
def convert_to_ffm(df,type,numerics,categories,features,catcodes=None):
    """Write ``df`` to ``<type>_ffm.txt`` as ``label field:feature:value ...`` lines.

    Every row becomes one line: the integer ``Label`` followed by one
    ``field:feature:value`` triple per column listed in ``numerics`` and
    ``categories``. A numeric column uses its field position as the feature
    index and the cell content as the value. A categorical column gets a
    running integer code for each distinct value and always has the value 1.

    Parameters
    ----------
    df : DataFrame holding a ``Label`` column plus all listed columns.
    type : prefix of the output file name ('Train' -> 'Train_ffm.txt').
    numerics : list of numeric column names.
    categories : list of categorical column names.
    features : not used by the conversion; kept so existing callers still work.
    catcodes : optional dict ``{column: {value: code}}``. When given, the
        codes it already holds are reused and newly seen values are added
        to it in place. Passing the same dict to the train and the test
        conversion keeps the feature indices of both files consistent.
        When omitted, a fresh table is built for this call only.

    Returns
    -------
    None. The result is the file written to the working directory.
    """
    # Flagging categorical (1) and numerical (0) fields
    catdict = {}
    for x in numerics:
        catdict[x] = 0
    for x in categories:
        catdict[x] = 1

    if catcodes is None:
        catcodes = {}

    # Category codes start after the numeric block; when a table is reused,
    # continue after the largest code it has already handed out.
    currentcode = len(numerics)
    for known in catcodes.values():
        if known:
            currentcode = max(currentcode, max(known.values()))

    # The field order is the same for every row, so compute it once.
    fields = list(enumerate(catdict.keys()))

    with open(str(type) + "_ffm.txt", "w") as text_file:
        # Looping over rows to convert each row to libffm format
        for r in range(df.shape[0]):
            datarow = df.iloc[r].to_dict()
            parts = [str(int(datarow['Label']))]
            for i, x in fields:
                if catdict[x] == 0:
                    # Numerical field: a dummy feature index equal to the field index
                    parts.append(str(i) + ":" + str(i) + ":" + str(datarow[x]))
                else:
                    codes = catcodes.setdefault(x, {})
                    value = datarow[x]
                    if value not in codes:
                        # First time this value is seen: assign the next free code
                        currentcode += 1
                        codes[value] = currentcode
                    parts.append(str(i) + ":" + str(int(codes[value])) + ":1")
            text_file.write(" ".join(parts) + "\n")

In [6]:
# Positional split of the training frame: columns 2-14 are the numeric
# features, columns 15-40 the categorical ones.
num_col_tr = train_data.iloc[:, 2:15]
# Missing numeric values are replaced by the column mean.
num_col_tr = pandas.DataFrame(num_col_tr.fillna(num_col_tr.mean()))
# Missing categorical values are replaced by 0.
cat_col_tr = pandas.DataFrame(train_data.iloc[:, 15:41].fillna(0))
all_col_tr = pandas.concat([num_col_tr, cat_col_tr], axis=1)

# Preview each frame, separated by blank lines.
print(num_col_tr.head(), '\n', cat_col_tr.head(), '\n', all_col_tr.head(), sep='\n')

         I1   I2         I3         I4      I5          I6    I7   I8     I9  \
0  1.000000    1   5.000000   0.000000  1382.0    4.000000  15.0  2.0  181.0   
1  2.000000    0  44.000000   1.000000   102.0    8.000000   2.0  2.0    4.0   
2  2.000000    0   1.000000  14.000000   767.0   89.000000   4.0  2.0  245.0   
3  3.696396  893  33.079355   8.062698  4392.0  146.925631   0.0  0.0    0.0   
4  3.000000   -1  33.079355   0.000000     2.0    0.000000   3.0  0.0    0.0   

        I10  I11       I12        I13  
0  1.000000  2.0  1.042697   2.000000  
1  1.000000  1.0  1.042697   4.000000  
2  1.000000  3.0  3.000000  45.000000  
3  0.561261  0.0  1.042697  11.784674  
4  1.000000  1.0  1.042697   0.000000  


         C1        C2        C3        C4        C5        C6        C7  \
0  68fd1e64  80e26c9b  fb936136  7b4723c4  25c83c98  7e0ccccf  de7995b8   
1  68fd1e64  f0cf0024  6f67f7e5  41274cd7  25c83c98  fe6b92e5  922afcc0   
2  287e684f  0a519c5c  02cf9876  c18be181  25c83c98 

In [7]:
# Same positional split as for the training frame.
num_col_te = test_data.iloc[:, 2:15]
# Missing numeric values are filled with the column means of num_col_tr
# (the training frame), not with means computed on the test frame.
num_col_te = pandas.DataFrame(num_col_te.fillna(num_col_tr.mean()))
cat_col_te = pandas.DataFrame(test_data.iloc[:, 15:41].fillna(0))
all_col_te = pandas.concat([num_col_te, cat_col_te], axis=1)

# One shape per line.
print(num_col_te.shape, cat_col_te.shape, all_col_te.shape, sep='\n')

(1999, 13)
(1999, 26)
(1999, 39)


In [10]:
# Put the target column in front of the features, then write Train_ffm.txt
# and Test_ffm.txt.
# NOTE(review): the two calls below encode categorical values independently,
# so the same category is not guaranteed to get the same feature index in
# Train_ffm.txt and Test_ffm.txt - confirm this is intended.
train_data_Label = pandas.concat([train_data.Label, all_col_tr], axis=1)
convert_to_ffm(
    train_data_Label,
    'Train',
    num_col_tr.columns.tolist(),
    cat_col_tr.columns.tolist(),
    all_col_tr.columns.tolist(),
)

test_data_Label = pandas.concat([test_data.Label, all_col_te], axis=1)
convert_to_ffm(
    test_data_Label,
    'Test',
    num_col_te.columns.tolist(),
    cat_col_te.columns.tolist(),
    all_col_te.columns.tolist(),
)

In [None]:
import xlearn as xl

### fm

In [None]:
fm_model = xl.create_fm() # Use a plain factorization machine (FM), not the field-aware variant
fm_model.setTrain("Train_ffm.txt")  # Training data
fm_model.setValidate("Test_ffm.txt")  # Validation data (NOTE(review): same file that is predicted on below)
# param:
#  0. binary classification
#  1. learning rate : 0.2
#  2. regularization lambda : 0.002
#  3. evaluation metric : accuracy
param = {'task':'binary', 'lr':0.2, 'lambda':0.002,  'metric':'acc'}
# Train model; the trained model is stored in model_fm.out
fm_model.fit(param, "./model_fm.out")

# Prediction task
fm_model.setTest("Test_ffm.txt")  # Test data
fm_model.setSigmoid()  # Convert output to 0-1

# Start to predict
# The output result will be stored in output_fm.txt
fm_model.predict("./model_fm.out", "./output_fm.txt")

### ffm

In [None]:
# Training task
ffm_model = xl.create_ffm() # Use field-aware factorization machine
ffm_model.setTrain("Train_ffm.txt")  # Training data
ffm_model.setValidate("Test_ffm.txt")  # Validation data (NOTE(review): same file that is predicted on below)

# param:
#  0. binary classification
#  1. learning rate: 0.2
#  2. regularization lambda: 0.002
#  3. evaluation metric: accuracy
param = {'task':'binary', 'lr':0.2, 
         'lambda':0.002, 'metric':'acc'}

# Start to train
# The trained model will be stored in model.out
ffm_model.fit(param, './model.out')

# Prediction task
ffm_model.setTest("Test_ffm.txt")  # Test data
ffm_model.setSigmoid()  # Convert output to 0-1

# Start to predict
# The output result will be stored in output_ffm.txt
ffm_model.predict("./model.out", "./output_ffm.txt")

In [None]:
#! pip install git+https://github.com/coreylynch/pyFM
from pyfm import pylibfm

In [None]:
def fitpredict_logistic(trainX, trainY, testX, classification=True, **params):
    """One-hot encode the features, fit a linear baseline and predict on testX.

    The encoder is fitted on ``trainX`` only; categories that appear only in
    ``testX`` are ignored (``handle_unknown='ignore'``).

    classification=True  -> LogisticRegression, returns the second column of
                            ``predict_proba``.
    classification=False -> Ridge, returns ``predict``.
    Extra keyword arguments are forwarded to the estimator constructor.
    """
    onehot = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    train_encoded = onehot.transform(trainX)
    test_encoded = onehot.transform(testX)

    if not classification:
        model = Ridge(**params)
        model.fit(train_encoded, trainY)
        return model.predict(test_encoded)

    model = LogisticRegression(**params)
    model.fit(train_encoded, trainY)
    return model.predict_proba(test_encoded)[:, 1]

In [None]:
def fitpredict_libfm(trainX, trainY, testX, classification=True, rank=8, n_iter=100):
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    train_file = 'libfm_train.txt'
    test_file = 'libfm_test.txt'
    with open(train_file, 'w') as f:
        dump_svmlight_file(trainX, trainY, f=f)
    with open(test_file, 'w') as f:
        dump_svmlight_file(testX, numpy.zeros(testX.shape[0]), f=f)
    task = 'c' if classification else 'r'
    console_output = !$LIBFM_PATH -task $task -method mcmc -train $train_file -test $test_file -iter $n_iter -dim '1,1,$rank' -out output.libfm
    
    libfm_pred = pandas.read_csv('output.libfm', header=None).values.flatten()
    return libfm_pred

In [None]:
def fitpredict_fastfm(trainX, trainY, testX, classification=True, rank=8, n_iter=100):
    """One-hot encode the features and fit/predict with a fastFM MCMC model.

    The encoder is fitted on ``trainX`` only; unknown test categories are
    ignored. ``rank`` and ``n_iter`` are forwarded to the fastFM estimator.

    classification=True  -> FMClassification.fit_predict_proba
    classification=False -> FMRegression.fit_predict

    NOTE(review): check which label encoding FMClassification expects for
    ``trainY`` (fastFM documents {-1, 1}) - confirm against the caller.
    """
    onehot = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    train_encoded = onehot.transform(trainX)
    test_encoded = onehot.transform(testX)

    if not classification:
        regressor = FMRegression(rank=rank, n_iter=n_iter)
        return regressor.fit_predict(train_encoded, trainY, test_encoded)

    classifier = FMClassification(rank=rank, n_iter=n_iter)
    return classifier.fit_predict_proba(train_encoded, trainY, test_encoded)

In [None]:
def fitpredict_pylibfm(trainX, trainY, testX, classification=True, rank=8, n_iter=10):
    """One-hot encode the features, fit a pylibfm FM model and predict on testX.

    The encoder is fitted on ``trainX`` only; unknown test categories are
    ignored. ``rank`` is passed as ``num_factors`` and ``n_iter`` as
    ``num_iter``. For regression the targets are multiplied by ``1.``
    before fitting.
    """
    onehot = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    train_encoded = onehot.transform(trainX)
    test_encoded = onehot.transform(testX)

    if classification:
        task, targets = 'classification', trainY
    else:
        task, targets = 'regression', trainY * 1.

    model = pylibfm.FM(num_factors=rank, num_iter=n_iter, verbose=False, task=task)
    model.fit(train_encoded, targets)
    return model.predict(test_encoded)

In [None]:
# Fill missing values with 0 in every column whose dtype is not 'object',
# first in train_data and then in test_data (both are modified in place).
for frame in (train_data, test_data):
    for col in frame.columns:
        if frame[col].dtypes == 'object':
            continue
        frame.loc[:, col] = frame.loc[:, col].fillna(0)

In [None]:
from sklearn.metrics import roc_auc_score, mean_squared_error
from fastFM.mcmc import FMClassification, FMRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.datasets import dump_svmlight_file
from sklearn.model_selection import train_test_split
from pyfm import pylibfm
import sys
import _pickle as cPickle
from sklearn.metrics import roc_auc_score, mean_squared_error, classification_report

In [None]:
# Features: every column except 'Id' and 'Label'. Target: the 'Label' column.
trainX, testX = (
    frame.drop(columns=['Id', 'Label'])
    for frame in (train_data, test_data)
)
trainY, testY = train_data['Label'], test_data['Label']

In [None]:
# Variant of the feature matrices without the categorical columns
# (the column names are taken from cat_col_tr / cat_col_te).
trainX_t = trainX.drop(columns=list(cat_col_tr))
testX_t = testX.drop(columns=list(cat_col_te))
trainY_t, testY_t = train_data['Label'], test_data['Label']

In [None]:
# Replace every value by its absolute value, for features and targets alike.
trainX_t, testX_t = trainX_t.abs(), testX_t.abs()
trainY_t, testY_t = trainY_t.abs(), testY_t.abs()

In [None]:
#http://arogozhnikov.github.io/2016/02/15/TestingLibFM.html
#Below is simple mechanism, which preserves results between runs.
from collections import OrderedDict
import time

# Results of earlier runs, keyed by task name. Starts empty and is replaced
# by the content of ./saved_results.pkl when that file can be read.
all_results = OrderedDict()
try:
    # Pickle data is binary, so the file must be opened with 'rb'.
    # NOTE: only load a pickle file you wrote yourself - unpickling data from
    # an untrusted source can execute arbitrary code.
    with open('./saved_results.pkl', 'rb') as f:
        all_results = pickle.load(f)
except FileNotFoundError:
    # No results saved yet (first run): keep the empty dict.
    pass
except Exception as exc:
    # Loading stays best effort, but a broken file is reported, not hidden.
    print('Could not load ./saved_results.pkl: %r' % (exc,))

def test_on_dataset(trainX, testX, trainY, testY, task_name, classification=True, use_pylibfm=True):
    """Fit every algorithm on the train set, score it on the test set, save the results.

    Parameters
    ----------
    trainX, trainY : training features and targets.
    testX, testY : test features and targets.
    task_name : key under which the result table is stored in ``all_results``.
    classification : True scores with ROC AUC, False with RMSE; the flag is
        also forwarded to every ``fitpredict_*`` function.
    use_pylibfm : whether to include the pylibfm implementation.

    Returns
    -------
    DataFrame with one row per algorithm and the columns ``time`` (seconds
    spent in fit + predict) and ``ROC AUC`` or ``RMSE``.

    Side effects: stores the table in the module-level ``all_results`` and
    rewrites ``saved_results.pkl`` with the whole ``all_results`` dict.
    """
    algorithms = OrderedDict()
    algorithms['logistic'] = fitpredict_logistic
    algorithms['libFM']    = fitpredict_libfm
    algorithms['fastFM']   = fitpredict_fastfm
    if use_pylibfm:
        algorithms['pylibfm']  = fitpredict_pylibfm

    # Collect one record per algorithm and build the frame once at the end;
    # the former row-by-row ``DataFrame.ix`` assignment no longer exists in
    # pandas >= 1.0.
    names = []
    rows = []
    for name, fit_predict in algorithms.items():
        start = time.time()
        predictions = fit_predict(trainX, trainY, testX, classification=classification)
        row = OrderedDict()
        row['time'] = time.time() - start
        if classification:
            row['ROC AUC'] = roc_auc_score(testY, predictions)
        else:
            row['RMSE'] = numpy.mean((testY - predictions) ** 2) ** 0.5
        names.append(name)
        rows.append(row)
    results = pandas.DataFrame(rows, index=names)

    all_results[task_name] = results
    # pickle writes bytes, so the file has to be opened in binary mode.
    with open('saved_results.pkl', 'wb') as f:
        pickle.dump(all_results, f)

    return results

In [None]:
# NOTE(review): `load_problems` is not imported in any cell visible here, so
# this line fails with NameError on a fresh kernel unless the module is
# imported elsewhere - confirm where it comes from.
# NOTE(review): this rebinds trainX / testX / trainY / testY, which earlier
# cells built from train_data / test_data.
trainX, testX, trainY, testY = load_problems.load_problem_movielens_100k(all_features=False)
# Preview the first rows of the loaded training features.
trainX.head()