# [Quick Practice] 
-----
-----
## Learning Process - Factorization Machine
#### Using xlearn python-wrapper module (https://xlearn-doc.readthedocs.io/en/latest/)

- Using Titanic dataset. (https://www.kaggle.com/c/titanic)
- Original paper (Steffen Rendle, https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
- Basic conceptual description of FM, in Korean (http://yamalab.tistory.com/107)

-----
-----
### Step 1 : transform features (to FM's format)
- train/test split on dataframe

In [314]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [315]:
# Locations of the two Titanic CSV parts used by this notebook.
df1_path, df2_path = (
    "./dataset/titanic_dataset.csv",
    "./dataset/titanic_answer.csv",
)

In [316]:
# Load both CSV parts and stack them row-wise into a single frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and, like append, keeps each
# part's original row labels.
df1 = pd.read_csv(df1_path)
df2 = pd.read_csv(df2_path)
df = pd.concat([df1, df2])

In [317]:
print(df.shape)
df.head()

(1309, 13)


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,body,home.dest
0,2,1,"Mellinger, Miss. Madeleine Violet",female,13.0,0,1,250644,19.5,,S,,"England / Bennington, VT"
1,2,1,"Wells, Miss. Joan",female,4.0,1,1,29103,23.0,,S,,"Cornwall / Akron, OH"
2,2,1,"Duran y More, Miss. Florentina",female,30.0,1,0,SC/PARIS 2148,13.8583,,C,,"Barcelona, Spain / Havana, Cuba"
3,3,0,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q,,
4,3,1,"Bradley, Miss. Bridget Delia",female,22.0,0,0,334914,7.725,,Q,,"Kingwilliamstown, Co Cork, Ireland Glens Falls..."


In [318]:
def age_discretize(x):
    """Map an age to a ten-year bucket label encoded as a string.

    Ages below 10 map to '1', 10-19 to '2', ..., 80-89 to '9'. Ages of 90 or
    more, and missing ages (NaN/None), map to '10'. Fractional ages are
    truncated with int() before bucketing.

    Parameters
    ----------
    x : float or int or None
        Age in years; may be NaN/None when unknown.

    Returns
    -------
    str
        Bucket label between '1' and '10'.
    """
    # NaN never compares equal to anything (including itself), so an
    # `x == np.nan` check can never fire and int(nan) would raise instead.
    if pd.isna(x):
        return '10'

    age = int(x)
    if age >= 90:
        return '10'
    # Every value below 10 (negatives included) lands in the first bucket.
    return str(max(age // 10, 0) + 1)

def fare_discretize(x):
    """Return the bucket label (as a string) for the value `x`.

    Buckets are ten units wide: values below 10 give '1', values in [10, 20)
    give '2', and so on up to [80, 90) giving '9'. Anything that is not below
    90 gives '10'.
    """
    # Walk the bucket upper bounds in ascending order; the first bound that
    # exceeds the value decides the label.
    for label, upper_bound in enumerate(range(10, 100, 10), start=1):
        if x < upper_bound:
            return str(label)
    return '10'

-----
-----
- Discretize the features

In [319]:
# Keep only the label column plus the six feature columns used afterwards.
selected_columns = ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']
df = df[selected_columns]

In [320]:
# Discard every row that has a missing value in any remaining column.
df = df.dropna(axis=0, how='any')

In [321]:
# Encode sex as a '1'/'0' string flag ('1' only for "female"), then replace
# age and fare with their bucket labels. Fare is truncated to an integer
# before it is bucketed.
df['sex'] = df['sex'].eq("female").map({True: '1', False: '0'})
df['age'] = df['age'].map(age_discretize)
df['fare'] = df['fare'].map(lambda fare: fare_discretize(int(fare)))

In [322]:
# Cast the remaining numeric columns to strings, one column at a time.
for column_name in ('survived', 'pclass', 'sibsp', 'parch'):
    df[column_name] = df[column_name].astype('str')

In [323]:
# Separate the feature columns from the target column.
feature_columns = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']
df_x = df[feature_columns]
df_y = df['survived']

In [324]:
# One-hot encode every feature; generated columns are named
# "<feature>_<value>".
# dtype=int forces 0/1 columns: pandas >= 2.0 defaults to bool, which would be
# serialized as "True"/"False" when rows are later written out with str().
categorical_columns = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']
df_x = pd.get_dummies(df_x, prefix=categorical_columns,
                      columns=categorical_columns, dtype=int)

# Label first, then the dummy columns. ignore_index=True relabels the columns
# positionally (0 = label, 1..n = features).
df = pd.concat([df_y, df_x], axis=1, ignore_index=True)

# Fixed seed so that the split, and any metric computed on it, is reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

- save as xlearn's fm input type
- -> label index_1:value_1 index_2:value_2 ... index_n:value_n

In [325]:
def write_fm_file(frame, path):
    """Write `frame` to `path` as "label index_1:value_1 ... index_n:value_n" lines.

    The first column of `frame` is written as the label; each following column
    becomes one "index:value" token, with indices starting at 1.
    """
    # The context manager closes the file even if a row fails to serialize.
    with open(path, 'w') as out_file:
        for row in frame.itertuples():
            # row[0] is the DataFrame index, row[1] the label, row[2:] the features.
            tokens = [str(row[1])]
            tokens.extend("%d:%s" % (i - 1, row[i]) for i in range(2, len(row)))
            out_file.write("%s\n" % " ".join(tokens))


write_fm_file(train_df, './dataset/train.txt')
write_fm_file(test_df, './dataset/test.txt')

-----
-----
### Step 2 : Train Factorization Machine
- Test performance (AUC) looks fine, so the process works well!

In [326]:
import xlearn as xl

def test():
    """Train an FM binary classifier with xlearn and predict on the test file.

    Reads './dataset/train.txt' for training and './dataset/test.txt' for both
    validation and prediction. The model is stored in './model.out', its text
    dump is requested at './model.txt', and predictions go to './output.txt'.
    """
    fm_model = xl.create_fm()

    train_path = './dataset/train.txt'
    test_path = './dataset/test.txt'

    fm_model.setTrain(train_path)
    fm_model.setValidate(test_path)

    # Parameters:
    param = {'task':'binary',
             'epoch': 10,
             'lr':0.2,
             'lambda':0.002,
             'metric': 'auc'}

    # Request the text dump BEFORE fit(): xlearn's documented usage sets the
    # TXT model path ahead of training, so setting it afterwards had no effect
    # on this run.
    fm_model.setTXTModel('./model.txt')

    # Start to train
    # The trained model will be stored in model.out
    fm_model.fit(param, './model.out')

    # Prediction task
    fm_model.setTest(test_path)  # Set the path of test dataset
    fm_model.setSigmoid()                 # Convert output to 0-1

    # Start to predict
    # The output result will be stored in output.txt
    fm_model.predict("./model.out", "./output.txt")

In [327]:
test()

In [328]:
# [------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
# [   10%      ]     1            0.553290            0.496945            0.823581                0.06
# [   20%      ]     2            0.500396            0.479406            0.825238                0.03
# [   30%      ]     3            0.488841            0.478167            0.824928                0.03
# [   40%      ]     4            0.483605            0.466880            0.826170                0.03
# [   50%      ]     5            0.481843            0.469989            0.821613                0.03
# [   60%      ]     6            0.479052            0.466899            0.821303                0.03
# [   70%      ]     7            0.477003            0.472549            0.817471

-----
-----
### Step 3 : FFM (Field aware Factorization Machine) Practice
- Using the xlearn module, train an FFM model in the same way as the FM model.

In [329]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
135,0,0,0,1,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
171,0,0,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
639,1,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
784,0,0,0,1,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
388,0,0,0,1,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0


In [330]:
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
252,1,0,1,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
833,1,0,0,1,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
715,1,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
6,1,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
175,0,0,0,1,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0


In [331]:
df_x.columns

Index(['pclass_1', 'pclass_2', 'pclass_3', 'sex_0', 'sex_1', 'age_1', 'age_2',
       'age_3', 'age_4', 'age_5', 'age_6', 'age_7', 'age_8', 'age_9',
       'sibsp_0', 'sibsp_1', 'sibsp_2', 'sibsp_3', 'sibsp_4', 'sibsp_5',
       'sibsp_8', 'parch_0', 'parch_1', 'parch_2', 'parch_3', 'parch_4',
       'parch_5', 'parch_6', 'fare_1', 'fare_10', 'fare_2', 'fare_3', 'fare_4',
       'fare_5', 'fare_6', 'fare_7', 'fare_8', 'fare_9'],
      dtype='object')

-----
- The input format is different from that of FM or LM.
- -> label field_1:index_1:value_1 field_2:index_2:value_2 ...

In [332]:
# Names of the one-hot columns, in the order they appear in df_x.
col_names = list(df_x.columns)

# Field name -> field id string used in the FFM input files.
field_dict = {
    "Financial": "0",
    "Demography": "1",
    "Family": "2",
}

# Original feature name -> the field it is assigned to.
mapping_dict = {
    "pclass": "Financial",
    "fare": "Financial",
    "sex": "Demography",
    "age": "Demography",
    "sibsp": "Family",
    "parch": "Family",
}

In [333]:
col_names[0]

'pclass_1'

In [334]:
def write_ffm_file(frame, path):
    """Write `frame` to `path` as "label field:index:value ..." lines.

    The first column of `frame` is written as the label; each following column
    becomes one "field:index:value" token, with indices starting at 1. The
    field of a column is looked up from the prefix (text before the first
    "_") of the matching entry in `col_names`, via `mapping_dict` and
    `field_dict`.
    """
    # The field of a column does not depend on the row, so resolve it once.
    field_ids = [field_dict[mapping_dict[name.split("_")[0]]] for name in col_names]

    # The context manager closes the file even if a row fails to serialize.
    with open(path, 'w') as out_file:
        for row in frame.itertuples():
            # row[0] is the DataFrame index, row[1] the label, row[2:] the features.
            tokens = [str(row[1])]
            tokens.extend(
                "%s:%d:%s" % (field_ids[i - 2], i - 1, row[i])
                for i in range(2, len(row))
            )
            out_file.write("%s\n" % " ".join(tokens))


write_ffm_file(train_df, './dataset/train_ffm.txt')
write_ffm_file(test_df, './dataset/test_ffm.txt')

-----
-----
- Train the FFM model: it works well too.
- FFM models work better than FM if the fields are clearly separated

In [335]:
import xlearn as xl

def test2():
    """Train an FFM binary classifier with xlearn and predict on the test file.

    Reads './dataset/train_ffm.txt' for training and './dataset/test_ffm.txt'
    for both validation and prediction. The model is stored in './model.out',
    its text dump is requested at './model.txt', and predictions go to
    './output.txt'.
    """
    ffm_model = xl.create_ffm()

    train_path = './dataset/train_ffm.txt'
    test_path = './dataset/test_ffm.txt'

    ffm_model.setTrain(train_path)
    ffm_model.setValidate(test_path)

    # Parameters:
    param = {'task':'binary',
             'epoch': 10,
             'lr':0.2,
             'lambda':0.002,
             'metric': 'auc',
             'opt':'sgd'}

    # Request the text dump BEFORE fit(): xlearn's documented usage sets the
    # TXT model path ahead of training, so setting it afterwards had no effect
    # on this run.
    ffm_model.setTXTModel('./model.txt')

    # Start to train
    # The trained model will be stored in model.out
    ffm_model.fit(param, './model.out')

    # Prediction task
    ffm_model.setTest(test_path)  # Set the path of test dataset
    ffm_model.setSigmoid()                 # Convert output to 0-1

    # Start to predict
    # The output result will be stored in output.txt
    ffm_model.predict("./model.out", "./output.txt")

In [336]:
test2()

In [337]:
# [------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
# [   10%      ]     1            0.550938            0.460217            0.860457                0.06
# [   20%      ]     2            0.508593            0.447501            0.860558                0.03
# [   30%      ]     3            0.489172            0.514961            0.863678                0.03
# [   40%      ]     4            0.492407            0.507981            0.855525                0.03
# [   50%      ]     5            0.489974            0.433608            0.864281                0.03
# [   60%      ]     6            0.489920            0.451084            0.858746                0.03
# [   70%      ]     7            0.496275            0.453510            0.859551                0.03
# [   80%      ]     8            0.494443            0.451638            0.859853                0.03
# [   90%      ]     9            0.489285            0.445428            0.857538                0.03
# [  100%      ]    10            0.493942            0.441266            0.862470                0.03

-----
-----
### Step 4 : Hyper-parameter Tuning
- various tuning methods

In [338]:
import xlearn as xl

def test3():
    """Cross-validate an FFM model with xlearn, with normalization turned off.

    Uses './dataset/train_ffm.txt' as the training file and
    './dataset/test_ffm.txt' as the validation file, then runs xlearn's
    cross-validation with the settings below. No model file is written.
    """
    cv_model = xl.create_ffm()
    cv_model.setTrain('./dataset/train_ffm.txt')
    cv_model.setValidate('./dataset/test_ffm.txt')

    settings = {
        'task': 'binary',
        'epoch': 30,
        'lr': 0.2,           # learning rate
        'lambda': 0.002,     # L2 parameter
        'metric': 'auc',
        'opt': 'sgd',        # optimizer
        'stop_window': 3,    # early-stopping window size
        'fold': 3,           # k-fold parameter
    }

    # normalization off
    cv_model.disableNorm()

    # k-fold cross-validation only; no final fit() is performed here.
    cv_model.cv(settings)


In [339]:
test3() # training diverges (log-loss becomes inf/nan in the log below); the author attributes this to disableNorm()

In [340]:
# [------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
# [    3%      ]     1            1.690606            1.783437            0.764351                0.04
# [    6%      ]     2                 inf                 inf            0.532923                0.04
# [   10%      ]     3                 inf                 inf            0.409877                0.03
# [   13%      ]     4                 inf                 inf            0.470917                0.03
# [   16%      ]     5                 inf                 inf            0.533844                0.03
# [   20%      ]     6                 inf                 inf            0.500000                0.03
# [   23%      ]     7                 inf                 inf            0.500000                0.03
# [   26%      ]     8                 inf                 inf            0.500000                0.03
# [   30%      ]     9                 inf                 inf            0.500000                0.03
# [   33%      ]    10                 inf                 inf            0.500000                0.03
# [   36%      ]    11                 inf                 inf            0.500000                0.04
# [   40%      ]    12                 inf                 inf            0.500000                0.03
# [   43%      ]    13                 inf                 inf            0.500000                0.03
# [   46%      ]    14                 inf                 inf            0.500000                0.03
# [   50%      ]    15                 inf                 inf            0.500000                0.03
# [   53%      ]    16                 inf                 inf            0.500000                0.03
# [   56%      ]    17                 inf                 inf            0.500000                0.03
# [   60%      ]    18                 inf                 inf            0.500000                0.03
# [   63%      ]    19                 inf                 inf            0.500000                0.03
# [   66%      ]    20                 nan                 nan            0.500000                0.03
# [   70%      ]    21                 nan                 nan            0.500000                0.02
# [   73%      ]    22                 nan                 nan            0.500000                0.03
# [   76%      ]    23                 nan                 nan            0.500000                0.03
# [   80%      ]    24                 nan                 nan            0.500000                0.03
# [   83%      ]    25                 nan                 nan            0.500000                0.03
# [   86%      ]    26                 nan                 nan            0.500000                0.03
# [   90%      ]    27                 nan                 nan            0.500000                0.03
# [   93%      ]    28                 nan                 nan            0.500000                0.03
# [   96%      ]    29                 nan                 nan            0.500000                0.03
# [  100%      ]    30                 nan                 nan            0.500000                0.03