# [Quick Practice] 
-----
-----
## Learning Process - Factorization Machine
#### Using xlearn python-wrapper module (https://xlearn-doc.readthedocs.io/en/latest/)

- Dataset for quick practice is Titanic dataset. (https://www.kaggle.com/c/titanic)
- Original paper (Steffen Rendle, https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
- Basic Conceptual Description for FM in korean (http://yamalab.tistory.com/107)

-----
-----
### Step 1 : transform features (to FM's format)
- train/test split on dataframe

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_path = "../dataset/titanic_dataset.csv"
test_path = "../dataset/titanic_answer.csv"

In [3]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [4]:
train_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,body,home.dest
0,2,1,"Mellinger, Miss. Madeleine Violet",female,13.0,0,1,250644,19.5,,S,,"England / Bennington, VT"
1,2,1,"Wells, Miss. Joan",female,4.0,1,1,29103,23.0,,S,,"Cornwall / Akron, OH"
2,2,1,"Duran y More, Miss. Florentina",female,30.0,1,0,SC/PARIS 2148,13.8583,,C,,"Barcelona, Spain / Havana, Cuba"
3,3,0,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q,,
4,3,1,"Bradley, Miss. Bridget Delia",female,22.0,0,0,334914,7.725,,Q,,"Kingwilliamstown, Co Cork, Ireland Glens Falls..."


In [5]:
train_df = train_df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']]
test_df = test_df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']]

In [6]:
train_df = train_df.dropna()
test_df = test_df.dropna()

In [7]:
train_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare
0,1,2,female,13.0,0,1,19.5
1,1,2,female,4.0,1,1,23.0
2,1,2,female,30.0,1,0,13.8583
4,1,3,female,22.0,0,0,7.725
6,1,1,female,30.0,0,0,56.9292


-----
-----
- feature discretize

In [8]:
def age_discretize(x):
    if x < 10:
        return '1'
    elif x < 20 and x >= 10:
        return '2'
    elif x < 30 and x >= 20:
        return '3'
    elif x < 40 and x >= 30:
        return '4'
    elif x < 50 and x >= 40:
        return '5'
    elif x < 60 and x >= 50:
        return '6'
    elif x < 70 and x >= 60:
        return '7'
    elif x < 80 and x >= 70:
        return '8'
    elif x < 90 and x >= 80:
        return '9'
    else:
        return '10'

def fare_discretize(x):
    if x < 10:
        return '1'
    elif x < 20 and x >= 10:
        return '2'
    elif x < 30 and x >= 20:
        return '3'
    elif x < 40 and x >= 30:
        return '4'
    elif x < 50 and x >= 40:
        return '5'
    elif x < 60 and x >= 50:
        return '6'
    elif x < 70 and x >= 60:
        return '7'
    elif x < 80 and x >= 70:
        return '8'
    elif x < 90 and x >= 80:
        return '9'
    else:
        return '10'

In [9]:
train_df['sex'] = train_df['sex'].apply(lambda x: '1' if x == "female" else '0')
train_df['age'] = train_df['age'].apply(lambda x: age_discretize(int(x)))
train_df['fare'] = train_df['fare'].apply(lambda x: fare_discretize(int(x)))

test_df['sex'] = test_df['sex'].apply(lambda x: '1' if x == "female" else '0')
test_df['age'] = test_df['age'].apply(lambda x: age_discretize(int(x)))
test_df['fare'] = test_df['fare'].apply(lambda x: fare_discretize(int(x)))

In [10]:
train_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare
0,1,2,1,2,0,1,2
1,1,2,1,1,1,1,3
2,1,2,1,4,1,0,2
4,1,3,1,3,0,0,1
6,1,1,1,4,0,0,6


In [11]:
train_df['survived'] = train_df['survived'].astype('str')
train_df['pclass'] = train_df['pclass'].astype('str')
train_df['sibsp'] = train_df['sibsp'].astype('str')
train_df['parch'] = train_df['parch'].astype('str')

test_df['survived'] = test_df['survived'].astype('str')
test_df['pclass'] = test_df['pclass'].astype('str')
test_df['sibsp'] = test_df['sibsp'].astype('str')
test_df['parch'] = test_df['parch'].astype('str')

- save as xlearn's fm input type
- -> label index_1:value_1 index_2:value_2 ... index_n:value_n

In [27]:
myfile = open('./dataset/train.txt', 'w')
for index, row in train_df.iterrows():
    line = [row['survived'], 
            row['pclass'] + ":" + "1", 
            row['sex'] + ":" + "1", 
            row['age'] + ":" + "1", 
            row['sibsp'] + ":" + "1",
            row['parch'] + ":" + "1",
            row['fare'] + ":" + "1"]
    myfile.write("%s\n" % " ".join(line))
myfile.close()

myfile = open('./dataset/test.txt', 'w')
for index, row in test_df.iterrows():
    line = [row['survived'], 
            row['pclass'] + ":" + "1", 
            row['sex'] + ":" + "1", 
            row['age'] + ":" + "1", 
            row['sibsp'] + ":" + "1",
            row['parch'] + ":" + "1",
            row['fare'] + ":" + "1"]
    myfile.write("%s\n" % " ".join(line))
myfile.close()

-----
-----
### Step 2 : Train Factorization Machine
- test accuracy/auc is just fine. process works well!

In [28]:
import xlearn as xl

def test():
    fm_model = xl.create_fm()

    train_path = './dataset/train.txt'
    test_path = './dataset/test.txt'


    fm_model.setTrain(train_path)
    fm_model.setValidate(test_path)

    # Parameters:
    param = {'task':'binary',
             'epoch': 10,
             'lr':0.2,
             'lambda':0.002,
             'metric': 'auc'}

    # Start to train
    # The trained model will be stored in model.out
    fm_model.fit(param, './model.out')
    fm_model.setTXTModel('./model.txt')

    # Prediction task
    fm_model.setTest(test_path)  # Set the path of test dataset
    fm_model.setSigmoid()                 # Convert output to 0-1

    # Start to predict
    # The output result will be stored in output.txt
    fm_model.predict("./model.out", "./output.txt")

In [29]:
test()

In [147]:
# [------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
# [   10%      ]     1            0.591890            0.527873            0.812747                0.07
# [   20%      ]     2            0.570834            0.522338            0.816250                0.03
# [   30%      ]     3            0.566162            0.513488            0.818227                0.03
# [   40%      ]     4            0.563536            0.515130            0.819395                0.03
# [   50%      ]     5            0.561963            0.512888            0.819484                0.02
# [   60%      ]     6            0.560674            0.515076            0.819889                0.04
# [   70%      ]     7            0.560438            0.513704            0.819574                0.03
# [   80%      ]     8            0.558532            0.515051            0.815397                0.03
# [   90%      ]     9            0.559232            0.512932            0.814948                0.03

-----
-----
### Step 3 : FFM (Field aware Factorization Machine) Practice
- Using xlearn module, train ffm model same as fm.

In [156]:
train_path = "./dataset/titanic_dataset.csv"
test_path = "./dataset/titanic_answer.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

train_df = train_df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']]
test_df = test_df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']]

train_df = train_df.dropna()
test_df = test_df.dropna()

In [157]:
def age_discretize(x):
    if x < 10:
        return '1'
    elif x < 20 and x >= 10:
        return '2'
    elif x < 30 and x >= 20:
        return '3'
    elif x < 40 and x >= 30:
        return '4'
    elif x < 50 and x >= 40:
        return '5'
    elif x < 60 and x >= 50:
        return '6'
    elif x < 70 and x >= 60:
        return '7'
    elif x < 80 and x >= 70:
        return '8'
    elif x < 90 and x >= 80:
        return '9'
    else:
        return '10'

def fare_discretize(x):
    if x < 10:
        return '1'
    elif x < 20 and x >= 10:
        return '2'
    elif x < 30 and x >= 20:
        return '3'
    elif x < 40 and x >= 30:
        return '4'
    elif x < 50 and x >= 40:
        return '5'
    elif x < 60 and x >= 50:
        return '6'
    elif x < 70 and x >= 60:
        return '7'
    elif x < 80 and x >= 70:
        return '8'
    elif x < 90 and x >= 80:
        return '9'
    else:
        return '10'

In [158]:
train_df['sex'] = train_df['sex'].apply(lambda x: '1' if x == "female" else '0')
train_df['age'] = train_df['age'].apply(lambda x: age_discretize(int(x)))
train_df['fare'] = train_df['fare'].apply(lambda x: fare_discretize(int(x)))

test_df['sex'] = test_df['sex'].apply(lambda x: '1' if x == "female" else '0')
test_df['age'] = test_df['age'].apply(lambda x: age_discretize(int(x)))
test_df['fare'] = test_df['fare'].apply(lambda x: fare_discretize(int(x)))

train_df['survived'] = train_df['survived'].astype('str')
train_df['pclass'] = train_df['pclass'].astype('str')
train_df['sibsp'] = train_df['sibsp'].astype('str')
train_df['parch'] = train_df['parch'].astype('str')

test_df['survived'] = test_df['survived'].astype('str')
test_df['pclass'] = test_df['pclass'].astype('str')
test_df['sibsp'] = test_df['sibsp'].astype('str')
test_df['parch'] = test_df['parch'].astype('str')

-----
- input type is diffrent with fm or lm.
- -> label field_1:index_1:value_1 field_2:index_2:value_2 ...

In [164]:
field_dict = {"Financial": "0", 
              "Demography": "1", 
              "Family": "2"}

In [166]:
myfile = open('./dataset/train_ffm.txt', 'w')
for index, row in train_df.iterrows():
    line = [row['survived'], 
            field_dict['Financial'] + ":" + row['pclass'] + ":" + "1", 
            field_dict['Demography'] + ":" + row['sex'] + ":" + "1", 
            field_dict['Demography'] + ":" + row['age'] + ":" + "1", 
            field_dict['Family'] + ":" + row['sibsp'] + ":" + "1",
            field_dict['Family'] + ":" + row['parch'] + ":" + "1",
            field_dict['Financial'] + ":" + row['fare'] + ":" + "1"]
    myfile.write("%s\n" % " ".join(line))
myfile.close()

myfile = open('./dataset/test_ffm.txt', 'w')
for index, row in test_df.iterrows():
    line = [row['survived'], 
            field_dict['Financial'] + ":" + row['pclass'] + ":" + "1", 
            field_dict['Demography'] + ":" + row['sex'] + ":" + "1", 
            field_dict['Demography'] + ":" + row['age'] + ":" + "1", 
            field_dict['Family'] + ":" + row['sibsp'] + ":" + "1",
            field_dict['Family'] + ":" + row['parch'] + ":" + "1",
            field_dict['Financial'] + ":" + row['fare'] + ":" + "1"]
    myfile.write("%s\n" % " ".join(line))
myfile.close()

-----
-----
- train FFM model : it works well too. 
- FFM models work better than FM if the fields are clearly separated

In [176]:
import xlearn as xl

def test2():
    ffm_model = xl.create_ffm()

    train_path = './dataset/train_ffm.txt'
    test_path = './dataset/test_ffm.txt'


    ffm_model.setTrain(train_path)
    ffm_model.setValidate(test_path)

    # Parameters:
    param = {'task':'binary',
             'epoch': 10,
             'lr':0.2,
             'lambda':0.002,
             'metric': 'auc', 
             'opt':'sgd'}

    # Start to train
    # The trained model will be stored in model.out
    ffm_model.fit(param, './model.out')
    ffm_model.setTXTModel('./model.txt')

    # Prediction task
    ffm_model.setTest(test_path)  # Set the path of test dataset
    ffm_model.setSigmoid()                 # Convert output to 0-1

    # Start to predict
    # The output result will be stored in output.txt
    ffm_model.predict("./model.out", "./output.txt")

In [177]:
test2()

In [171]:
# [   10%      ]     1            0.564825            0.511872            0.842548                0.06
# [   20%      ]     2            0.522519            0.466546            0.852205                0.03
# [   30%      ]     3            0.500908            0.459033            0.857999                0.03
# [   40%      ]     4            0.494133            0.455309            0.853553                0.04
# [   50%      ]     5            0.489826            0.452988            0.852430                0.03
# [   60%      ]     6            0.484961            0.456717            0.853463                0.03
# [   70%      ]     7            0.481642            0.446984            0.850319                0.03
# [   80%      ]     8            0.479982            0.448088            0.849196                0.03
# [   90%      ]     9            0.476195            0.447614            0.846815                0.03

-----
-----
### Step 4 : Hyper-paramter Tuning
- various tuning methods

In [196]:
import xlearn as xl

def test3():
    ffm_model = xl.create_ffm()

    train_path = './dataset/train_ffm.txt'
    test_path = './dataset/test_ffm.txt'


    ffm_model.setTrain(train_path)
    ffm_model.setValidate(test_path)

    # Parameters:
    param = {'task':'binary',
             'epoch': 30,
             'lr':0.2, # learning rate
             'lambda':0.002, # L2 parameter
             'metric': 'auc',
             'opt':'sgd', # optimizer
             'stop_window':3, # early-stopping window size
             'fold':3} # k-fold parameter
    
    # normalization off
    ffm_model.disableNorm()
    
    # k-fold cross-validation
    ffm_model.cv(param)

    # Start to train
    # The trained model will be stored in model.out
#     ffm_model.fit(param, './model.out')


In [197]:
test3() # it is fail because of disableNorm

In [190]:
# [------------] Epoch      Train log_loss       Test log_loss            Test AUC     Time cost (sec)
# [    3%      ]     1            1.690606            1.783437            0.764351                0.04
# [    6%      ]     2                 inf                 inf            0.532923                0.04
# [   10%      ]     3                 inf                 inf            0.409877                0.03
# [   13%      ]     4                 inf                 inf            0.470917                0.03
# [   16%      ]     5                 inf                 inf            0.533844                0.03
# [   20%      ]     6                 inf                 inf            0.500000                0.03
# [   23%      ]     7                 inf                 inf            0.500000                0.03
# [   26%      ]     8                 inf                 inf            0.500000                0.03
# [   30%      ]     9                 inf                 inf            0.500000                0.03
# [   33%      ]    10                 inf                 inf            0.500000                0.03
# [   36%      ]    11                 inf                 inf            0.500000                0.04
# [   40%      ]    12                 inf                 inf            0.500000                0.03
# [   43%      ]    13                 inf                 inf            0.500000                0.03
# [   46%      ]    14                 inf                 inf            0.500000                0.03
# [   50%      ]    15                 inf                 inf            0.500000                0.03
# [   53%      ]    16                 inf                 inf            0.500000                0.03
# [   56%      ]    17                 inf                 inf            0.500000                0.03
# [   60%      ]    18                 inf                 inf            0.500000                0.03
# [   63%      ]    19                 inf                 inf            0.500000                0.03
# [   66%      ]    20                 nan                 nan            0.500000                0.03
# [   70%      ]    21                 nan                 nan            0.500000                0.02
# [   73%      ]    22                 nan                 nan            0.500000                0.03
# [   76%      ]    23                 nan                 nan            0.500000                0.03
# [   80%      ]    24                 nan                 nan            0.500000                0.03
# [   83%      ]    25                 nan                 nan            0.500000                0.03
# [   86%      ]    26                 nan                 nan            0.500000                0.03
# [   90%      ]    27                 nan                 nan            0.500000                0.03
# [   93%      ]    28                 nan                 nan            0.500000                0.03
# [   96%      ]    29                 nan                 nan            0.500000                0.03
# [  100%      ]    30                 nan                 nan            0.500000                0.03