# [Quick Practice] 
-----
-----
## Post-Analysis Process - Factorization Machine
#### Using xlearn python-wrapper module (https://xlearn-doc.readthedocs.io/en/latest/)

- Using Titanic dataset. (https://www.kaggle.com/c/titanic)
- Original paper (Steffen Rendle, https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
- Basic Conceptual Description for FM in korean (http://yamalab.tistory.com/107)

-----
-----
## Step 1 : prepare FM model

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df1_path = "./dataset/titanic_dataset.csv"
df2_path = "./dataset/titanic_answer.csv"

In [3]:
df1 = pd.read_csv(df1_path)
df2 = pd.read_csv(df2_path)
df = df1.append(df2)

In [4]:
def age_discretize(x):
    if x == np.nan:
        return '10'
    else:
        x = int(x)
        if x < 10:
            return '1'
        elif x < 20 and x >= 10:
            return '2'
        elif x < 30 and x >= 20:
            return '3'
        elif x < 40 and x >= 30:
            return '4'
        elif x < 50 and x >= 40:
            return '5'
        elif x < 60 and x >= 50:
            return '6'
        elif x < 70 and x >= 60:
            return '7'
        elif x < 80 and x >= 70:
            return '8'
        elif x < 90 and x >= 80:
            return '9'
        else:
            return '10'

def fare_discretize(x):
    if x < 10:
        return '1'
    elif x < 20 and x >= 10:
        return '2'
    elif x < 30 and x >= 20:
        return '3'
    elif x < 40 and x >= 30:
        return '4'
    elif x < 50 and x >= 40:
        return '5'
    elif x < 60 and x >= 50:
        return '6'
    elif x < 70 and x >= 60:
        return '7'
    elif x < 80 and x >= 70:
        return '8'
    elif x < 90 and x >= 80:
        return '9'
    else:
        return '10'

In [5]:
df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']]
df = df.dropna()

df['sex'] = df['sex'].apply(lambda x: '1' if x == "female" else '0')
df['age'] = df['age'].apply(lambda x: age_discretize(x))
df['fare'] = df['fare'].apply(lambda x: fare_discretize(int(x)))

df['survived'] = df['survived'].astype('str')
df['pclass'] = df['pclass'].astype('str')
df['sibsp'] = df['sibsp'].astype('str')
df['parch'] = df['parch'].astype('str')

In [6]:
# make discretized label to 1~n dict
def make_encoding_label_dict(col_unique):
    encoded_dict = {}
    for idx, unique in enumerate(col_unique):
        encoded_dict[unique] = idx + 1

    return encoded_dict


# get new encoding result
def get_newcode(key, label_dict):
    com_len = len(label_dict)
    if key == np.nan:
        return
    else:
        if key in label_dict:
            return label_dict[key]
        else:
            return com_len + 1

In [7]:
encoded_sibsp_dict = make_encoding_label_dict(df.sibsp.value_counts().index.tolist())
encoded_parch_dict = make_encoding_label_dict(df.parch.value_counts().index.tolist())

In [8]:
df['sibsp'] = df['sibsp'].apply(lambda x: get_newcode(x, encoded_sibsp_dict))
df['parch'] = df['parch'].apply(lambda x: get_newcode(x, encoded_parch_dict))

In [9]:
col_len_dict = {'pclass': 3, 'sex': 1, 'age': 9, 'sibsp': 7, 'parch': 7, 'fare': 10}
col_accum_index_dict = {}
cumulative = 0
for key, value in col_len_dict.items():
    col_accum_index_dict[key] = cumulative
    cumulative = cumulative + value

train_df, test_df = train_test_split(df, test_size=0.2)

In [10]:
txt_file = open('./dataset/train.txt', 'w')
for idx, row in train_df.iterrows():
    vec = []
    label = row['survived']
    vec.append(str(label))
    row = row.drop(labels=['survived'])
    for key, value in row.items():
        if col_len_dict[key] == 1:
            if value != '0':
                col_idx = col_accum_index_dict[key]
                out_val = value
                vec.append(str(col_idx) + ":" + str(out_val))
        else:
            col_idx = col_accum_index_dict[key] + (int(value) - 1)
            out_val = 1
            vec.append(str(col_idx) + ":" + str(out_val))
    txt_file.write("%s\n" % " ".join(vec))
txt_file.close()

In [11]:
txt_file = open('./dataset/test.txt', 'w')
for idx, row in test_df.iterrows():
    vec = []
    label = row['survived']
    vec.append(str(label))
    row = row.drop(labels=['survived'])
    for key, value in row.items():
        if col_len_dict[key] == 1:
            if value != '0':
                col_idx = col_accum_index_dict[key]
                out_val = value
                vec.append(str(col_idx) + ":" + str(out_val))
        else:
            col_idx = col_accum_index_dict[key] + (int(value) - 1)
            out_val = 1
            vec.append(str(col_idx) + ":" + str(out_val))
    txt_file.write("%s\n" % " ".join(vec))
txt_file.close()

-----
-----
## Step 2 : learn FM model

In [18]:
import xlearn as xl

def runner():
    fm_model = xl.create_fm()

    train_path = './dataset/train.txt'
    test_path = './dataset/test.txt'

    fm_model.setTrain(train_path)
    fm_model.setValidate(test_path)

    # Parameters:
    param = {'task':'binary',
             'epoch': 10,
             'lr': 0.2,
             'lambda': 0.002,
             'metric': 'auc'}

    # Start to train : fit이 seTXTModel 보다 뒤에 있어야만 model.txt가 생성됨.
    fm_model.setTXTModel('./model.txt')
    fm_model.fit(param, './model.out')

    # Prediction task
    fm_model.setTest(test_path)
    fm_model.setSigmoid()

    # Start to predict
    fm_model.predict("./model.out", "./output.txt")

In [19]:
runner()

-----
-----
## Step 3 : Analyze FM's vectors

In [21]:
txt_path = './model.txt'

w = list()
v = list()
with open(txt_path, 'r') as my_file:
    lines = my_file.readlines()
    for line in lines:
        cut_line = line.strip().split(' ')
        if len(cut_line) == 1:
            w.append(cut_line[0])
        elif len(cut_line) > 1:
            v.append(cut_line)

w = np.array(w).astype(np.float)
v = np.array(v).astype(np.float)

v_r = np.array(v).astype(np.float).ravel()
v_ij = np.matmul(v, v.T).ravel()  

ValueError: setting an array element with a sequence.