# Make Sparse Matrix for FM, WnD modeling
-----
-----
#### Build the training set used as input to the xlearn module.

- Uses the Titanic dataset (https://www.kaggle.com/c/titanic).
- Original paper (Steffen Rendle, https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
- Basic conceptual description of FM, in Korean (http://yamalab.tistory.com/107)

-----
-----
### Step 1 : Prepare Titanic dataset
- make dataframe

In [1]:
import math
import pickle

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Relative locations of the two Titanic CSV files read by the next cell.
df1_path = "../dataset/titanic_dataset.csv"
df2_path = "../dataset/titanic_answer.csv"

In [3]:
# Load both CSV files and stack their rows into a single frame.
df1 = pd.read_csv(df1_path)
df2 = pd.read_csv(df2_path)
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
# call (row labels of both inputs are kept, exactly as append did).
df = pd.concat([df1, df2])

In [4]:
def age_discretize(x):
    """Map an age to a decade-bucket label.

    The age is truncated with ``int`` and bucketed by decade: below 10 -> '1',
    10-19 -> '2', ..., 80-89 -> '9'. Missing ages and ages of 90 or more fall
    into the catch-all bucket '10'.

    Args:
        x: Age as a number, or a missing value (NaN / None).

    Returns:
        The bucket label as a string, '1' through '10'.
    """
    # NaN never compares equal to anything (itself included), so a test of
    # the form ``x == np.nan`` can never fire; detect it with pd.isna instead.
    if pd.isna(x):
        return '10'

    age = int(x)
    # Upper bounds 10, 20, ..., 90 pair with bucket labels 1..9.
    for bucket, upper in enumerate(range(10, 100, 10), start=1):
        if age < upper:
            return str(bucket)
    return '10'

def fare_discretize(x):
    """Map a fare to a bucket label in steps of 10.

    Fares below 10 get '1', 10 up to (not including) 20 get '2', and so on up
    to '9' for fares below 90. Anything that is not below 90 gets '10'.

    Args:
        x: Fare as a number.

    Returns:
        The bucket label as a string, '1' through '10'.
    """
    # Upper bounds 10, 20, ..., 90 pair with bucket labels 1..9; the first
    # bound the fare is below decides the bucket.
    for label, upper_bound in enumerate(range(10, 100, 10), start=1):
        if x < upper_bound:
            return str(label)
    return '10'

In [5]:
# Keep the label and the six raw feature columns, then discard every row
# that has a missing value in any of them.
df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']].dropna()

In [6]:
# Binary gender flag stored as the strings '1' / '0'.
df['is_female'] = df['sex'].apply(lambda sex: '1' if sex == "female" else '0')
# Replace the raw age and fare with their bucket labels.
df['age'] = df['age'].apply(age_discretize)
df['fare'] = df['fare'].apply(lambda fare: fare_discretize(int(fare)))

In [7]:
# Cast the label and the categorical columns to str in a single call.
df = df.astype({name: 'str' for name in ('survived', 'pclass', 'sibsp', 'parch')})

In [8]:
# Preview the first rows after bucketing age/fare and casting columns to str.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,is_female
0,1,2,female,2,0,1,2,1
1,1,2,female,1,1,1,3,1
2,1,2,female,4,1,0,2,1
4,1,3,female,3,0,0,1,1
6,1,1,female,4,0,0,6,1


-----
-----
### Step 2 : Encoding categorical variables

- make dict

In [9]:
# Dump the distinct values of every column, one separator line per column.
for col, series in df.items():
    print("---------------", col, series.unique(), sep="\n")

---------------
survived
['1' '0']
---------------
pclass
['2' '3' '1']
---------------
sex
['female' 'male']
---------------
age
['2' '1' '4' '3' '5' '6' '7' '8' '9']
---------------
sibsp
['0' '1' '2' '4' '3' '5' '8']
---------------
parch
['1' '0' '2' '5' '3' '4' '6']
---------------
fare
['2' '3' '1' '6' '10' '4' '8' '5' '9' '7']
---------------
is_female
['1' '0']


In [10]:
def make_encoding_label_dict(col_unique):
    """Assign consecutive integer codes, starting at 1, to the given values.

    Args:
        col_unique: Iterable of hashable category values, in the order the
            codes should be handed out.

    Returns:
        Dict mapping each value to its 1-based position in ``col_unique``.
    """
    return {category: code for code, category in enumerate(col_unique, start=1)}

In [11]:
# One label dict per categorical column; value_counts() lists values from
# most to least frequent, so the most frequent value receives code 1.
(
    encoded_pclass_dict,
    encoded_age_dict,
    encoded_sibsp_dict,
    encoded_parch_dict,
    encoded_fare_dict,
) = (
    make_encoding_label_dict(df[name].value_counts().index.tolist())
    for name in ('pclass', 'age', 'sibsp', 'parch', 'fare')
)

In [12]:
# Inspect the label mapping built for pclass.
encoded_pclass_dict

{'3': 1, '1': 2, '2': 3}

In [13]:
# Inspect the label mapping built for sibsp.
encoded_sibsp_dict

{'0': 1, '1': 2, '2': 3, '4': 4, '3': 5, '5': 6, '8': 7}

In [14]:
# Inspect the label mapping built for fare.
encoded_fare_dict

{'1': 1,
 '2': 2,
 '3': 3,
 '10': 4,
 '4': 5,
 '6': 6,
 '8': 7,
 '5': 8,
 '9': 9,
 '7': 10}

- variables encoding

In [15]:
def get_newcode(key, label_dict):
    """Look up the integer code for a raw category value.

    Args:
        key: Raw category value.
        label_dict: Mapping from known category values to their codes.

    Returns:
        ``None`` when ``key`` is NaN, the mapped code when ``key`` is a known
        value, otherwise ``len(label_dict) + 1`` as the shared code for
        values that are not in the mapping.
    """
    # NaN is never equal to itself, so ``key == np.nan`` is always False and
    # the missing-value branch could never run; test for NaN explicitly.
    # The isinstance guard keeps non-float keys (e.g. strings) away from
    # math.isnan, which would reject them.
    if isinstance(key, float) and math.isnan(key):
        return None
    return label_dict.get(key, len(label_dict) + 1)

In [16]:
# Replace each categorical value with its integer code, column by column.
for col, mapping in (
    ('pclass', encoded_pclass_dict),
    ('age', encoded_age_dict),
    ('sibsp', encoded_sibsp_dict),
    ('parch', encoded_parch_dict),
    ('fare', encoded_fare_dict),
):
    df[col] = df[col].apply(get_newcode, args=(mapping,))

In [17]:
# Preview the first rows after label encoding.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,is_female
0,1,3,female,3,1,2,2,1
1,1,3,female,5,2,2,3,1
2,1,3,female,2,2,1,2,1
4,1,1,female,1,1,1,1,1
6,1,2,female,2,1,1,6,1


-----
-----
### Step 3 : Make sparse dataframe

In [18]:
# Split into features and label, renumbering the rows 0..n-1 in both so
# they can be used as positional row indices.
df_x = df[['pclass', 'is_female', 'age', 'sibsp', 'parch', 'fare']].reset_index(drop=True)
df_y = df['survived'].reset_index(drop=True)

In [19]:
# Preview the feature frame.
df_x.head()

Unnamed: 0,pclass,is_female,age,sibsp,parch,fare
0,3,1,3,1,2,2
1,3,1,5,2,2,3
2,3,1,2,2,1,2
3,1,1,1,1,1,1
4,2,1,2,1,1,6


In [20]:
# Number of output slots reserved for each feature column of df_x. A width of
# 1 marks a column whose value is written as-is instead of being one-hot
# encoded. The keys must match df_x's column names: the gender flag is stored
# as 'is_female' (not 'sex'), and the fill loop looks widths up by column name.
col_len_dict = {'pclass': 3, 'is_female': 1, 'age': 9, 'sibsp': 7, 'parch': 7, 'fare': 10}

# Starting offset of each feature's slot range within an output row.
col_accum_index_dict = {}
cumulative = 0
for key, value in col_len_dict.items():
    col_accum_index_dict[key] = cumulative
    cumulative += value

In [21]:
# Dense all-zero matrix: one row per sample, one column per feature slot.
n_rows = df_x.shape[0]
n_slots = sum(col_len_dict.values())
out = np.zeros((n_rows, n_slots), dtype=float)

In [22]:
# Fill the matrix row by row.
for idx, row in df_x.iterrows():
    for key, value in row.items():
        if col_len_dict[key] == 1:
            # Single-slot feature: store the value itself at the feature's offset.
            out[idx, col_accum_index_dict[key]] = value
        else:
            # Multi-slot feature: codes are 1-based, so code k sets slot
            # (offset + k - 1) to 1.
            out[idx, col_accum_index_dict[key] + (int(value) - 1)] = 1

-----
-----
### Step 4(Optional) : Show results

In [23]:
# Column names "<feature>_<slot>" with 1-based slot numbers, in the same
# order as the features appear in col_len_dict.
columns = [
    f"{key}_{slot}"
    for key, width in col_len_dict.items()
    for slot in range(1, width + 1)
]

In [24]:
# Wrap the filled matrix in a DataFrame labelled with the generated column names.
df_out = pd.DataFrame(out, columns=columns)

In [25]:
# Preview the encoded feature frame.
df_out.head()

Unnamed: 0,pclass_1,pclass_2,pclass_3,is_female_1,age_1,age_2,age_3,age_4,age_5,age_6,...,fare_1,fare_2,fare_3,fare_4,fare_5,fare_6,fare_7,fare_8,fare_9,fare_10
0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


-----
-----
### Step 5(Optional) : Iterate with column index (for FM)

In [26]:
# Put the label in front of the encoded features. ignore_index=True discards
# the column labels of both inputs, so they are reassigned right after.
df_final = pd.concat([df_y, df_out], axis=1, ignore_index=True)
# NOTE(review): insert() mutates `columns` in place, so running this cell a
# second time would prepend another "y" — confirm the cell is only run once.
columns.insert(0, "y")
df_final.columns = columns
df_final.head()

Unnamed: 0,y,pclass_1,pclass_2,pclass_3,is_female_1,age_1,age_2,age_3,age_4,age_5,...,fare_1,fare_2,fare_3,fare_4,fare_5,fare_6,fare_7,fare_8,fare_9,fare_10
0,1,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [27]:
# Print every (position, value) pair of one randomly sampled row;
# position 0 is the label "y".
for idx, row in df_final.sample(1).iterrows():
    # The row is labelled with column names, so integer keys passed to
    # row[...] rely on a deprecated positional fallback; .iloc is the
    # explicit positional accessor.
    print("0", row.iloc[0])
    for i in range(1, row.size):
        print(i, row.iloc[i])

0 0
1 0.0
2 0.0
3 1.0
4 0.0
5 0.0
6 1.0
7 0.0
8 0.0
9 0.0
10 0.0
11 0.0
12 0.0
13 0.0
14 0.0
15 1.0
16 0.0
17 0.0
18 0.0
19 0.0
20 0.0
21 1.0
22 0.0
23 0.0
24 0.0
25 0.0
26 0.0
27 0.0
28 0.0
29 0.0
30 1.0
31 0.0
32 0.0
33 0.0
34 0.0
35 0.0
36 0.0
37 0.0


In [28]:
# Print the label and then only the non-zero (position, value) pairs of one
# randomly sampled row.
for idx, row in df_final.sample(1).iterrows():
    # The row is labelled with column names, so integer keys passed to
    # row[...] rely on a deprecated positional fallback; .iloc is the
    # explicit positional accessor.
    print("0", row.iloc[0])
    for i in range(1, row.size):
        value = row.iloc[i]
        if value != 0:
            print(i, value)

0 0
1 1.0
4 1.0
7 1.0
14 1.0
21 1.0
28 1.0
