## Step 1 : transform features (to FM's format)
- train/test split on dataframe

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [21]:
# Locations of the two Titanic CSV parts, relative to the notebook's directory.
df1_path, df2_path = (
    "../dataset/titanic_dataset.csv",
    "../dataset/titanic_answer.csv",
)

In [22]:
# Read both CSV parts and stack them row-wise into a single frame.
df1 = pd.read_csv(df1_path)
df2 = pd.read_csv(df2_path)
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
# Original row labels are kept (no ignore_index), exactly as append() did.
df = pd.concat([df1, df2])

In [23]:
# Sanity check: shape of the combined frame, then its first rows.
print(df.shape)
df.head()

(1309, 13)


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,body,home.dest
0,2,1,"Mellinger, Miss. Madeleine Violet",female,13.0,0,1,250644,19.5,,S,,"England / Bennington, VT"
1,2,1,"Wells, Miss. Joan",female,4.0,1,1,29103,23.0,,S,,"Cornwall / Akron, OH"
2,2,1,"Duran y More, Miss. Florentina",female,30.0,1,0,SC/PARIS 2148,13.8583,,C,,"Barcelona, Spain / Havana, Cuba"
3,3,0,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q,,
4,3,1,"Bradley, Miss. Bridget Delia",female,22.0,0,0,334914,7.725,,Q,,"Kingwilliamstown, Co Cork, Ireland Glens Falls..."


In [24]:
def age_discretize(x):
    """Map an age to a decade bucket code, returned as a string.

    Buckets: '1' for ages below 10, '2' for 10-19, ... '9' for 80-89.
    Ages of 90 and above, and missing values, map to '10'.

    Parameters
    ----------
    x : number or missing value
        The age. Fractional ages are truncated with ``int`` before bucketing.

    Returns
    -------
    str
        One of '1' .. '10'.
    """
    # NaN never compares equal to anything (including itself), so `x == np.nan`
    # is always False; pd.isna is the reliable missing-value test.
    if pd.isna(x):
        return '10'

    x = int(x)
    # Upper bounds 10, 20, ..., 90 correspond to codes 1..9.
    for code, upper in enumerate(range(10, 100, 10), start=1):
        if x < upper:
            return str(code)
    return '10'

def fare_discretize(x):
    """Map a fare to a bucket code of width 10, returned as a string.

    Fares below 10 give '1', 10-19 give '2', ... 80-89 give '9'.
    Anything that is not below 90 gives '10'.
    """
    # Walk the upper bounds 10, 20, ..., 90 in order; the first one that
    # exceeds the fare determines the code.
    upper_bounds = (10, 20, 30, 40, 50, 60, 70, 80, 90)
    for position, bound in enumerate(upper_bounds):
        if x < bound:
            return str(position + 1)
    return '10'

-----
- feature discretization

In [25]:
# Keep only the label and the six model features, then drop any row that has
# a missing value in one of them.
df = df.loc[:, ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']].dropna()

In [26]:
# sex -> '1' for "female", '0' for everything else.
df['sex'] = df['sex'].map(lambda sex: '1' if sex == "female" else '0')
# age -> decade bucket code.
df['age'] = df['age'].map(age_discretize)
# fare -> bucket code; the fare is truncated to an integer first.
df['fare'] = df['fare'].map(lambda fare: fare_discretize(int(fare)))

In [27]:
# Cast the label and the remaining categorical columns to strings.
for col in ('survived', 'pclass', 'sibsp', 'parch'):
    df[col] = df[col].astype('str')

In [28]:
# Preview the frame after discretization and string casting.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare
0,1,2,1,2,0,1,2
1,1,2,1,1,1,1,3
2,1,2,1,4,1,0,2
4,1,3,1,3,0,0,1
6,1,1,1,4,0,0,6


- feature encoding (each feature's discretized values must be in the range 1 ~ n)

In [29]:
def make_encoding_label_dict(col_unique):
    """Build a lookup that maps each value in ``col_unique`` to a 1-based code.

    The code is the value's position in ``col_unique`` plus one. If a value
    occurs more than once, its last position wins.
    """
    return {value: position for position, value in enumerate(col_unique, start=1)}


def get_newcode(key, label_dict):
    """Look up the 1-based code for ``key`` in ``label_dict``.

    Parameters
    ----------
    key : scalar
        The raw value to encode.
    label_dict : dict
        Mapping from raw value to code, as built by make_encoding_label_dict.

    Returns
    -------
    int or None
        ``None`` when ``key`` is a missing value; the mapped code when ``key``
        is known; otherwise ``len(label_dict) + 1`` as a shared "unseen" code.
    """
    # `key == np.nan` is always False because NaN never compares equal, so the
    # original missing-value branch could not fire. pd.isna is the reliable test.
    if pd.isna(key):
        return None
    if key in label_dict:
        return label_dict[key]
    # Unknown values all share the code just past the known range.
    return len(label_dict) + 1

In [30]:
# Build the value -> code lookups for sibsp and parch. The values are passed
# in value_counts() order, so codes follow that order starting at 1.
encoded_sibsp_dict = make_encoding_label_dict(df['sibsp'].value_counts().index.tolist())
encoded_parch_dict = make_encoding_label_dict(df['parch'].value_counts().index.tolist())

In [31]:
# Replace the raw sibsp / parch values with their 1-based codes.
df['sibsp'] = df['sibsp'].map(lambda raw: get_newcode(raw, encoded_sibsp_dict))
df['parch'] = df['parch'].map(lambda raw: get_newcode(raw, encoded_parch_dict))

In [32]:
# Preview the frame after sibsp and parch have been re-coded.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare
0,1,2,1,2,1,2,2
1,1,2,1,1,2,2,3
2,1,2,1,4,2,1,2
4,1,3,1,3,1,1,1
6,1,1,1,4,1,1,6


In [33]:
# Row count per encoded sex value ('1' = "female", '0' = anything else).
df.sex.value_counts()

0    658
1    388
Name: sex, dtype: int64

-----
### save as xlearn's fm input type
###### output = label index_1:value_1 index_2:value_2 ... index_n:value_n

- make dicts for sparse marking

In [34]:
# Number of sparse columns reserved for each feature, in output order.
# 'sex' is a single binary slot; every other feature is one-hot over codes 1..n.
# 'age' needs 10 slots because age_discretize() can emit codes '1'..'10'; with
# only 9, code '10' would land on the first 'sibsp' column.
# NOTE(review): the widths for 'sibsp' and 'parch' are hard-coded; they must be
# at least the number of codes in encoded_sibsp_dict / encoded_parch_dict - confirm.
col_len_dict = {'pclass': 3, 'sex': 1, 'age': 10, 'sibsp': 7, 'parch': 7, 'fare': 10}

# Starting column index of each feature = sum of the widths that precede it.
col_accum_index_dict = {}
cumulative = 0
for key, value in col_len_dict.items():
    col_accum_index_dict[key] = cumulative
    cumulative = cumulative + value

# Fixed seed so the train/test files are reproducible after a kernel restart.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

- write each row as `idx:val` pairs (sparse matrix format)

In [37]:
# Write the training split as sparse text, one row per line:
#   label idx:val idx:val ...
# `with` guarantees the file is closed even if a row raises part-way through.
with open('../dataset/train.txt', 'w') as txt_file:
    for idx, row in train_df.iterrows():
        vec = [str(row['survived'])]
        for key, value in row.drop(labels=['survived']).items():
            if col_len_dict[key] == 1:
                # Single-slot feature: emitted only when its value is not '0'.
                if value != '0':
                    col_idx = col_accum_index_dict[key]
                    vec.append(str(col_idx) + ":" + str(value))
            else:
                # One-hot feature: codes are 1-based, so code k goes to offset k - 1.
                col_idx = col_accum_index_dict[key] + (int(value) - 1)
                vec.append(str(col_idx) + ":" + str(1))
        txt_file.write("%s\n" % " ".join(vec))

In [38]:
# Write the test split as sparse text, one row per line:
#   label idx:val idx:val ...
# `with` guarantees the file is closed even if a row raises part-way through.
with open('../dataset/test.txt', 'w') as txt_file:
    for idx, row in test_df.iterrows():
        vec = [str(row['survived'])]
        for key, value in row.drop(labels=['survived']).items():
            if col_len_dict[key] == 1:
                # Single-slot feature: emitted only when its value is not '0'.
                if value != '0':
                    col_idx = col_accum_index_dict[key]
                    vec.append(str(col_idx) + ":" + str(value))
            else:
                # One-hot feature: codes are 1-based, so code k goes to offset k - 1.
                col_idx = col_accum_index_dict[key] + (int(value) - 1)
                vec.append(str(col_idx) + ":" + str(1))
        txt_file.write("%s\n" % " ".join(vec))
txt_file.close()