In [153]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pickle as pkl
import sys

from fastFM import als
from fastFM.datasets import make_user_item_regression
from scipy.sparse import csc_matrix
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, LabelEncoder

# 06A - Factorization Machine - Data Preparation
While factorization machines are immensely powerful in their ability to make use of additional features, these features must be processed so that they are interpretable by an FM. Specifically, categorical features need to be one-hot encoded, and features consisting of lists of categories (like **genre**, where an item can be both a western and a comedy) need to be similarly binarized. To prevent an explosion in data size, these features are converted into sparse matrices.

### Import data
Import the training, cross-validation, and test sets.

In [88]:
# Locations of the pickled splits and of the sparse-matrix output directory.
data_path = os.path.join('..', '..', 'data-2')
splits_path = os.path.join(data_path, 'splits')
sparse_path = os.path.join(data_path, 'sparse')
columns = ['user', 'item', 'rating']

# Read the three pickled splits in one pass. The cross-validation split is
# stored on disk under the name 'dev'.
# NOTE: pickle can execute arbitrary code on load; only read trusted files.
loaded_splits = {}
for split_name in ('train', 'dev', 'test'):
    with open(os.path.join(splits_path, split_name + '.df'), 'rb') as file_in:
        loaded_splits[split_name] = pkl.load(file_in)

train_df = loaded_splits['train']
cv_df = loaded_splits['dev']
test_df = loaded_splits['test']

### Additional cleaning
Scikit-learn's MultiLabelBinarizer converts between features consisting of lists of categories and a binary matrix indicating the presence of that category. In order for the MLB to function properly, all cells containing **None** must be converted to empty lists.

In [264]:
# Swap every falsy cell (e.g. None) for an empty list so that the
# MultiLabelBinarizer always receives an iterable of labels.
for list_column in ('genres_imdb', 'studios_imdb', 'directors'):
    for split_frame in (train_df, cv_df, test_df):
        split_frame[list_column] = split_frame[list_column].apply(
            lambda cell: cell if cell else []
        )

# Convert data
The function below takes as input the three datasets, the target feature (or list of features) to be converted, a file name for saving, and a flag indicating whether the feature is to be one-hot encoded or converted using a MultiLabelBinarizer. In all cases, the data is converted to a sparse binary matrix in order to save space. All converted data is then stored in a dictionary (with 'train', 'cv', and 'test' as keys, plus 'columns' for the encoded column labels) so that the appropriate split can be easily accessed. Finally, the function pickles the dictionary to `<file name>.dict` in the sparse-data directory and returns it.

In [202]:
def _one_hot_aligned(frame, columns=None):
    """One-hot encode ``frame`` and optionally align it to a reference layout.

    Parameters
    ----------
    frame : pandas.Series or pandas.DataFrame
        Data to encode with ``pd.get_dummies``.
    columns : pandas.Index or list, optional
        Reference column labels (those of the training split). When given,
        the result is reindexed to exactly these columns: categories missing
        from ``frame`` become all-zero columns and categories absent from the
        reference are dropped.

    Returns
    -------
    pandas.DataFrame
        The encoded (and, if requested, aligned) indicator frame.
    """
    # An explicit integer dtype keeps all splits numerically consistent no
    # matter what the installed pandas uses as its default dummy dtype.
    dummies = pd.get_dummies(frame, dtype=np.uint8)
    if columns is not None:
        # reindex aligns in one step and fills with an integer 0, so no NaN
        # is ever introduced and no float/object upcast happens.
        dummies = dummies.reindex(columns=columns, fill_value=0)
    return dummies


def convert_data(train, cv, test, features, filename, multi = False, out_dir = None):
    """Encode one feature (or feature set) of all three splits as sparse matrices.

    Parameters
    ----------
    train, cv, test : pandas.DataFrame
        The training, cross-validation and test splits.
    features : str or list of str
        Column label(s) selected from each split before encoding.
    filename : str
        Base name of the output file; ``'.dict'`` is appended.
    multi : bool, default False
        If True, encode with ``MultiLabelBinarizer`` (each cell is expected to
        be an iterable of labels). If False, one-hot encode with
        ``pd.get_dummies``.
    out_dir : str, optional
        Directory the pickle is written to. Defaults to the module-level
        ``sparse_path``.

    Returns
    -------
    dict
        ``{'train': ..., 'cv': ..., 'test': ..., 'columns': ...}`` where the
        three splits are sparse matrices sharing the column layout learned
        from ``train`` and ``columns`` holds the corresponding labels.

    Notes
    -----
    The encoding is fitted on ``train`` only. Categories that appear only in
    ``cv``/``test`` are not represented in the output columns.
    """
    # subset data
    train = train[features]
    cv = cv[features]
    test = test[features]

    if multi:
        # multi-label binarizing: fit on train, reuse the fitted classes for cv/test
        mlb = MultiLabelBinarizer(sparse_output=True)
        X_train = mlb.fit_transform(train)
        X_cv = mlb.transform(cv)
        X_test = mlb.transform(test)

        data = dict(train=X_train, cv=X_cv, test=X_test, columns=mlb.classes_)

    else:
        # one-hot encoding: the training columns define the layout for every split
        train_dummies = _one_hot_aligned(train)
        train_columns = train_dummies.columns

        X_train = csc_matrix(train_dummies)
        X_cv = csc_matrix(_one_hot_aligned(cv, train_columns))
        X_test = csc_matrix(_one_hot_aligned(test, train_columns))

        data = dict(train=X_train, cv=X_cv, test=X_test, columns=train_columns)

    # Fall back to the notebook-level output directory when none is supplied.
    if out_dir is None:
        out_dir = sparse_path

    with open(os.path.join(out_dir, filename + '.dict'), 'wb') as file_out:
        pkl.dump(data, file_out)

    return data

Below we convert and save all features that need to be represented as sparse matrices.

In [286]:
# Toggle: True reuses the previously saved user/item encoding from disk,
# False rebuilds it from the splits (and overwrites the saved file).
use_pretrained = True

if not use_pretrained:
    user_item = convert_data(train_df, cv_df, test_df, ['user','item'], 'user-item', multi=False)
else:
    user_item_file = os.path.join(sparse_path, 'user-item.dict')
    with open(user_item_file, 'rb') as file_in:
        user_item = pkl.load(file_in)

In [278]:
def _convert_feature(feature, filename, multi):
    """Run convert_data for one feature on the notebook's three splits."""
    return convert_data(train_df, cv_df, test_df, feature, filename, multi=multi)


# Each call encodes one feature, saves it under the given file name and keeps
# the resulting dictionary in a notebook-level variable.
country = _convert_feature('country', 'country', True)
language = _convert_feature('language', 'language', True)
mpaa = _convert_feature('mpaa_rating', 'mpaa', False)
# NOTE(review): `type` shadows the Python builtin for the rest of the session;
# the name is kept because other cells may read it. Consider renaming.
type = _convert_feature('type', 'type', False)
genres_imdb = _convert_feature('genres_imdb', 'genres-imdb', True)
genres_amazon = _convert_feature('genres_amazon', 'genres-amazon', True)
studios_imdb = _convert_feature('studios_imdb', 'studios-imdb', True)
studios_amazon = _convert_feature('studios_amazon', 'studios-amazon', True)
directors = _convert_feature('directors', 'directors-imdb', True)
actors = _convert_feature('actors', 'actors', True)