## Reference
1. https://medium.com/@datadote/factorization-machines-pictures-code-pytorch-9fca1c300838

## Getting the dataset and unzipping

In [4]:
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip -P ../datasets/factorization_machine/
!unzip ../datasets/factorization_machine/ml-1m.zip -d ../datasets/factorization_machine/

--2024-08-09 20:19:45--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘../datasets/factorization_machine/ml-1m.zip’


2024-08-09 20:19:49 (2.26 MB/s) - ‘../datasets/factorization_machine/ml-1m.zip’ saved [5917549/5917549]



## Importing Libraries

In [133]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

import numpy as np

## Loading Data

In [2]:
DATA_ROOT_FOLDER = "../datasets/factorization_machine/ml-1m/"

# All three .dat files are parsed with explicit column names and the
# two-character '::' separator. pandas treats a multi-character separator as a
# regular expression, which is why the python parsing engine is requested.
df_movies = pd.read_csv(
    DATA_ROOT_FOLDER+'movies.dat',
    sep='::',
    names=['movieId', 'title','genres'],
    encoding='latin-1',
    engine='python',
)

user_cols = ['userId', 'gender' ,'age', 'occupation', 'zipcode']
df_users = pd.read_csv(
    DATA_ROOT_FOLDER+'users.dat',
    sep='::',
    header=None,
    names=user_cols,
    engine='python',
)

df = pd.read_csv(
    DATA_ROOT_FOLDER+'ratings.dat',
    sep='::',
    names=['userId','movieId','rating','time'],
    engine='python',
)

# Ratings are the left side of both joins, so movies with no rating never
# enter the table. # of unique movies: 3883 -> 3706
# The result is ordered by user and, within a user, by rating timestamp.
df = (
    df.merge(df_movies, on='movieId', how='left')
      .merge(df_users, on='userId', how='left')
      .sort_values(['userId', 'time'], ascending=[True, True])
      .reset_index(drop=True)
)

df

Unnamed: 0,userId,movieId,rating,time,title,genres,gender,age,occupation,zipcode
0,1,3186,4,978300019,"Girl, Interrupted (1999)",Drama,F,1,10,48067
1,1,1270,5,978300055,Back to the Future (1985),Comedy|Sci-Fi,F,1,10,48067
2,1,1721,4,978300055,Titanic (1997),Drama|Romance,F,1,10,48067
3,1,1022,5,978300055,Cinderella (1950),Animation|Children's|Musical,F,1,10,48067
4,1,2340,3,978300103,Meet Joe Black (1998),Romance,F,1,10,48067
...,...,...,...,...,...,...,...,...,...,...
1000204,6040,2917,4,997454429,Body Heat (1981),Crime|Thriller,M,25,6,11106
1000205,6040,1921,4,997454464,Pi (1998),Sci-Fi|Thriller,M,25,6,11106
1000206,6040,1784,3,997454464,As Good As It Gets (1997),Comedy|Drama,M,25,6,11106
1000207,6040,161,3,997454486,Crimson Tide (1995),Drama|Thriller|War,M,25,6,11106


In [3]:
# Label-encode each categorical column into an integer "<col>_index" column
# and keep the fitted encoder for that column in encoder_dictionary.
encoder_dictionary = {}
for cat_col in ('userId', 'movieId', 'gender', 'age', 'occupation'):
    encoder = LabelEncoder().fit(df[cat_col].unique())
    encoder_dictionary[cat_col] = encoder
    df[f'{cat_col}_index'] = encoder.transform(df[cat_col])
    print(f'# unique {cat_col}: {len(encoder.classes_)}')


# Smallest number of ratings that any single user has
min_num_ratings = df['userId'].value_counts().min()
print(f'Min # of ratings per user: {min_num_ratings}')
print(f'Min/Max rating: {df.rating.min()} / {df.rating.max()}')
print(f'df.shape: {df.shape}')

# unique userId: 6040
# unique movieId: 3706
# unique gender: 2
# unique age: 7
# unique occupation: 21
Min # of ratings per user: 20
Min/Max rating: 1 / 5
df.shape: (1000209, 15)


In [4]:
# To use 1 embedding matrix, every feature column needs its own disjoint range
# of row indices, so a per-column offset is calculated and added to the column.
# Orig. paper uses 1-hot encoding, here we use ordinal encoding
# Ordinal encoding reduces memory size. Important for train speed
feature_cols = ['userId_index', 'movieId_index', 'gender_index', 'age_index',
                'occupation_index']

# Number of distinct values per feature column
feature_sizes = {feat: df[feat].nunique() for feat in feature_cols}

# Offset of a column = combined size of all columns listed before it
feature_offsets = {}
NEXT_OFFSET = 0
for feat, size in feature_sizes.items():
    feature_offsets[feat] = NEXT_OFFSET
    NEXT_OFFSET += size

# Add offsets to each feature column (vectorised rather than a per-row apply).
# Each column is first re-based so that its smallest index is 0. For the
# freshly label-encoded columns this changes nothing, but it keeps the cell
# idempotent: running it again no longer stacks the offsets a second time.
for col in feature_cols:
    df[col] = df[col] - df[col].min() + feature_offsets[col]

print('Offset - feature')
for feat, offset in feature_offsets.items():
    print(f'{offset:<6} - {feat}')

Offset - feature
0      - userId_index
6040   - movieId_index
9746   - gender_index
9748   - age_index
9755   - occupation_index


In [5]:
THRES = 5               # newest ratings per user held out for validation
N_TRAIN_PER_USER = 15   # oldest ratings per user used for training
cols = ['rating', *feature_cols]

# df is sorted by (userId, time), so per user head() yields the oldest ratings
# and tail() the newest ones. The two slices only stay disjoint when every
# user has at least N_TRAIN_PER_USER + THRES ratings, so check that explicitly
# instead of silently leaking validation rows into the training set.
ratings_per_user = df.groupby('userId_index').size()
assert ratings_per_user.min() >= N_TRAIN_PER_USER + THRES, (
    'train/val split would overlap: some user has fewer than '
    f'{N_TRAIN_PER_USER + THRES} ratings'
)

df_train = df[cols].groupby('userId_index').head(N_TRAIN_PER_USER).reset_index(drop=True)
df_val = df[cols].groupby('userId_index').tail(THRES).reset_index(drop=True)
print(f'df_train shape: {df_train.shape}')
print(f'df_val shape: {df_val.shape}')
df_train.head(3)

df_train shape: (90600, 6)
df_val shape: (30200, 6)


Unnamed: 0,rating,userId_index,movieId_index,gender_index,age_index,occupation_index
0,4,0,9009,9746,9748,9765
1,5,0,7218,9746,9748,9765
2,4,0,7614,9746,9748,9765


## Creating dataset and dataloader

In [82]:
class MovieDataSet(Dataset):
    """Map-style dataset that serves (features, rating) pairs from a DataFrame.

    Args:
        df: source DataFrame; one row per sample.
        x_feats: list of column names used as model inputs.
        y_feat: name of the target column.
    """

    def __init__(self, df, x_feats, y_feat):
        super().__init__()
        self.df = df
        # Convert to NumPy once so that __getitem__ is plain array indexing.
        self.y_rating = df[y_feat].to_numpy()
        self.x_feats = df[x_feats].to_numpy()

    def __len__(self):
        """Number of samples, i.e. rows of the wrapped DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the feature row and the target value at position ``idx``."""
        features = self.x_feats[idx]
        target = self.y_rating[idx]
        return features, target

BATCH_SIZE = 1024
ds_train = MovieDataSet(df_train, feature_cols, 'rating')
ds_val = MovieDataSet(df_val, feature_cols, 'rating')
# Training batches are reshuffled on every pass over the data.
dl_train = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
# Validation is only used to measure the loss, so it is iterated in a fixed
# order. With shuffling, the smaller last batch would hold different rows on
# every epoch and make the averaged validation loss fluctuate for no reason.
dl_val = DataLoader(ds_val, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

## Creating the model

In [85]:
# Sanity check: shape of the feature tensor of one validation batch
next(iter(dl_val))[0].shape

torch.Size([1024, 5])

In [145]:
class FMModel(nn.Module):
    """Second-order factorization machine over index-encoded features.

    All features share a single embedding table and a single bias vector;
    each input value is a row index into both.

    Args:
        num_feats: number of rows in the embedding table / bias vector.
        emb_dim: size of each latent embedding vector.
    """

    def __init__(self, num_feats, emb_dim):
        super().__init__()
        self.embeddings = nn.Embedding(num_feats, emb_dim)
        self.bias = nn.Parameter(torch.zeros(num_feats))   # per-index linear term
        self.offset = nn.Parameter(torch.zeros(1))         # global bias
        nn.init.xavier_normal_(self.embeddings.weight)

    def forward(self, x):
        """Predict one score per sample.

        Args:
            x: integer index tensor of shape (BATCH, feature_length).

        Returns:
            Tensor of shape (BATCH,) squashed into the open range (0.5, 5.5).
        """
        vectors = self.embeddings(x)                    # (BATCH, feature_length, embeddings)

        # Pairwise interactions via the FM identity:
        #   sum_{i<j} <v_i, v_j> = 0.5 * sum_k [ (sum_i v_ik)^2 - sum_i v_ik^2 ]
        square_of_sum = vectors.sum(dim=1) ** 2         # (BATCH, embeddings)
        sum_of_square = (vectors ** 2).sum(dim=1)       # (BATCH, embeddings)
        interactions = (square_of_sum - sum_of_square).sum(dim=1) * 0.5

        # Linear term: look up the scalar bias of every active index and add
        # them up per sample -> (BATCH,)
        linear = self.bias[x].sum(dim=1)

        score = interactions + (linear + self.offset)
        return self.sigmoid_range(score, low=0.5)

    def sigmoid_range(self, x, low=0, high=5.5):
        """ Sigmoid function with range (low, high) """
        span = high - low
        return torch.sigmoid(x) * span + low
    
    


In [129]:
# Device that the model and every batch are moved to
device = 'cpu'

In [149]:
# Optimisation hyper-parameters
CFG = {
    'lr': 0.001,
    'num_epochs': 15,
    'weight_decay': 0.01,
}
# Size of the shared embedding table = largest feature index in either split.
# Only the feature columns are inspected; the target column `rating` is left
# out so that a label value can never influence the table size.
n_feats = int(pd.concat([df_train, df_val])[feature_cols].max().max())
n_feats = n_feats + 1 # "+ 1" to account for 0 - indexing
model = FMModel(n_feats, emb_dim=8)
model.to(device)
opt = optim.AdamW(model.parameters(), lr=CFG['lr'], weight_decay=CFG['weight_decay'])
loss_fn = nn.MSELoss()

In [150]:
# Per-epoch history of the mean batch loss on each split
epoch_train_losses, epoch_val_losses = [], []
for i in range(CFG["num_epochs"]):
    train_losses, val_losses = [], []

    # ---- training pass: one optimizer step per batch ----
    model.train()
    for xb, yb in dl_train:
        # MSELoss needs a float target; the ratings arrive as integers
        xb, yb = xb.to(device), yb.to(device, dtype=torch.float)
        preds = model(xb)
        loss = loss_fn(preds, yb) # input, true
        train_losses.append(loss.item())
        opt.zero_grad()
        loss.backward()
        opt.step()

    # ---- validation pass: no gradients, no parameter updates ----
    model.eval()
    with torch.no_grad():
        for xb, yb in dl_val:
            # The moved batch must be bound to `xb` (it was previously bound
            # to an unused name, so the model received the un-moved tensor,
            # which fails as soon as `device` is not the CPU).
            xb, yb = xb.to(device), yb.to(device, dtype=torch.float)
            preds = model(xb)
            loss = loss_fn(preds, yb)
            val_losses.append(loss.item())

    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    epoch_train_losses.append(epoch_train_loss)
    epoch_val_losses.append(epoch_val_loss)
    s = (f'Epoch: {i}, Train Loss: {epoch_train_loss:0.2f}, '
         f'Val Loss: {epoch_val_loss:0.2f}'
        )
    print(s)

Epoch: 0, Train Loss: 1.43, Val Loss: 1.24
Epoch: 1, Train Loss: 1.07, Val Loss: 1.16
Epoch: 2, Train Loss: 0.97, Val Loss: 1.10
Epoch: 3, Train Loss: 0.91, Val Loss: 1.07
Epoch: 4, Train Loss: 0.87, Val Loss: 1.06
Epoch: 5, Train Loss: 0.84, Val Loss: 1.05
Epoch: 6, Train Loss: 0.82, Val Loss: 1.05
Epoch: 7, Train Loss: 0.81, Val Loss: 1.05
Epoch: 8, Train Loss: 0.80, Val Loss: 1.05
Epoch: 9, Train Loss: 0.79, Val Loss: 1.06
Epoch: 10, Train Loss: 0.79, Val Loss: 1.06
Epoch: 11, Train Loss: 0.78, Val Loss: 1.06
Epoch: 12, Train Loss: 0.78, Val Loss: 1.06
Epoch: 13, Train Loss: 0.78, Val Loss: 1.07
Epoch: 14, Train Loss: 0.77, Val Loss: 1.07
