<a href="https://colab.research.google.com/github/yliang412/restaurant-recommender/blob/main/restaurant_recommender_mf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Restaurant Recommender - Matrix Factorization

## Setup

In [1]:
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

# Replace this with the path in your drive if you created the shortcut somewhere else
path_in_my_drive = 'gdrive/My Drive/ml-final-project(mzhang3,yuchenl3,kehaoc)'

# # Uncomment if you need the original dataset
# !unzip 'gdrive/My Drive/ml-final-project(mzhang3,yuchenl3,kehaoc)/data/yelp-dataset.zip' -d 'data'

# Load the processed data frames
!cp -r 'gdrive/My Drive/ml-final-project(mzhang3,yuchenl3,kehaoc)/processed_data' .

Mounted at /content/gdrive


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import scipy.sparse as sp
from sklearn.model_selection import train_test_split

## Loading Data

In [3]:
def load_yelp_df(table_name: str, nrows=None):
    """Load an original yelp data frame."""

    chunks = pd.read_json(f'data/yelp_academic_dataset_{table_name}.json', lines=True, chunksize=100000, nrows=nrows)
    return pd.concat(chunks)


def load_processed_df(table_name: str, from_gdrive=False):
    """Load a processed data frame."""
    
    path = f'{path_in_my_drive}/processed_data/{table_name}.parquet.gzip' if from_gdrive \
        else f'processed_data/{table_name}.parquet.gzip'
    return pd.read_parquet(path)


def save_processed_df(df: pd.DataFrame, table_name: str):
    """Save a processed dataframe to google drive."""

    path = f'{path_in_my_drive}/processed_data/{table_name}.parquet.gzip'
    with open(path, 'wb') as f:
        df.to_parquet(f, compression='gzip')

In [4]:
def ratings_preprocess(nrows=None):
    """Preprocess the ratings data frame to be used for Matrix Factorization."""

    rating_df = load_processed_df('user_restaurant_rating')
    if nrows:
        rating_df = rating_df.loc[1:nrows, :]
    
    rating_df["user_id"] = rating_df["user_id"].astype("category")
    rating_df["business_id"] = rating_df["business_id"].astype("category")
    rating_df["stars"] = rating_df["stars"].astype("float32")

    cat_columns = rating_df.select_dtypes(['category']).columns
    rating_df[cat_columns] = rating_df[cat_columns].apply(lambda x: x.cat.codes)
    rating_numeric_df = rating_df[['user_id', 'business_id', 'stars']]
    return rating_numeric_df

def ratings_train_test_split(rating_numeric_df, test_size=0.2):
    """Create two disjoint matrix based on the user id."""

    all_users = rating_numeric_df['user_id'].unique()
    train_users, test_users = train_test_split(all_users, test_size=test_size)

    n_users = all_users.shape[0]
    n_restaurants = rating_numeric_df['business_id'].unique().shape[0]

    train_mat = sp.csr_matrix((rating_numeric_df['stars'], (rating_numeric_df['user_id'], rating_numeric_df['business_id'])), shape=(n_users, n_restaurants))
    test_mat = train_mat.copy()
    train_mat[train_users] = 0
    train_mat.eliminate_zeros()
    train_mat = train_mat.tocoo()
    test_mat[test_users] = 0
    test_mat.eliminate_zeros()
    test_mat = test_mat.tocoo()
    return train_users, train_mat, test_users, test_mat

In [5]:
class YelpRatingsDataset(Dataset):
    
    def __init__(self, mat):
        """Yelp dataset initializer.

        Arguments:
            - mat: user-restaurant rating matrix.
        """
        self.mat = mat

    def __len__(self):
        """Returns the length of the YelpRatingsDataset object.
        
        Returns:
            - length: number of unique user-restaurant pairs in the dataset.
        """

        return self.mat.nnz

    def __getitem__(self, index):
        """Returns a user-restaurant pair and the rating of the restaurant given by the user.
        
        Arguments:
            - index: the index of the sample, must be in [0, self.__len__()).
        Returns:
            - user_index: user index in the sample.
            - restaurant_index: restaurant index in the sample
            - rating: the rating of the restaurant given by the user.
        """

        user_index = self.mat.row[index]
        restaurant_index = self.mat.col[index]
        rating = self.mat.data[index]

        return (user_index, restaurant_index), rating

## Training and Validation

In [6]:
class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors, sparse=True)
        self.item_factors = torch.nn.Embedding(n_items, n_factors, sparse=True)

    def forward(self, user, item):
        return (self.user_factors(user) * self.item_factors(item)).sum(1)

In [7]:
class RecommendationPipeline:
    
    def __init__(self, mat_shape, n_factors=20, lr=1e-6):
        n_users, n_items = mat_shape
        self.model =  MatrixFactorization(n_users, n_items, n_factors=n_factors)
        self.loss_func = torch.nn.MSELoss()
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=lr)
       
    
    def train(self, dl):
        for (user_id, restaurant_id), rating in dl:
            # Set gradients to zero
            self.optimizer.zero_grad()

            # Predict and calculate loss
            prediction = self.model(user_id, restaurant_id)
            loss = self.loss_func(prediction, rating)

            # Backpropagate
            loss.backward()

            # Update the parameters
            self.optimizer.step()
    
    def model(self):
        return self.model

    
    def validate(self, dl):
        losses = []
        for (user_id, restaurant_id), rating in dl:
            # Predict and calculate loss
            prediction = self.model(user_id, restaurant_id)
            loss = self.loss_func(prediction, rating)
            # TODO: calculate like the average?
            losses.append(loss)

        return torch.Tensor(losses)

In [12]:
# Full pipeline
rating_numeric_df = ratings_preprocess(nrows=10_000)
train_users, train_mat, test_users, test_mat = ratings_train_test_split(rating_numeric_df, test_size=0.2)
train_dl, test_dl = DataLoader(YelpRatingsDataset(train_mat)), DataLoader(YelpRatingsDataset(test_mat))

pipeline = RecommendationPipeline(train_mat.shape, n_factors=20, lr=1e-6)
pipeline.train(train_dl)
losses = pipeline.validate(test_dl)
torch.mean(losses)

  self._set_arrayXarray(i, j, x)


tensor(36.7009)