In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import unicodedata
import json
import os
import concurrent.futures
import time
from surprise import *
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV
import glob
import matplotlib.pyplot as plt

In [7]:
# load reviews
def load_reviews_surprise(filename):
    # read jsonl file and convert to pandas dataframe
    reviews = []
    with open(filename) as f:
        for line in f:
            reviews.append(json.loads(line))
    ratings_df = pd.DataFrame(reviews)
    ratings_df = ratings_df.dropna()  # Remove rows with NaN values
    # Define a Surprise Reader object
    reader = Reader(rating_scale=(0, 5))

    # Load dataset into Surprise format
    data = Dataset.load_from_df(ratings_df[['user', 'title', 'rating']], reader)
    return data

def load_reviews_pandas(filename):
    # read jsonl file and convert to pandas dataframe
    reviews = []
    with open(filename) as f:
        for line in f:
            reviews.append(json.loads(line))
    ratings_df = pd.DataFrame(reviews)
    ratings_df = ratings_df.dropna()  # Remove rows with NaN values

    return ratings_df[['user', 'title', 'rating']]

# Load in small (100) review data

In [2]:
reviews = []
filename = '../letterboxd_proj_data/usernames_sample_100.jsonl'
with open(filename) as f:
    for line in f:
        reviews.append(json.loads(line))

In [3]:
ratings_df = pd.DataFrame(reviews)
ratings_df = ratings_df.dropna()  # Remove rows with NaN values
# Define a Surprise Reader object
reader = Reader(rating_scale=(0, 5))

# Load dataset into Surprise format
data = Dataset.load_from_df(ratings_df[['user', 'title', 'rating']], reader)

# Create a train-test split
trainset, testset = train_test_split(data, test_size=0.2)

# Trying different surprise lib models

In [32]:
models = [SVD(), NMF(), SlopeOne(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), KNNBaseline()]

In [5]:
# for model in models:
#     print(model)
    
#     trainset = data.build_full_trainset()
#     start = time.time()
#     model.fit(trainset)
#     print(time.time()-start)

#     # Prepare testset for predictions
#     testset = trainset.build_testset()
#     predictions = model.test(testset)

#     # Evaluate the model
#     rmse = accuracy.rmse(predictions)

# 2 step param search with NMF

In [None]:
# Define the parameter grid
param_grid = {
    'n_factors': [5, 10],  # Number of latent factors
    'n_epochs': [10, 50, 100],   # Number of epochs
    'lr_bi': [0.002, 0.005],  # Learning rate
    'lr_bu': [0.002, 0.005],  # Learning rate
    # 'reg_pu': [0.01, 0.02, 0],  # Regularization term
    # 'reg_qi': [0.01, 0.02, 0.1],  # Regularization term
    # 'reg_bi': [0.01, 0.02, 0.1],  # Regularization term
    'biased': [True],

}

# Set up GridSearchCV
grid_search = GridSearchCV(NMF, param_grid, measures=['rmse'], cv=5)

# Perform the grid search
grid_search.fit(data)

# Get the best parameters
print("Best parameters:", grid_search.best_params)

# Get the best RMSE score
print("Best RMSE:", grid_search.best_score)

# Get the best model
best_model = grid_search.best_estimator['rmse']

# Optionally, you can fit the best model on the full training set and evaluate it on the test set
trainset = data.build_full_trainset()
best_model.fit(trainset)
testset = trainset.build_testset()
predictions = best_model.test(testset)

# Evaluate the model
rmse = accuracy.rmse(predictions)
print(f"Test RMSE: {rmse}")

Best parameters: {'rmse': {'n_factors': 10, 'n_epochs': 100, 'lr_bi': 0.005, 'lr_bu': 0.005, 'biased': True}}
Best RMSE: {'rmse': 0.8371488540008099}
RMSE: 0.5699
Test RMSE: 0.5698605988228671


In [8]:
# Define the parameter grid
param_grid = {
    'n_factors': [10],  # Number of latent factors
    'n_epochs': [100, 200],   # Number of epochs
    'lr_bi': [0.005, 0.01, 0.1],  # Learning rate
    'lr_bu': [0.005, 0.01, 0.1],  # Learning rate
    # 'reg_pu': [0.01, 0.02, 0],  # Regularization term
    # 'reg_qi': [0.01, 0.02, 0.1],  # Regularization term
    # 'reg_bi': [0.01, 0.02, 0.1],  # Regularization term
    'biased': [True],

}

# Set up GridSearchCV
grid_search = GridSearchCV(NMF, param_grid, measures=['rmse'], cv=5)

# Perform the grid search
grid_search.fit(data)

# Get the best parameters
print("Best parameters:", grid_search.best_params)

# Get the best RMSE score
print("Best RMSE:", grid_search.best_score)

# Get the best model
best_model = grid_search.best_estimator['rmse']

# Optionally, you can fit the best model on the full training set and evaluate it on the test set
trainset = data.build_full_trainset()
best_model.fit(trainset)
testset = trainset.build_testset()
predictions = best_model.test(testset)

# Evaluate the model
rmse = accuracy.rmse(predictions)
print(f"Test RMSE: {rmse}")

Best parameters: {'rmse': {'n_factors': 10, 'n_epochs': 200, 'lr_bi': 0.01, 'lr_bu': 0.005, 'biased': True}}
Best RMSE: {'rmse': 0.8219330938202342}
RMSE: 0.5383
Test RMSE: 0.5382900734239495


# NMF with best params with larger dataset
- Best parameters: {'rmse': {'n_factors': 10, 'n_epochs': 200, 'lr_bi': 0.01, 'lr_bu': 0.005, 'biased': True}}

In [6]:
data_1000 = load_reviews('../letterboxd_proj_data/usernames_sample_100_1000.jsonl')

In [13]:
NMF_model = NMF(n_factors = 10, n_epochs = 200, lr_bi = 0.01, lr_bu = 0.005, biased = True)
# Optionally, you can fit the best model on the full training set and evaluate it on the test set
trainset = data_1000.build_full_trainset()
NMF_model.fit(trainset)
testset = trainset.build_testset()
predictions = NMF_model.test(testset)

# Evaluate the model
rmse = accuracy.rmse(predictions)
print(f"Test RMSE: {rmse}")

RMSE: 0.6724
Test RMSE: 0.6723623926750961


In [8]:
# linear regression with user and item means
data_1000.head()

AttributeError: 'DatasetAutoFolds' object has no attribute 'head'