In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dot, Dense, Add, Concatenate
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dropout
import numpy as np

## Load Data

In [None]:
import json

def load_data_from_json(filepath):
  """Loads data from a JSON file.

  Args:
    filepath: The path to the JSON file.

  Returns:
    A dictionary containing the data loaded from the JSON file,
    or None if an error occurs.
  """
  try:
    with open(filepath, 'r') as file:
      data = json.load(file)
    return data
  except FileNotFoundError:
    print(f"Error: File not found at {filepath}")
    return None
  except json.JSONDecodeError:
    print(f"Error: Invalid JSON format in {filepath}")
    return None

In [None]:
data = load_data_from_json('/content/drive/MyDrive/CMPE256/CMPE256_Project/dataset/filter_all_t.json')

# Check if data is loaded successfully
if data:
    # Assuming 'train' key contains the desired data
    if 'train' in data and isinstance(data['train'], list):
        df = pd.DataFrame(data['train'])  # Create DataFrame from 'train' list
        print(df.head())
    else:
        print("Error: 'train' key not found or not a list in the JSON data.")
else:
    print("Error: Data not loaded from JSON file.")

                business_id                user_id  rating  \
0  60567465d335d0abfb415b26  101074926318992653684       4   
1  6050fa9f5b4ccec8d5cae994  117065749986299237881       5   
2  604be10877e81aaed3cc9a1e  106700937793048450809       4   
3  60411e017cd8bf130362365a  101643045857250355161       5   
4  604139dd7cd8bf1303624208  109802745326785766951       4   

                                         review_text  \
0  The tang of the tomato sauce is outstanding. A...   
1              Chicken and waffles were really good!   
2  The appetizer of colossal shrimp was very good...   
3  The fish tacos here  omg! The salad was great ...   
4  Ribs are great, as are the mac and cheese, fri...   

                                                pics  \
0  [AF1QipM-2IRmvitARbcJr7deWfe5hyVBg_ArPMQSYvq0,...   
1     [AF1QipMpfxIZUT_aymQ3qPGO-QgGYzxbtLZGmHufAp2s]   
2  [AF1QipMNnqM5X9sSyZ9pXRZ1jvrURHN9bZhGdzuEXoP8,...   
3  [AF1QipM-a6AGGp4Hgk5RD0gY5sDRp5kEfB1hZLvlRkft,...   
4     [AF1

In [None]:
df.keys()

Index(['business_id', 'user_id', 'rating', 'review_text', 'pics',
       'history_reviews'],
      dtype='object')

In [None]:
max_rating = df['rating'].max()
print(f"The maximum rating in the dataset is: {max_rating}")

The maximum rating in the dataset is: 5


In [None]:
min_rating = df['rating'].min()
print(f"The minimum rating in the dataset is: {min_rating}")

The minimum rating in the dataset is: 1


In [None]:
unique_user_ids = df['user_id'].nunique()
total_user_ids = len(df['user_id'])
print(f"Total number of User IDs: {total_user_ids}")
print(f"Number of unique User IDs: {unique_user_ids}")

Total number of User IDs: 87013
Number of unique User IDs: 29596


In [None]:
unique_business_ids = df['business_id'].nunique()
total_business_ids = len(df['business_id'])
print(f"Total number of Business IDs: {total_business_ids}")
print(f"Number of unique Business IDs: {unique_business_ids}")

Total number of Business IDs: 87013
Number of unique Business IDs: 27896


## Matrix_Factorization SVD



In [None]:
!pip uninstall numpy

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Would remove:
    /usr/local/bin/f2py
    /usr/local/bin/numpy-config
    /usr/local/lib/python3.11/dist-packages/numpy-2.0.2.dist-info/*
    /usr/local/lib/python3.11/dist-packages/numpy.libs/libgfortran-040039e1-0352e75f.so.5.0.0
    /usr/local/lib/python3.11/dist-packages/numpy.libs/libquadmath-96973f99-934c22de.so.0.0.0
    /usr/local/lib/python3.11/dist-packages/numpy.libs/libscipy_openblas64_-99b71e71.so
    /usr/local/lib/python3.11/dist-packages/numpy/*
Proceed (Y/n)? y
  Successfully uninstalled numpy-2.0.2


In [None]:
!pip install numpy==1.26.4

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.26.4


In [None]:
# need restart the section for numpy==1.26.4

In [None]:
!pip install pandas



In [None]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2505224 sha256=8166d5f2259df1751a9933f0bb74148669fcabf20cc26728dc649c3b4358f6b6
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163

In [None]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate, KFold
from surprise import accuracy # Import the accuracy module
import math

In [None]:
# Normalize
df['rating_normalized'] = (df['rating'] -min_rating)/(max_rating - min_rating)

In [None]:
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Define the reader object
reader = Reader(rating_scale=(min_rating, max_rating))

# Load the data into Surprise's Dataset format
data_surprise = Dataset.load_from_df(train[['user_id', 'business_id', 'rating_normalized']], reader)

# Define the SVD algorithm
model = SVD()

# Define the number of folds for cross-validation
k = 5
kf = KFold(n_splits=k)

# Perform k-fold cross-validation
for trainset, testset in kf.split(data_surprise):
    model.fit(trainset)
    predictions = model.test(testset)

    # Evaluate the model on the testset
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)

RMSE: 0.2504
MAE:  0.1371
RMSE: 0.2514
MAE:  0.1396
RMSE: 0.2508
MAE:  0.1377
RMSE: 0.2470
MAE:  0.1341
RMSE: 0.2521
MAE:  0.1409


In [None]:
sum_rmse = 0.2430+ 0.2511+ 0.2554+ 0.2506+ 0.2510
print(sum_rmse/5)

0.25022


In [None]:
sum_mae = 0.1319+0.1383+0.1404+0.1382+0.1398
print(sum_mae/5)

0.63228


## 5-Fold Corss Valdation Evaluation
- MSE / RMSE
- Mean Absolute Error (MAE)
- Ranking: MRR (Mean Reciprocal Rank)
- Ranking: NDCG (Normalized Discounted Cumulative Gain)


In [None]:
test_user_ids = test['user_id'].values
test_business_ids = test['business_id'].values
test_normalized_ratings = test['rating_normalized'].values
test_ratings = test['rating'].values

In [None]:
# Predict ratings for the test data
predicted_ratings = []
for user_id, business_id in zip(test_user_ids, test_business_ids):
    prediction = model.predict(user_id, business_id)  # predict for each user-item pair
    predicted_ratings.append(prediction.est) # est gives the rating prediction

predicted_ratings = np.array(predicted_ratings) # convert the list to array


# Denormalize the predicted ratings
predicted_ratings = predicted_ratings * (max_rating - min_rating) + min_rating

In [None]:
# Calculate RMSE
rmse = np.sqrt(np.mean((predicted_ratings - test_ratings)**2))
print(f"RMSE: {rmse}")

RMSE: 0.9888745146091031


In [None]:
'''

Evaluate the model on the test data with MAE

'''
# Calculate MAE
mae = np.mean(np.abs(predicted_ratings - test_ratings))
print(f"MAE: {mae}")

MAE: 0.5413042555427109


In [None]:
def calculate_mrr(predicted_ratings, test_ratings, k=10):
    """Calculates the Mean Reciprocal Rank (MRR)@k.

    Args:
        predicted_ratings: A NumPy array of predicted ratings.
        test_ratings: A NumPy array of true ratings.
        k: The number of top recommendations to consider.

    Returns:
        The MRR@k.
    """

    mrr_sum = 0
    for i in range(len(test_ratings)):
        # Get the top k recommendations for the current user.
        top_k_indices = np.argsort(predicted_ratings[i])[::-1][:k]

        # Find the rank of the highest-rated item in the top k recommendations.
        for rank, index in enumerate(top_k_indices):
          if index == np.argmax(test_ratings[i]):
            mrr_sum += 1 / (rank + 1)
            break

    return mrr_sum / len(test_ratings)

In [None]:
'''

Evaluate the model on the test data with MRR

'''
mrr_at_10 = calculate_mrr(predicted_ratings, test_ratings, k=10)
print(f"MRR@10: {mrr_at_10}")

MRR@10: 1.0


In [None]:
def dcg_at_k(relevance_scores, k):
    relevance_scores = np.asarray(relevance_scores, dtype=np.float64)[:k]
    if relevance_scores.size:
        return np.sum(relevance_scores / np.log2(np.arange(2, relevance_scores.size + 2)))
    return 0.0

In [None]:
def ndcg_at_k(predicted_scores, true_relevance, k):
    """
    predicted_scores: List or array of predicted scores (higher -> more relevant)
    true_relevance: List or array of ground truth relevance (1 for relevant & 0 for not)
    k: Rank position to evaluate at (k = 10)
    """
    # Sort by predicted scores
    order = np.argsort(predicted_scores)[::-1]
    true_relevance = np.take(true_relevance, order)

    dcg = dcg_at_k(true_relevance, k)
    ideal_dcg = dcg_at_k(sorted(true_relevance, reverse=True), k)

    return dcg / ideal_dcg if ideal_dcg > 0 else 0.0

In [None]:
'''

Evaluate the model on the test data with NDCG

'''
k =10
ndcg = ndcg_at_k(predicted_ratings, test_ratings, k)
print(f"NDCG@{k}: {ndcg:.2f}")

NDCG@10: 0.90
