In [7]:
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under that mount point. This import only exists on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dot, Dense, Add, Concatenate
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dropout
import numpy as np

## Load Data

In [5]:
import json

def load_data_from_json(filepath):
  """Loads data from a JSON file.

  The file is always decoded as UTF-8 instead of the platform's default
  encoding, so the result does not depend on the machine's locale.

  Args:
    filepath: The path to the JSON file.

  Returns:
    The parsed JSON content (whatever top-level value the file holds),
    or None if the file is missing, is not valid UTF-8, or is not valid
    JSON. A message describing the failure is printed in those cases.
  """
  try:
    # Explicit encoding: open() otherwise falls back to a locale-dependent
    # default, which can mis-decode non-ASCII review text.
    with open(filepath, 'r', encoding='utf-8') as file:
      return json.load(file)
  except FileNotFoundError:
    print(f"Error: File not found at {filepath}")
    return None
  except json.JSONDecodeError:
    print(f"Error: Invalid JSON format in {filepath}")
    return None
  except UnicodeDecodeError:
    # Raised while reading when the bytes on disk are not UTF-8.
    print(f"Error: File at {filepath} is not valid UTF-8")
    return None

In [8]:
data = load_data_from_json('/content/drive/MyDrive/CMPE256_COPY/dataset/filter_all_t.json')

# Handle the failure cases first; the DataFrame is only built on the happy path.
if not data:
    print("Error: Data not loaded from JSON file.")
elif 'train' not in data or not isinstance(data['train'], list):
    # The 'train' entry is expected to be a list of review records.
    print("Error: 'train' key not found or not a list in the JSON data.")
else:
    df = pd.DataFrame(data['train'])  # One row per record in the 'train' list
    print(df.head())

                business_id                user_id  rating  \
0  60567465d335d0abfb415b26  101074926318992653684       4   
1  6050fa9f5b4ccec8d5cae994  117065749986299237881       5   
2  604be10877e81aaed3cc9a1e  106700937793048450809       4   
3  60411e017cd8bf130362365a  101643045857250355161       5   
4  604139dd7cd8bf1303624208  109802745326785766951       4   

                                         review_text  \
0  The tang of the tomato sauce is outstanding. A...   
1              Chicken and waffles were really good!   
2  The appetizer of colossal shrimp was very good...   
3  The fish tacos here  omg! The salad was great ...   
4  Ribs are great, as are the mac and cheese, fri...   

                                                pics  \
0  [AF1QipM-2IRmvitARbcJr7deWfe5hyVBg_ArPMQSYvq0,...   
1     [AF1QipMpfxIZUT_aymQ3qPGO-QgGYzxbtLZGmHufAp2s]   
2  [AF1QipMNnqM5X9sSyZ9pXRZ1jvrURHN9bZhGdzuEXoP8,...   
3  [AF1QipM-a6AGGp4Hgk5RD0gY5sDRp5kEfB1hZLvlRkft,...   
4     [AF1

In [9]:
# Show the column labels of the review table.
df.columns

Index(['business_id', 'user_id', 'rating', 'review_text', 'pics',
       'history_reviews'],
      dtype='object')

In [10]:
# Largest rating value present in the data (top of the observed rating scale).
rating_column = df['rating']
max_rating = rating_column.max()
print(f"The maximum rating in the dataset is: {max_rating}")

The maximum rating in the dataset is: 5


In [11]:
# Smallest rating value present in the data (bottom of the observed rating scale).
rating_column = df['rating']
min_rating = rating_column.min()
print(f"The minimum rating in the dataset is: {min_rating}")

The minimum rating in the dataset is: 1


In [12]:
# Compare the number of rows carrying a user id (one per review) with the
# number of distinct users behind them.
user_id_column = df['user_id']
total_user_ids = len(user_id_column)
unique_user_ids = user_id_column.nunique()
print(f"Total number of User IDs: {total_user_ids}")
print(f"Number of unique User IDs: {unique_user_ids}")

Total number of User IDs: 87013
Number of unique User IDs: 29596


In [13]:
# Compare the number of rows carrying a business id (one per review) with the
# number of distinct businesses behind them.
business_id_column = df['business_id']
total_business_ids = len(business_id_column)
unique_business_ids = business_id_column.nunique()
print(f"Total number of Business IDs: {total_business_ids}")
print(f"Number of unique Business IDs: {unique_business_ids}")

Total number of Business IDs: 87013
Number of unique Business IDs: 27896


## Item-Based Collaborative Filtering

In [14]:
from collections import defaultdict

In [15]:
usersPerItem = defaultdict(set)  # Maps an item to the users who rated it (business profile)
itemsPerUser = defaultdict(set)  # Maps a user to the items that they rated (user profile)

# Maps a (user, item) pair to its rating. If the same pair occurs in several
# rows, the rating from the last such row is the one kept.
ratingDict = {}

# Walk the three columns in lockstep instead of df.iterrows(): iterrows()
# materialises a full Series for every row, which is needless overhead when
# only three fields are read.
for user, item, rating in zip(df['user_id'], df['business_id'], df['rating']):
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    ratingDict[(user, item)] = rating

In [16]:
# Mean rating given by each user, taken over the items in that user's profile.
userAverages = {}
for user, rated_items in itemsPerUser.items():
    given = [ratingDict[(user, item)] for item in rated_items]
    userAverages[user] = sum(given) / len(given)

# Mean rating received by each item, taken over the users in that item's profile.
itemAverages = {}
for item, raters in usersPerItem.items():
    received = [ratingDict[(user, item)] for user in raters]
    itemAverages[item] = sum(received) / len(received)

In [17]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity |s1 ∩ s2| / |s1 ∪ s2| of two sets.

    When both sets are empty the union is empty and the similarity is
    defined here as 0.
    """
    union_size = len(s1.union(s2))
    if not union_size:
        return 0
    overlap_size = len(s1.intersection(s2))
    return overlap_size / union_size

In [18]:
def mostSimilar(i, N):
    """Return the N items most similar to item ``i``.

    Similarity is the Jaccard similarity between the set of users who rated
    ``i`` and the set of users who rated each other item.

    Args:
        i: Identifier of the query item.
        N: Number of (similarity, item) pairs to return.

    Returns:
        A list of up to N ``(similarity, item_id)`` tuples, sorted in
        descending order (ties on similarity are ordered by item id).
    """
    # .get() rather than usersPerItem[i]: indexing a defaultdict with an
    # unknown item would silently insert an empty entry for it.
    users = usersPerItem.get(i, set())
    similarities = []
    for i2, otherUsers in usersPerItem.items():
        if i2 == i:
            continue  # never report the query item as similar to itself
        similarities.append((Jaccard(users, otherUsers), i2))
    similarities.sort(reverse=True)
    return similarities[:N]

In [19]:
# Index the full review rows both by the user who wrote them and by the
# business they are about, so either side can be looked up directly.
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

for _, review in df.iterrows():
    reviewsPerUser[review['user_id']].append(review)
    reviewsPerItem[review['business_id']].append(review)

In [20]:
# Global mean rating, used as the fallback prediction when no similarity
# information is available. Computed directly on the column instead of
# looping over df.iterrows(); float() keeps it a plain Python float.
ratingMean = float(df['rating'].mean())
print(ratingMean)

4.465252318619057


In [21]:
def predictRating(user, item):
    """Predict ``user``'s rating of ``item`` with item-based collaborative filtering.

    The prediction is the average of the user's ratings on their other
    items, each weighted by the Jaccard similarity between that item's rater
    set and ``item``'s rater set.

    Args:
        user: Identifier of the user.
        item: Identifier of the item to predict a rating for.

    Returns:
        The similarity-weighted average rating, or the global ``ratingMean``
        when the user has no other reviewed item with non-zero similarity.
    """
    # Non-mutating lookups: indexing the defaultdicts with an unseen user or
    # item would silently insert empty entries into them.
    itemUsers = usersPerItem.get(item, set())
    ratings = []
    similarities = []
    for d in reviewsPerUser.get(user, ()):
        i2 = d['business_id']
        if i2 == item:
            continue  # do not use the rating we are trying to predict
        ratings.append(d['rating'])  # rating of user for item i2
        # similarity of item i2 to the item for which we want to predict
        similarities.append(Jaccard(itemUsers, usersPerItem.get(i2, set())))
    totalSimilarity = sum(similarities)
    if totalSimilarity > 0:
        weightedRatings = [(x * y) for x, y in zip(ratings, similarities)]
        return sum(weightedRatings) / totalSimilarity  # weighted average
    # User hasn't rated any similar items
    return ratingMean

In [22]:
import math

In [23]:
def RMSE(predictions, labels):
    """Return the root-mean-square error between predictions and labels.

    Args:
        predictions: Iterable of predicted values.
        labels: Iterable of true values, aligned one-to-one with ``predictions``.

    Returns:
        sqrt(mean((prediction - label) ** 2)) as a float.

    Raises:
        ValueError: If the inputs differ in length (zip() would otherwise
            silently drop the unmatched tail and report a misleading score)
            or if they are empty (the mean is undefined).
    """
    predictions = list(predictions)
    labels = list(labels)
    if len(predictions) != len(labels):
        raise ValueError(
            f"predictions and labels must have the same length "
            f"(got {len(predictions)} and {len(labels)})"
        )
    if not predictions:
        raise ValueError("RMSE is undefined for empty input")
    differences = [(x - y) ** 2 for x, y in zip(predictions, labels)]
    return math.sqrt(sum(differences) / len(differences))

In [24]:
# Item-based prediction for every (user, business) pair in the table. The two
# id columns are zipped directly instead of using df.iterrows(), which builds
# a Series per row.
simPredictions = [
    predictRating(user, item)
    for user, item in zip(df['user_id'], df['business_id'])
]

In [25]:
# True ratings in row order, aligned with the prediction lists. tolist()
# replaces a full df.iterrows() pass that only read this one column.
labels = df['rating'].tolist()

In [26]:
# RMSE of the item-based predictor. Note this is an in-sample score: the rows
# being predicted are the same df rows the similarity sets were built from.
RMSE(simPredictions, labels)

0.9466303143791958

## User-Based Collaborative Filtering


In [27]:
def JaccardUser(u1, u2):
    """Return the Jaccard similarity between two users' rated-item sets.

    Args:
        u1: Identifier of the first user.
        u2: Identifier of the second user.

    Returns:
        |items(u1) ∩ items(u2)| / |items(u1) ∪ items(u2)|, or 0 when neither
        user has any rated item.
    """
    # .get() avoids inserting empty profiles into the itemsPerUser
    # defaultdict for users it has never seen; the set arithmetic itself is
    # delegated to the shared Jaccard helper.
    noItems = set()
    return Jaccard(itemsPerUser.get(u1, noItems), itemsPerUser.get(u2, noItems))

In [28]:
def predictRatingUser(user, item, N=10):
    """Predict ``user``'s rating of ``item`` with user-based collaborative filtering.

    The prediction is the average of the ratings other users gave ``item``,
    each weighted by the Jaccard similarity between that user's item set and
    ``user``'s item set.

    Args:
        user: Identifier of the user.
        item: Identifier of the item to predict a rating for.
        N: Currently unused; every other rater of ``item`` contributes to the
            prediction. Kept so existing calls remain valid.

    Returns:
        The similarity-weighted average rating. When no other rater has
        non-zero similarity to ``user``, falls back to the user's own average
        rating, or to the global ``ratingMean`` if the user has none.
    """
    # aggregate ratings from other users who rated this item; .get() avoids
    # inserting an empty entry into the usersPerItem defaultdict for an
    # unknown item
    ratings = []
    similarities = []
    for u2 in usersPerItem.get(item, ()):
        if u2 == user:
            continue  # do not use the rating we are trying to predict
        ratings.append(ratingDict[(u2, item)])
        similarities.append(JaccardUser(user, u2))
    totalSimilarity = sum(similarities)
    if totalSimilarity > 0:
        weightedRatings = [r * s for r, s in zip(ratings, similarities)]
        return sum(weightedRatings) / totalSimilarity
    # fallback to this user's average (or global mean)
    return userAverages.get(user, ratingMean)

In [29]:
# Evaluate user-based CF on all (user,item) pairs
userPredictions = [
    predictRatingUser(d['user_id'], d['business_id'])
    for _, d in df.iterrows()
]
print("User-based CF RMSE:", RMSE(userPredictions, labels))

User-based CF RMSE: 0.8902976257225924
