In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Add, Dense, Dropout, Concatenate
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dropout
import numpy as np

## Load Data

In [3]:
import json

def load_data_from_json(filepath):
  """Loads data from a JSON file.

  Args:
    filepath: The path to the JSON file.

  Returns:
    A dictionary containing the data loaded from the JSON file,
    or None if an error occurs.
  """
  try:
    with open(filepath, 'r') as file:
      data = json.load(file)
    return data
  except FileNotFoundError:
    print(f"Error: File not found at {filepath}")
    return None
  except json.JSONDecodeError:
    print(f"Error: Invalid JSON format in {filepath}")
    return None

In [4]:
data = load_data_from_json('/content/drive/MyDrive/CMPE256_COPY/dataset/filter_all_t.json')

# Check if data is loaded successfully
if data:
    # Assuming 'train' key contains the desired data
    if 'train' in data and isinstance(data['train'], list):
        df = pd.DataFrame(data['train'])  # Create DataFrame from 'train' list
        print(df.head())
    else:
        print("Error: 'train' key not found or not a list in the JSON data.")
else:
    print("Error: Data not loaded from JSON file.")

                business_id                user_id  rating  \
0  60567465d335d0abfb415b26  101074926318992653684       4   
1  6050fa9f5b4ccec8d5cae994  117065749986299237881       5   
2  604be10877e81aaed3cc9a1e  106700937793048450809       4   
3  60411e017cd8bf130362365a  101643045857250355161       5   
4  604139dd7cd8bf1303624208  109802745326785766951       4   

                                         review_text  \
0  The tang of the tomato sauce is outstanding. A...   
1              Chicken and waffles were really good!   
2  The appetizer of colossal shrimp was very good...   
3  The fish tacos here  omg! The salad was great ...   
4  Ribs are great, as are the mac and cheese, fri...   

                                                pics  \
0  [AF1QipM-2IRmvitARbcJr7deWfe5hyVBg_ArPMQSYvq0,...   
1     [AF1QipMpfxIZUT_aymQ3qPGO-QgGYzxbtLZGmHufAp2s]   
2  [AF1QipMNnqM5X9sSyZ9pXRZ1jvrURHN9bZhGdzuEXoP8,...   
3  [AF1QipM-a6AGGp4Hgk5RD0gY5sDRp5kEfB1hZLvlRkft,...   
4     [AF1

In [5]:
df.keys()

Index(['business_id', 'user_id', 'rating', 'review_text', 'pics',
       'history_reviews'],
      dtype='object')

In [6]:
max_rating = df['rating'].max()
print(f"The maximum rating in the dataset is: {max_rating}")

The maximum rating in the dataset is: 5


In [7]:
min_rating = df['rating'].min()
print(f"The minimum rating in the dataset is: {min_rating}")

The minimum rating in the dataset is: 1


In [8]:
unique_user_ids = df['user_id'].nunique()
total_user_ids = len(df['user_id'])
print(f"Total number of User IDs: {total_user_ids}")
print(f"Number of unique User IDs: {unique_user_ids}")

Total number of User IDs: 87013
Number of unique User IDs: 29596


In [9]:
unique_business_ids = df['business_id'].nunique()
total_business_ids = len(df['business_id'])
print(f"Total number of Business IDs: {total_business_ids}")
print(f"Number of unique Business IDs: {unique_business_ids}")

Total number of Business IDs: 87013
Number of unique Business IDs: 27896


In [10]:
#preprocess review_text into embeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# 1A) Fill any NAs
df['review_text'] = df['review_text'].fillna('')

# 1B) Compute a TF-IDF matrix (n_samples × vocab_size)
tfidf = TfidfVectorizer(max_features=5_000, stop_words='english')
X_tfidf = tfidf.fit_transform(df['review_text'])

# 1C) Reduce to a dense 64-dim embedding per review
svd = TruncatedSVD(n_components=64, random_state=42)
text_emb = svd.fit_transform(X_tfidf)

# 1D) Attach those 64 dims back to your DataFrame
text_cols = [f'text_emb_{i}' for i in range(64)]
df_text = pd.DataFrame(text_emb, columns=text_cols, index=df.index)
df = pd.concat([df, df_text], axis=1)

## Matrix Factorization Collaborative Filtering

In [11]:
'''
Encoding user_id and business_id in df
'''

# Separate encoders
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

# Fit on training data only
df['user_id_encoded'] = user_encoder.fit_transform(df['user_id'])
df['business_id_encoded'] = business_encoder.fit_transform(df['business_id'])

In [12]:
# Normalize
df['rating_normalized'] = (df['rating'] -min_rating)/(max_rating - min_rating)

In [13]:
df.head()

Unnamed: 0,business_id,user_id,rating,review_text,pics,history_reviews,text_emb_0,text_emb_1,text_emb_2,text_emb_3,...,text_emb_57,text_emb_58,text_emb_59,text_emb_60,text_emb_61,text_emb_62,text_emb_63,user_id_encoded,business_id_encoded,rating_normalized
0,60567465d335d0abfb415b26,101074926318992653684,4,The tang of the tomato sauce is outstanding. A...,"[AF1QipM-2IRmvitARbcJr7deWfe5hyVBg_ArPMQSYvq0,...",[[101074926318992653684_6056272797d555cc6fb0d1...,0.090685,0.028812,-0.020918,-0.015585,...,0.064262,-0.088032,-0.012349,-0.017169,0.009698,0.011253,-0.069798,1854,26649,0.75
1,6050fa9f5b4ccec8d5cae994,117065749986299237881,5,Chicken and waffles were really good!,[AF1QipMpfxIZUT_aymQ3qPGO-QgGYzxbtLZGmHufAp2s],[[117065749986299237881_605206f8d8c08f462b93e8...,0.319936,-0.117113,-0.174905,0.246191,...,0.003411,0.076694,-0.081512,-0.033307,0.075623,-0.132462,-0.135888,27375,19273,1.0
2,604be10877e81aaed3cc9a1e,106700937793048450809,4,The appetizer of colossal shrimp was very good...,"[AF1QipMNnqM5X9sSyZ9pXRZ1jvrURHN9bZhGdzuEXoP8,...",[[106700937793048450809_6044300b27f39b7b5d1dbf...,0.203961,-0.036089,-0.021253,0.042145,...,0.003738,-0.011642,0.008768,-0.011292,0.003472,0.010374,0.008503,10822,13238,0.75
3,60411e017cd8bf130362365a,101643045857250355161,5,The fish tacos here omg! The salad was great ...,"[AF1QipM-a6AGGp4Hgk5RD0gY5sDRp5kEfB1hZLvlRkft,...",[[101643045857250355161_604fbdd099686c10168c91...,0.20398,0.001087,-0.039955,0.030524,...,-0.008188,0.016564,-0.040966,-0.001074,-0.021022,-0.031655,-0.012074,2779,569,1.0
4,604139dd7cd8bf1303624208,109802745326785766951,4,"Ribs are great, as are the mac and cheese, fri...",[AF1QipNVys4yq-5w_3EsDdHpSc9ZNb7Nl30Mfb6Y0Gup],[[109802745326785766951_60524fa9f09a4ffff042f9...,0.174151,-0.025524,0.100808,-0.053062,...,0.085499,-0.009104,0.116744,-0.129016,0.053597,-0.008909,0.007128,15813,667,0.75


In [14]:
# Train/test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_user = train_df['user_id_encoded'].values
train_biz  = train_df['business_id_encoded'].values
train_text = train_df[text_cols].values
train_r    = train_df['rating_normalized'].values

test_user = test_df['user_id_encoded'].values
test_biz  = test_df['business_id_encoded'].values
test_text = test_df[text_cols].values
test_r    = test_df['rating_normalized'].values

In [15]:
# Build the model with an extra text branch

num_users = df['user_id_encoded'].nunique()
num_biz   = df['business_id_encoded'].nunique()

user_in = Input(shape=(1,), name='user_input')
biz_in  = Input(shape=(1,), name='business_input')
text_in = Input(shape=(64,), name='text_input')

# Embedding towers
user_emb  = Embedding(input_dim=num_users+1, output_dim=64, name='user_emb')(user_in)
user_vec  = Flatten()(user_emb)

biz_emb   = Embedding(input_dim=num_biz+1, output_dim=64, name='biz_emb')(biz_in)
biz_vec   = Flatten()(biz_emb)

# Bias terms
user_bias = Flatten()(Embedding(num_users+1, 1, name='user_bias')(user_in))
biz_bias  = Flatten()(Embedding(num_biz+1, 1, name='biz_bias')(biz_in))

# Interaction (MF) part
dot       = Dot(axes=1)([user_vec, biz_vec])
interaction = Add()([dot, user_bias, biz_bias])

# Inserted: text branch
x_text    = Dense(64, activation='relu')(text_in)
x_text    = Dropout(0.2)(x_text)

# Combine\ n# Modified: include text branch in concatenation
x = Concatenate()([interaction, x_text])
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(32, activation='relu')(x)

# Output\ n# (Unchanged)
output = Dense(1, activation='linear')(x)

# Model definition
# Modified: now three inputs
model = Model(inputs=[user_in, biz_in, text_in], outputs=output)
model.compile(optimizer='adam', loss='mse')
model.summary()

In [16]:
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [17]:
# Check if GPU is available
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("CPU")

Default GPU Device: /device:GPU:0


In [19]:
epochs = 10  # adjust as needed
batch_size = 512

with tf.device('/device:GPU:0'):
  history = model.fit(
      [train_user, train_biz, train_text], train_r,
      validation_data=([test_user, test_biz, test_text], test_r),
      epochs=epochs,
      batch_size=batch_size
  )

Epoch 1/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 33ms/step - loss: 0.2430 - val_loss: 0.0411
Epoch 2/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0457 - val_loss: 0.0406
Epoch 3/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0316 - val_loss: 0.0427
Epoch 4/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.0135 - val_loss: 0.0406
Epoch 5/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0066 - val_loss: 0.0413
Epoch 6/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0042 - val_loss: 0.0411
Epoch 7/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0026 - val_loss: 0.0419
Epoch 8/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0018 - val_loss: 0.0428
Epoch 9/10
[1m136/136[0m [32m━━━━━━

In [20]:
path = '/content/drive/MyDrive/CMPE256_COPY/Models/NCF_text.keras'
model.save(path)

In [21]:
# Load model
from tensorflow import keras
path = '/content/drive/MyDrive/CMPE256_COPY/Models/NCF_text.keras'
model = keras.models.load_model(path)

## Evaluation
- MSE / RMSE
- Mean Absolute Error (MAE)
- Ranking: MRR (Mean Reciprocal Rank)
- Ranking: NDCG (Normalized Discounted Cumulative Gain)


In [24]:
'''

Evaluate the model on the test data with RMSE

'''
# Evaluate the model on the test data
loss = model.evaluate([test_user, test_biz, test_text], test_r, verbose=0)
print(f"Test Loss: {loss}")

# Predict normalized ratings for the test data
pred_norm = model.predict([test_user, test_biz, test_text])

# Denormalize the predicted ratings
pred = pred_norm * (max_rating - min_rating) + min_rating

# Original ratings for test data
test_orig = test_df['rating'].values

# Calculate RMSE
rmse = np.sqrt(np.mean((pred - test_orig)**2))
print(f"RMSE: {rmse}")

Test Loss: 0.04292108491063118
[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSE: 0.9292431971945311


In [28]:
# MAE
mae = mean_absolute_error(test_orig, pred)
print(f"MAE: {mae}")

MAE: 0.6155439019203186


In [29]:
# Define relevance threshold (e.g., >=4 is relevant)
def mrr_score(true_ratings, preds, threshold=4):
    order = np.argsort(preds)[::-1]
    for i, idx in enumerate(order):
        if true_ratings[idx] >= threshold:
            return 1.0 / (i + 1)
    return 0.0

In [30]:
def ndcg_score(true_ratings, preds, k=10):
    order = np.argsort(preds)[::-1][:k]
    dcg = 0.0
    for i, idx in enumerate(order):
        rel = true_ratings[idx]
        dcg += (2**rel - 1) / np.log2(i + 2)
    # Ideal DCG
    ideal_order = np.argsort(true_ratings)[::-1][:k]
    idcg = 0.0
    for i, idx in enumerate(ideal_order):
        rel = true_ratings[idx]
        idcg += (2**rel - 1) / np.log2(i + 2)
    return dcg / idcg if idcg > 0 else 0.0

In [32]:
# Compute metrics across users
mrrs = []
ndcgs = []
unique_users = np.unique(test_user)
for uid in unique_users:
    idx = np.where(test_user == uid)[0]
    tr = test_orig[idx]
    pr = pred[idx]
    if len(tr) > 0:
        mrrs.append(mrr_score(tr, pr))
        ndcgs.append(ndcg_score(tr, pr, k=min(10, len(tr))))

mrr = np.mean(mrrs) if mrrs else 0
ndcg = np.mean(ndcgs) if ndcgs else 0
print(f"MRR: {mrr}")
print(f"NDCG: {ndcg}")

MRR: 0.8886377335744424
NDCG: 0.9893022376947584
