In [None]:
# Mount Google Drive inside the Colab VM; the dataset and model paths used by
# later cells all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dot, Dense, Add, Concatenate
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dropout
import numpy as np

## Load Data

In [None]:
import json

def load_data_from_json(filepath):
  """Load and parse a JSON document from disk.

  The file is always decoded as UTF-8 (the encoding the JSON RFC recommends)
  rather than the platform's locale encoding, so non-ASCII text is read the
  same way on every machine.

  Args:
    filepath: The path to the JSON file.

  Returns:
    The parsed JSON content (a dict for an object document), or None if the
    file does not exist or does not contain valid JSON. In both failure
    cases a short message is printed.
  """
  try:
    with open(filepath, 'r', encoding='utf-8') as file:
      return json.load(file)
  except FileNotFoundError:
    print(f"Error: File not found at {filepath}")
  except json.JSONDecodeError:
    print(f"Error: Invalid JSON format in {filepath}")
  # Both handled failures fall through to a single "no data" result.
  return None

In [None]:
# Read the raw review dump from Drive, then build the working DataFrame from
# the list stored under its 'train' key.
data = load_data_from_json('/content/drive/MyDrive/CMPE256/CMPE256_Project/dataset/filter_all_t.json')

if not data:
    # The loader returned None (or an empty document).
    print("Error: Data not loaded from JSON file.")
elif 'train' in data and isinstance(data['train'], list):
    # One DataFrame row per entry of the 'train' list.
    df = pd.DataFrame(data['train'])
    print(df.head())
else:
    print("Error: 'train' key not found or not a list in the JSON data.")

                business_id                user_id  rating  \
0  60567465d335d0abfb415b26  101074926318992653684       4   
1  6050fa9f5b4ccec8d5cae994  117065749986299237881       5   
2  604be10877e81aaed3cc9a1e  106700937793048450809       4   
3  60411e017cd8bf130362365a  101643045857250355161       5   
4  604139dd7cd8bf1303624208  109802745326785766951       4   

                                         review_text  \
0  The tang of the tomato sauce is outstanding. A...   
1              Chicken and waffles were really good!   
2  The appetizer of colossal shrimp was very good...   
3  The fish tacos here  omg! The salad was great ...   
4  Ribs are great, as are the mac and cheese, fri...   

                                                pics  \
0  [AF1QipM-2IRmvitARbcJr7deWfe5hyVBg_ArPMQSYvq0,...   
1     [AF1QipMpfxIZUT_aymQ3qPGO-QgGYzxbtLZGmHufAp2s]   
2  [AF1QipMNnqM5X9sSyZ9pXRZ1jvrURHN9bZhGdzuEXoP8,...   
3  [AF1QipM-a6AGGp4Hgk5RD0gY5sDRp5kEfB1hZLvlRkft,...   
4     [AF1

In [None]:
# Show the column labels available in the reviews DataFrame.
df.columns

Index(['business_id', 'user_id', 'rating', 'review_text', 'pics',
       'history_reviews'],
      dtype='object')

In [None]:
# Largest rating present in the data; reused later to min-max scale ratings
# and to map predictions back to the original rating scale.
max_rating = df['rating'].max()
print(f"The maximum rating in the dataset is: {max_rating}")

The maximum rating in the dataset is: 5


In [None]:
# Smallest rating present in the data; reused later to min-max scale ratings
# and to map predictions back to the original rating scale.
min_rating = df['rating'].min()
print(f"The minimum rating in the dataset is: {min_rating}")

The minimum rating in the dataset is: 1


In [None]:
# Compare the number of review rows with the number of distinct reviewers.
total_user_ids = df['user_id'].size
unique_user_ids = df['user_id'].nunique()
print(f"Total number of User IDs: {total_user_ids}")
print(f"Number of unique User IDs: {unique_user_ids}")

Total number of User IDs: 87013
Number of unique User IDs: 29596


In [None]:
# Compare the number of review rows with the number of distinct businesses.
total_business_ids = df['business_id'].size
unique_business_ids = df['business_id'].nunique()
print(f"Total number of Business IDs: {total_business_ids}")
print(f"Number of unique Business IDs: {unique_business_ids}")

Total number of Business IDs: 87013
Number of unique Business IDs: 27896


## Matrix Factorization Collaborative Filtering

In [None]:
# Encode user_id and business_id in df as contiguous integer codes, the form
# the embedding layers index by.

# One encoder per ID column, each fitted on that column's values.
user_encoder = LabelEncoder().fit(df['user_id'])
business_encoder = LabelEncoder().fit(df['business_id'])

# NOTE: the encoders see every row of df -- this cell runs before the
# train/test split, so the codes cover both splits.
df['user_id_encoded'] = user_encoder.transform(df['user_id'])
df['business_id_encoded'] = business_encoder.transform(df['business_id'])

In [None]:
# Min-max scale the ratings: min_rating maps to 0 and max_rating maps to 1.
df['rating_normalized'] = df['rating'].sub(min_rating).div(max_rating - min_rating)

In [None]:
# Preview the first rows, now including the encoded-ID and normalized-rating columns.
df.head()

Unnamed: 0,business_id,user_id,rating,review_text,pics,history_reviews,user_id_encoded,business_id_encoded,rating_normalized
0,60567465d335d0abfb415b26,101074926318992653684,4,The tang of the tomato sauce is outstanding. A...,"[AF1QipM-2IRmvitARbcJr7deWfe5hyVBg_ArPMQSYvq0,...",[[101074926318992653684_6056272797d555cc6fb0d1...,1854,26649,0.75
1,6050fa9f5b4ccec8d5cae994,117065749986299237881,5,Chicken and waffles were really good!,[AF1QipMpfxIZUT_aymQ3qPGO-QgGYzxbtLZGmHufAp2s],[[117065749986299237881_605206f8d8c08f462b93e8...,27375,19273,1.0
2,604be10877e81aaed3cc9a1e,106700937793048450809,4,The appetizer of colossal shrimp was very good...,"[AF1QipMNnqM5X9sSyZ9pXRZ1jvrURHN9bZhGdzuEXoP8,...",[[106700937793048450809_6044300b27f39b7b5d1dbf...,10822,13238,0.75
3,60411e017cd8bf130362365a,101643045857250355161,5,The fish tacos here omg! The salad was great ...,"[AF1QipM-a6AGGp4Hgk5RD0gY5sDRp5kEfB1hZLvlRkft,...",[[101643045857250355161_604fbdd099686c10168c91...,2779,569,1.0
4,604139dd7cd8bf1303624208,109802745326785766951,4,"Ribs are great, as are the mac and cheese, fri...",[AF1QipNVys4yq-5w_3EsDdHpSc9ZNb7Nl30Mfb6Y0Gup],[[109802745326785766951_60524fa9f09a4ffff042f9...,15813,667,0.75


In [None]:
# Width of each user / business embedding vector.
embedding_dim = 64

# Number of distinct users and businesses; used to size the embedding tables.
unique_user_ids, unique_business_ids = (
    df[id_column].nunique() for id_column in ('user_id', 'business_id')
)

In [None]:
# ---- User tower ----

# One user code per example.
user_input_placeholder = Input(name='user_input', shape=(1,))

# Look up a dense vector of size embedding_dim for the user code; a small L2
# penalty regularizes the embedding table.
user_embedding = Embedding(
    unique_user_ids + 1,
    embedding_dim,
    embeddings_regularizer=l2(1e-6),
    name='user_embedding',
)(user_input_placeholder)

# Collapse the length-1 axis: (batch, 1, embedding_dim) -> (batch, embedding_dim)
user_embedding = Flatten()(user_embedding)

In [None]:
# ---- Business tower ----

# One business code per example.
business_input_placeholder = Input(name='business_input', shape=(1,))

# Look up a dense vector of size embedding_dim for the business code; a small
# L2 penalty regularizes the embedding table.
business_embedding = Embedding(
    unique_business_ids + 1,
    embedding_dim,
    embeddings_regularizer=l2(1e-6),
    name='business_embedding',
)(business_input_placeholder)

# Collapse the length-1 axis: (batch, 1, embedding_dim) -> (batch, embedding_dim)
business_embedding = Flatten()(business_embedding)

In [None]:
'''

Training model with two-tower architecture

'''

# Per-user and per-business scalar bias terms (embedding tables with output size 1).
user_bias = Embedding(unique_user_ids, 1)(user_input_placeholder)
business_bias = Embedding(unique_business_ids, 1)(business_input_placeholder)

# Drop the length-1 axis so each bias is a single value per example.
user_bias = Flatten()(user_bias)
business_bias = Flatten()(business_bias)

# Interaction + Metadata
# Dot product of the two embeddings plus both biases -> one score per example.
dot_product = Dot(axes=1)([user_embedding, business_embedding])
interaction = Add()([dot_product, user_bias, business_bias])


# Output layer for predicting rating
# NOTE(review): `interaction` is a single scalar per example, so the dense
# stack below only re-maps that one score -- confirm this is intended rather
# than feeding the embeddings themselves into the MLP.
x = Dense(128, activation='relu')(interaction)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(32, activation='relu')(x)
# Linear output: the target is the min-max normalized rating, but predictions
# are not clamped to [0, 1].
output = Dense(1, activation='linear')(x)

# Model
model = Model(inputs=[user_input_placeholder,
                      business_input_placeholder], outputs=output)

# Compile the model
# Regression objective: mean squared error, optimized with Adam.
model.compile(optimizer='adam', loss='mse')

In [None]:
# Print the layer-by-layer architecture as a sanity check before training.
model.summary()

In [None]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Training arrays: encoded IDs are the model inputs, normalized rating the target.
train_user_ids = train['user_id_encoded'].to_numpy()
train_business_ids = train['business_id_encoded'].to_numpy()
train_normalized_ratings = train['rating_normalized'].to_numpy()


# Held-out arrays, plus the raw ratings for reporting error on the original scale.
test_user_ids = test['user_id_encoded'].to_numpy()
test_business_ids = test['business_id_encoded'].to_numpy()
test_normalized_ratings = test['rating_normalized'].to_numpy()
test_ratings = test['rating'].to_numpy()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

In [None]:
# Early stopping to prevent overfitting
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,              # Stop after 3 epochs without improvement
    restore_best_weights=True,
    verbose=1
)

# Reduce learning rate when validation loss plateaus
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,              # Reduce by half
    patience=2,              # Wait 2 epochs
    min_lr=1e-6,             # Don't go below this
    verbose=1
)

# Keep the best model (lowest val_loss) on Drive as a full model file.
checkpoint = ModelCheckpoint(
    # The directory was misspelled as 'CMPE256_Projectt'; the dataset path and
    # the model-loading path both live under 'CMPE256_Project'.
    filepath='/content/drive/MyDrive/CMPE256/CMPE256_Project/Models/NCF.keras',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=False,
    verbose=1
)

In [None]:
# Report which device TensorFlow will use: the GPU name if one is visible,
# otherwise "CPU".
print(
    'Default GPU Device: {}'.format(tf.test.gpu_device_name())
    if tf.test.gpu_device_name()
    else "CPU"
)

CPU


In [None]:
# Train the model
# NOTE(review): the held-out test split doubles as validation data here, so
# the callbacks select the model on the same rows that are later reported as
# test results -- consider carving a separate validation split from `train`.
with tf.device('/device:GPU:0'):
  history = model.fit(
      [train_user_ids,
       train_business_ids],
      train_normalized_ratings,
      validation_data=(
          [test_user_ids,
           test_business_ids],
          test_normalized_ratings
      ),
      epochs=20,
      batch_size=512,
      # early_stop was configured but never passed in, so training could
      # neither stop early nor restore the best weights.
      callbacks=[early_stop, reduce_lr, checkpoint]
  )

Epoch 1/20
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - loss: 0.2788
Epoch 1: val_loss improved from inf to 0.04444, saving model to /content/drive/MyDrive/CMPE256/CMPE256_Project/NCF.keras
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 58ms/step - loss: 0.2776 - val_loss: 0.0444 - learning_rate: 0.0010
Epoch 2/20
[1m135/136[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 39ms/step - loss: 0.0513
Epoch 2: val_loss improved from 0.04444 to 0.04369, saving model to /content/drive/MyDrive/CMPE256/CMPE256_Project/NCF.keras
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 43ms/step - loss: 0.0513 - val_loss: 0.0437 - learning_rate: 0.0010
Epoch 3/20
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - loss: 0.0447
Epoch 3: val_loss did not improve from 0.04369
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 50ms/step - loss: 0.0447 - val_loss: 0.0443 - learning_rate: 0.001

In [None]:
# Save the final model to the project's Models folder on Drive.
# The directory was misspelled as 'CMPE256_Projectt', which did not match the
# 'CMPE256_Project' path the model is loaded from afterwards.
path = '/content/drive/MyDrive/CMPE256/CMPE256_Project/Models/final_NCF.keras'
model.save(path)

In [None]:
# Load model
# Rebinds `model` to the copy stored on Drive; `path` must point to the file
# the model was saved to.
from tensorflow import keras
path = '/content/drive/MyDrive/CMPE256/CMPE256_Project/Models/final_NCF.keras'
model = keras.models.load_model(path)

In [None]:
# Evaluate the model on the test data (loss is MSE on the normalized ratings)
loss = model.evaluate([test_user_ids, test_business_ids], test_normalized_ratings, verbose=0)
print(f"Test Loss: {loss}")

# Predict ratings for the test data
predicted_normalized_ratings = model.predict([test_user_ids, test_business_ids])

# Denormalize the predicted ratings back to the original rating scale
predicted_ratings = predicted_normalized_ratings * (max_rating - min_rating) + min_rating

# Calculate RMSE
# predict() returns shape (n, 1) while test_ratings is (n,). Subtracting them
# directly broadcasts to an (n, n) matrix, which averages the error over every
# prediction/label pair (and allocates n*n floats). Flatten the predictions so
# the difference is element-wise.
rmse = np.sqrt(np.mean((np.ravel(predicted_ratings) - test_ratings)**2))
print(f"RMSE: {rmse}")

Test Loss: 0.04999932274222374
[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSE: 0.9646617121289273
