In [1]:
import random
import numpy as np
import openai
import pandas as pd
import os
import sys
import time
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import openai
# Add the path to the constants file to the system path
sys.path.append('../../')
from constants import *
from evaluation_utils import *
from path_utils import *
from ChatCompletion_OpenAI_API import *
from CF_utils import *
from MF_utils import *

# OpenAI API Key
# NOTE(review): OPENAI_API_KEY is not defined in this cell; presumably it is
# provided by one of the star imports above (likely `constants`) — verify.
openai.api_key = OPENAI_API_KEY

# source code folder path
# NOTE(review): get_rec_sys_directory presumably comes from `path_utils` — verify.
rec_sys_dir = get_rec_sys_directory()
print(f"Rec-sys directory: {rec_sys_dir}")

# data folder path
DATA_DIR = os.path.join(rec_sys_dir, 'data')
print(f"Data directory: {DATA_DIR}")

# data path
data_path = os.path.join(DATA_DIR, 'amazon-beauty/large_merged_data.csv')
print(f'Data path: {data_path}')

# output
# CSV written by the first prediction pass (random split) and read back by the evaluation cell.

CF_OUTPUT_PATH = os.path.join(DATA_DIR, 'amazon-beauty/output/large_CF_fewshot_output_path_ratings_per_user.csv')
print(f'Data path: {CF_OUTPUT_PATH}')

# Path reserved for re-run predictions; not referenced by the cells visible here.
CF_RERUN_PATH = os.path.join(DATA_DIR, 'amazon-beauty/output/rerun_large_CF_fewshot_output_path_ratings_per_user.csv')
print(f'Data path: {CF_RERUN_PATH}')


# Constants for column names
USER_COLUMN_NAME = 'reviewerID'
TITLE_COLUMN_NAME = 'title'
ITEM_ID_COLUMN = 'asin'
RATING_COLUMN_NAME = 'rating'
TIME_STAMP_COLUMN_NAME = 'unixReviewTime'

# The three values below are forwarded as keyword arguments to the
# predict_ratings_with_CF_item_PCC_and_save* helpers in later cells.
# num_ratings_per_user
NUM_RATINGS_PER_USER = 1
# num_main_user_ratings
NUM_MAIN_USER_RATINGS = 4
# num_similar_users
NUM_SIMILAR_USERS = 4

# NOTE(review): AMAZON_CONTENT_SYSTEM is presumably defined in `constants` — verify.
SYSTEM_CONTENT = AMAZON_CONTENT_SYSTEM




: 

In [None]:
import numpy as np
from keras.models import Model
from keras.layers import Embedding, Input, Flatten, Multiply, Concatenate, Dense
from keras.initializers import RandomNormal
from keras.regularizers import l2

def get_NCF_model(num_users, num_items, mf_dim=10, layers=[10], reg_layers=[0], reg_mf=0):
    """Build the two-branch NCF network (element-wise MF product + MLP tower).

    Args:
        num_users: size of the user vocabulary (embedding `input_dim`).
        num_items: size of the item vocabulary (embedding `input_dim`).
        mf_dim: width of the MF-branch embeddings.
        layers: MLP layer widths; `layers[0]` is the width of the concatenated
            user+item MLP embedding, so each side gets `int(layers[0] / 2)`.
        reg_layers: L2 coefficients, one per entry of `layers`.
        reg_mf: L2 coefficient for the MF-branch embeddings.

    Returns:
        An uncompiled Keras `Model` taking `[user_input, item_input]` and
        producing a single sigmoid output named "prediction".
    """
    assert len(layers) == len(reg_layers)

    def _embedding(vocab_size, width, layer_name, l2_strength):
        # All four embedding tables share initializer and input length.
        return Embedding(input_dim=vocab_size, output_dim=width, name=layer_name,
                         embeddings_initializer=RandomNormal(),
                         embeddings_regularizer=l2(l2_strength), input_length=1)

    # One integer id per example for each side.
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    # Embedding tables are created in the same order as before so any
    # automatically generated layer names stay the same.
    mlp_half_width = int(layers[0] / 2)
    mf_user_table = _embedding(num_users, mf_dim, 'mf_embedding_user', reg_mf)
    mf_item_table = _embedding(num_items, mf_dim, 'mf_embedding_item', reg_mf)
    mlp_user_table = _embedding(num_users, mlp_half_width, "mlp_embedding_user", reg_layers[0])
    mlp_item_table = _embedding(num_items, mlp_half_width, 'mlp_embedding_item', reg_layers[0])

    # MF branch: element-wise product of the two latent vectors.
    mf_user_vec = Flatten()(mf_user_table(user_input))
    mf_item_vec = Flatten()(mf_item_table(item_input))
    mf_branch = Multiply()([mf_user_vec, mf_item_vec])

    # MLP branch: concatenate, then pass through the hidden layers
    # described by layers[1:] / reg_layers[1:].
    mlp_user_vec = Flatten()(mlp_user_table(user_input))
    mlp_item_vec = Flatten()(mlp_item_table(item_input))
    mlp_branch = Concatenate()([mlp_user_vec, mlp_item_vec])
    for idx, (width, l2_strength) in enumerate(zip(layers[1:], reg_layers[1:]), start=1):
        hidden = Dense(width, activation='relu', name="layer%d" % idx,
                       kernel_regularizer=l2(l2_strength))
        mlp_branch = hidden(mlp_branch)

    # Fuse both branches and map to a single probability.
    fused = Concatenate()([mf_branch, mlp_branch])
    prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform',
                       name="prediction")(fused)

    return Model(inputs=[user_input, item_input], outputs=prediction)




: 

In [None]:
# Output locations for the timestamp-split (sequential) PCC run and its re-run.
# Hidden-state hazard: both names are assigned again in the later "Sequential"
# MF cell, so their value depends on which cell ran last.
CF_OUTPUT_TIMESTAMP_PATH = os.path.join(DATA_DIR, 'amazon-beauty/output/split_timestamp/timestamp_large_CF_fewshot_output_path_ratings_per_user.csv')
print(f'Data path: {CF_OUTPUT_TIMESTAMP_PATH}')

CF_RERUN_TIMESTAMP_PATH = os.path.join(DATA_DIR, 'amazon-beauty/output/split_timestamp/rerun_timestamp_large_CF_fewshot_output_path_ratings_per_user.csv')
print(f'Data path: {CF_RERUN_TIMESTAMP_PATH}')

In [None]:
# Load the merged ratings table; `data` is the frame every later cell reads
# (and a later training cell adds mapped-id columns to it in place).
data = pd.read_csv(data_path)
data.head(3)

In [None]:
# Distinct user and item counts; these size the MF / NCF embedding tables below.
num_users = data[USER_COLUMN_NAME].nunique()
num_items = data[ITEM_ID_COLUMN].nunique()

print(f"Number of users: {num_users}")
print(f"Number of items: {num_items}")

In [None]:
# Create User-Item Interaction Matrix
# Rows are users, columns are items, cells are ratings; missing pairs become 0.
# pandas' default aggfunc is the mean, so duplicate (user, item) rows are averaged.
# NOTE(review): pivot_table orders rows/columns by label, which is not
# necessarily the order returned by data[...].unique() used elsewhere in
# this notebook — confirm that index conventions match.
interaction_matrix = pd.pivot_table(data, index=USER_COLUMN_NAME, columns=ITEM_ID_COLUMN, values=RATING_COLUMN_NAME).fillna(0)
csr_interaction_matrix = csr_matrix(interaction_matrix.values)

interaction_matrix

In [None]:
# Display the sparse matrix repr as a quick sanity check.
csr_interaction_matrix

In [None]:
# %%
# Compute the user-user Pearson Correlation Coefficient Matrix
# NOTE(review): pearson_correlation is not defined in this notebook; presumably
# it comes from the `CF_utils` star import — verify.
user_pcc_matrix = pearson_correlation(csr_interaction_matrix)
print(f'User PCC Matrix:\n{user_pcc_matrix}\n')



In [None]:
# Compute the item-item Pearson Correlation Coefficient Matrix
# Assuming the function 'item_pearson_correlation' takes a dense matrix as input.
# If it still takes a csr_matrix, then convert it inside the function.
# The full matrix is densified here, which can be memory-heavy for large data.
dense_interaction_matrix = csr_interaction_matrix.toarray()

# Transposed so that each row handed to the helper corresponds to one item.
item_pcc_matrix = item_pearson_correlation(dense_interaction_matrix.T)
print(f'Item PCC Matrix:\n{item_pcc_matrix}\n')


# Split Random

In [None]:
%%time

# First-pass predictions on a random hold-out, using the PCC similarity matrices.
# NOTE(review): the helper and RANDOM_STATE are not defined in this notebook;
# presumably they come from the star imports — verify. The helper's
# "movie_*" parameter names receive the Amazon title / asin columns here.
results_df = predict_ratings_with_CF_item_PCC_and_save(
    data=data, 
    user_pcc_matrix=user_pcc_matrix, 
    item_pcc_matrix=item_pcc_matrix,
    user_column_name=USER_COLUMN_NAME, 
    movie_column_name=TITLE_COLUMN_NAME, 
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME, 
    num_ratings_per_user=NUM_RATINGS_PER_USER, 
    num_similar_users=NUM_SIMILAR_USERS,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    test_selection_method='random',
    save_path=CF_OUTPUT_PATH, 
    seed=RANDOM_STATE
)



In [None]:
# Evaluate updated CF model predictions
# Reads the CSV written by the random-split prediction cell.
# NOTE(review): NUM_EXAMPLES is not assigned in any cell visible here;
# presumably it comes from `constants` — verify.
evaluate_model_predictions_rmse_mae(
    data_path=CF_OUTPUT_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)

# Split by Timestamp

In [None]:
%%time

# First-pass predictions for the timestamp-based split, using the PCC matrices.
# Results are written to CF_OUTPUT_TIMESTAMP_PATH, which the next cell reads
# back to look for non-numeric predictions.
results_df = predict_ratings_with_CF_item_PCC_and_save_sequential(
    data=data, 
    user_pcc_matrix=user_pcc_matrix, 
    item_pcc_matrix=item_pcc_matrix,
    user_column_name=USER_COLUMN_NAME, 
    movie_column_name=TITLE_COLUMN_NAME, 
    timestamp_column_name=TIME_STAMP_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME, 
    num_ratings_per_user=NUM_RATINGS_PER_USER, 
    num_similar_users=NUM_SIMILAR_USERS,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    test_selection_method='sequential',
    save_path=CF_OUTPUT_TIMESTAMP_PATH, 
    seed=RANDOM_STATE
)



In [None]:
# Reload the saved predictions and locate rows whose prediction is not numeric,
# so that only those rows are sent back to the model in the re-run cell.
saved_data = pd.read_csv(CF_OUTPUT_TIMESTAMP_PATH)

# Display the original data types
print("Original Data Types:")
print(saved_data.dtypes)
print("\n")

# Attempt to convert ratings to float and add a flag for conversion failure
# errors='coerce' turns unparsable values into NaN, so notna() is True only
# for parsable numbers (a prediction that was already NaN is flagged too).
saved_data['is_rating_float'] = pd.to_numeric(saved_data['predicted_rating'], errors='coerce').notna()

# Filter rows where ratings are not float
non_float_ratings = saved_data[saved_data['is_rating_float'] == False]

# total number of rows with non-float ratings
print(f"Total number of rows with non-float ratings: {len(non_float_ratings)}")

# rerun indices for non-float ratings
# These are row labels of the freshly read CSV; the re-run function looks up
# the same labels after reading the same file again.
rerun_indices = non_float_ratings.index.tolist()
print(f"Rerun indices: {rerun_indices}")

# Display rows with non-float ratings
print("Rows with non-float ratings:")
non_float_ratings.head(3)


In [None]:

def rerun_failed_CF_item_PCC_predictions_sequential(data, user_pcc_matrix, item_pcc_matrix,
                                         save_path, user_column_name, movie_column_name,
                                         movie_id_column, rating_column_name,
                                         num_ratings_per_user, num_main_user_ratings, num_similar_users,
                                         new_path, rerun_indices, seed=RANDOM_STATE,
                                         system_content=AMAZON_CONTENT_SYSTEM):
    """Re-query the chat model for selected rows of a saved prediction file.

    The CSV at `save_path` is loaded, the rows listed in `rerun_indices` get a
    fresh prediction, and the whole table is written to `new_path`.

    Args:
        data: ratings DataFrame holding the user, item-id, title and rating columns.
        user_pcc_matrix: user-user similarity matrix, indexed positionally.
        item_pcc_matrix: item-item similarity matrix; kept for interface
            compatibility, it is not consulted when building the prompt.
        save_path: CSV produced by the first prediction pass.
        user_column_name, movie_column_name, movie_id_column, rating_column_name:
            column names in `data`.
        num_ratings_per_user: unused; kept for interface compatibility.
        num_main_user_ratings: maximum number of the main user's own ratings
            placed in the prompt.
        num_similar_users: number of most-similar users whose ratings are added.
        new_path: destination CSV for the updated table.
        rerun_indices: row labels (of the CSV at `save_path`) to re-run.
        seed: seed for `random` and for the pandas sample of the user's history.
        system_content: system prompt forwarded to the chat-completion helper.

    Returns:
        None. The updated table is written to `new_path`.
    """
    # Load the original predictions; the file is expected to have exactly these five columns.
    original_data = pd.read_csv(save_path)
    original_data.columns = ['user_id', 'item_id', 'title', 'actual_rating', 'predicted_rating']

    # Re-seed for reproducibility
    random.seed(seed)

    # Map ids to positional indices in order of first appearance in `data`.
    # NOTE(review): this assumes the similarity matrices use the same ordering.
    # Matrices built from a pivot table are ordered by label instead — confirm
    # against the caller before trusting the neighbour lookup.
    unique_users = data[user_column_name].unique()
    unique_items = data[movie_id_column].unique()
    user_id_to_index = {user_id: idx for idx, user_id in enumerate(unique_users)}
    item_id_to_index = {item_id: idx for idx, item_id in enumerate(unique_items)}

    for index in rerun_indices:
        user_id = original_data.at[index, 'user_id']
        item_id = original_data.at[index, 'item_id']
        user_idx = user_id_to_index.get(user_id)
        item_idx = item_id_to_index.get(item_id)

        if user_idx is None or item_idx is None:
            print(f"User ID: {user_id} or Item ID: {item_id} not found in index. Skipping.")
            continue

        print(f"Rerunning prediction for User ID: {user_id}, Item ID: {item_id} (Index: {index})")

        # Retrieve user's and item's data
        user_data = data[data[user_column_name] == user_id]
        item_data = data[data[movie_id_column] == item_id]

        if item_data.empty:
            print(f"Item data for ID: {item_id} not found. Skipping.")
            continue

        # Never show the model the rating it is asked to predict: drop the
        # target item from the user's history before sampling.
        user_history = user_data[user_data[movie_id_column] != item_id]

        # Sample user's historical ratings
        if len(user_history) < num_main_user_ratings:
            main_user_ratings = user_history
        else:
            main_user_ratings = user_history.sample(n=num_main_user_ratings, random_state=seed)

        # Construct the context from the user's ratings
        main_user_ratings_str = '\n'.join([
            f"* Title: {row[movie_column_name]}, Rating: {row[rating_column_name]} stars"
            for _, row in main_user_ratings.iterrows()
        ])

        # Take one extra neighbour so that removing the user themself still
        # leaves num_similar_users candidates.
        similar_users_idx = np.argsort(-user_pcc_matrix[user_idx])[:num_similar_users + 1]
        similar_users_idx = similar_users_idx[similar_users_idx != user_idx][:num_similar_users]

        # Compile every rating of each similar user (one line per rating,
        # each terminated by a newline).
        similar_user_lines = []
        for idx in similar_users_idx:
            similar_user_id = unique_users[idx]
            similar_user_data = data[data[user_column_name] == similar_user_id]
            for _, row in similar_user_data.iterrows():
                similar_user_lines.append(
                    f"* Title: {row[movie_column_name]}, Rating: {row[rating_column_name]} stars\n"
                )
        similar_users_ratings = "".join(similar_user_lines)

        # Predict the rating
        combined_text = f"Title: {item_data.iloc[0][movie_column_name]}"
        predicted_rating = predict_rating_combined_ChatCompletion(
            combined_text, approach="CF", similar_users_ratings=similar_users_ratings,
            rating_history=main_user_ratings_str, system_content=system_content
        )

        # Update the original data with the new prediction
        original_data.at[index, 'predicted_rating'] = predicted_rating
        print(f"Updated prediction for User ID: {user_id}, Item ID: {item_id}: {predicted_rating}")

    # Save the updated predictions to a new file
    original_data.to_csv(new_path, index=False)
    print(f"Updated predictions saved to {new_path}")

In [None]:
%%time

# Re-run only the rows flagged as non-numeric (rerun_indices from the cell
# above) and write the patched table to CF_RERUN_TIMESTAMP_PATH.
# Depends on CF_OUTPUT_TIMESTAMP_PATH still pointing at the PCC output file;
# a later cell rebinds that name.
rerun_failed_CF_item_PCC_predictions_sequential(data, 
                                    user_pcc_matrix=user_pcc_matrix,  
                                    item_pcc_matrix=item_pcc_matrix,
                                    save_path=CF_OUTPUT_TIMESTAMP_PATH,
                                    user_column_name=USER_COLUMN_NAME,
                                    movie_column_name=TITLE_COLUMN_NAME,
                                    movie_id_column=ITEM_ID_COLUMN,
                                    rating_column_name=RATING_COLUMN_NAME, 
                                    num_ratings_per_user=NUM_RATINGS_PER_USER, 
                                    num_similar_users=NUM_SIMILAR_USERS,
                                    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
                                    new_path=CF_RERUN_TIMESTAMP_PATH,
                                    rerun_indices=rerun_indices
                                    )


In [None]:
# Evaluate updated CF model predictions
# Scores the patched file written by the re-run cell (timestamp split).
evaluate_model_predictions_rmse_mae(
    data_path=CF_RERUN_TIMESTAMP_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)

# CF using Matrix Factorization


+ Preparing the user-item interaction matrix.
+ Performing matrix factorization to obtain latent factors for users and items.
+ Calculating similarities between users or items using the latent factors.
+ Selecting similar users or items based on these similarities.
+ Using the information from similar users or items to predict ratings for a given user-item pair.
+ Feeding these predictions into the OpenAI ChatCompletion API as part of a collaborative filtering approach.

In [None]:
# coding=utf-8
# Copyright 2024 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""Evaluation of matrix factorization following the protocol of the NCF paper.

Details:
 - Model: Matrix factorization with biases:
     y(u,i) = b + v_{u,1}+v_{i,1}+\sum_{f=2}^d v_{u,f}*v_{i,f}
 - Loss: logistic loss
 - Optimization algorithm: stochastic gradient descent
 - Negatives sampling: Random negatives are added during training
 - Optimization objective (similar to NCF paper)
     argmin_V \sum_{(u,i) \in S} [
          ln(1+exp(-y(u,i)))
        + #neg/|I| * \sum_{j \in I} ln(1+exp(y(u,j)))
        + reg * ||V||_2^2 ]
 - Evaluation follows the protocol from:
   He, X., Liao, L., Zhang, H., Nie, L., Hu, X., and Chua, T.-S.: Neural
   collaborative filtering. WWW 2017
"""

import argparse
# Dataset and evaluation protocols reused from
# https://github.com/hexiangnan/neural_collaborative_filtering
import numpy as np


class MFModel(object):
  """Biased matrix factorization fitted with per-example SGD and sampled negatives."""

  def __init__(self, num_user, num_item, embedding_dim, reg, stddev):
    """Creates randomly initialised embeddings and zero biases.

    Args:
      num_user: number of rows in the user embedding table.
      num_item: number of rows in the item embedding table.
      embedding_dim: width of every embedding vector.
      reg: L2 regularisation coefficient applied in `fit`.
      stddev: standard deviation of the zero-mean normal used for
        initialisation.
    """
    # The user table is drawn before the item table; the order determines
    # which random numbers each table receives.
    user_shape = (num_user, embedding_dim)
    item_shape = (num_item, embedding_dim)
    self.user_embedding = np.random.normal(0, stddev, user_shape)
    self.item_embedding = np.random.normal(0, stddev, item_shape)
    self.user_bias = np.zeros([num_user])
    self.item_bias = np.zeros([num_item])
    self.bias = 0.0
    self.reg = reg

  def _predict_one(self, user, item):
    """Returns global bias + user bias + item bias + <user vec, item vec>."""
    interaction = np.dot(self.user_embedding[user], self.item_embedding[item])
    return self.bias + self.user_bias[user] + self.item_bias[item] + interaction

  def predict(self, pairs, batch_size, verbose):
    """Scores a batch of user-item pairs.

    Args:
      pairs: (users, items), two equally long sequences.
      batch_size: ignored.
      verbose: ignored.

    Returns:
      A 1-D array whose k-th entry is the score of (users[k], items[k]).
    """
    del batch_size, verbose
    users = pairs[0]
    items = pairs[1]
    count = len(users)
    assert count == len(items)
    scores = np.empty(count)
    for position in range(count):
      scores[position] = self._predict_one(users[position], items[position])
    return scores

  def fit(self, positive_pairs, learning_rate, num_negatives):
    """Runs one epoch of SGD over the positives plus freshly sampled negatives.

    Args:
      positive_pairs: integer array of shape [n, 2] holding (user, item) rows.
      learning_rate: SGD step size.
      num_negatives: negatives sampled per positive pair.

    Returns:
      Mean logistic loss over all visited examples.
    """
    examples = self._convert_ratings_to_implicit_data(
        positive_pairs, num_negatives)
    np.random.shuffle(examples)

    step = learning_rate
    decay = self.reg
    example_count = examples.shape[0]
    total_loss = 0.0
    for user, item, label in examples:
      # Row views, not copies: after the user row is updated below, the item
      # update reads the already-updated user vector.
      user_vec = self.user_embedding[user]
      item_vec = self.item_embedding[item]
      score = self._predict_one(user, item)

      # Two algebraically equal forms of sigmoid / log-loss, chosen so that
      # exp() is only ever evaluated on a non-positive argument.
      if not score > 0:
        exp_score = np.exp(score)
        prob = exp_score / (1.0 + exp_score)
        example_loss = -label * score + np.log(1.0 + exp_score)
      else:
        denom = 1.0 + np.exp(-score)
        prob = 1.0 / denom
        example_loss = np.log(denom) + (1.0 - label) * score

      residual = label - prob

      self.user_embedding[user, :] += step * (residual * item_vec - decay * user_vec)
      self.item_embedding[item, :] += step * (residual * user_vec - decay * item_vec)
      self.user_bias[user] += step * (residual - decay * self.user_bias[user])
      self.item_bias[item] += step * (residual - decay * self.item_bias[item])
      self.bias += step * (residual - decay * self.bias)

      total_loss += example_loss

    return total_loss / example_count

  def _convert_ratings_to_implicit_data(self, positive_pairs, num_negatives):
    """Expands positive pairs into labelled (user, item, label) rows.

    Args:
      positive_pairs: integer array of shape [n, 2] holding (user, item) rows.
      num_negatives: negatives sampled per positive pair.

    Returns:
      An int32 array of shape [n * (1 + num_negatives), 3]. For each positive
      pair it holds one row (user, item, 1) followed by num_negatives rows
      (user, j, 0) with j drawn uniformly from all items.
    """
    num_items = self.item_embedding.shape[0]
    num_pos_examples = positive_pairs.shape[0]
    rows_per_positive = 1 + num_negatives
    training_matrix = np.empty([num_pos_examples * rows_per_positive, 3],
                               dtype=np.int32)
    for pos_index in range(num_pos_examples):
      u = positive_pairs[pos_index, 0]
      i = positive_pairs[pos_index, 1]
      base = pos_index * rows_per_positive

      # The observed interaction is the positive example.
      training_matrix[base] = [u, i, 1]

      # Negatives are sampled without checking against the positives and
      # without de-duplication; an item may be drawn more than once.
      for offset in range(1, rows_per_positive):
        j = np.random.randint(num_items)
        training_matrix[base + offset] = [u, j, 0]
    return training_matrix


def evaluate(model, test_ratings, test_negatives, K=10):
  """Runs `evaluate_model` single-threaded and averages its two per-example lists.

  Returns:
    (mean hit value, mean NDCG value) as computed from the lists returned by
    `evaluate_model` at cut-off K.
  """
  # NOTE(review): evaluate_model is not defined or imported in the visible
  # cells; presumably it comes from the NCF evaluation code — verify.
  per_example = evaluate_model(model, test_ratings, test_negatives, K=K,
                               num_thread=1)
  hit_list, ndcg_list = per_example
  mean_hit = np.array(hit_list).mean()
  mean_ndcg = np.array(ndcg_list).mean()
  return mean_hit, mean_ndcg




In [None]:
# initialize the model
# Hidden-state hazard: the name `model` is bound to an MFModel here and is
# rebound to a Keras model in the later NCF section.
embedding_dim = 20  # or any other number of your choice
reg = 0.0  # Regularization coefficient
stddev = 0.1  # Standard deviation for initialization

# Embeddings are drawn from NumPy's global RNG; no seed is set in this cell.
model = MFModel(num_users, num_items, embedding_dim, reg, stddev)


In [None]:
%%time
# train the model

# Create mappings for user and item IDs
# Indices follow order of first appearance in `data` (unique() order), so the
# rows of the learned embedding tables follow that order as well.
user_ids = data[USER_COLUMN_NAME].unique()
user_id_mapping = {id: index for index, id in enumerate(user_ids)}
item_ids = data[ITEM_ID_COLUMN].unique()
item_id_mapping = {id: index for index, id in enumerate(item_ids)}

# Apply mappings to the data
# This adds two columns to `data` in place; later cells see the widened frame.
data['mapped_user_id'] = data[USER_COLUMN_NAME].map(user_id_mapping)
data['mapped_item_id'] = data[ITEM_ID_COLUMN].map(item_id_mapping)

# Prepare positive_pairs with mapped IDs
positive_pairs = data[['mapped_user_id', 'mapped_item_id']].values

# proceed with training
# A single call to fit() is one epoch and returns the mean logistic loss.
# NOTE(review): `learning_rate` is first assigned in a later cell (the NCF
# compile cell) and `num_negatives` is not assigned in any visible cell, so
# this line raises NameError on a fresh Restart & Run All — define both
# before this cell.
model.fit(positive_pairs, learning_rate, num_negatives)



In [None]:
# Similarity Scores for Users
def calculate_MF_similarity_user(user_factors):
    """
    Calculate the cosine similarity between users based on their latent factors from MF.

    Rows with zero norm are left as zero vectors instead of being divided by
    zero, so their similarity to every user (including themselves) is 0
    rather than NaN.

    Args:
        user_factors (numpy.ndarray): The matrix of user latent factors, one row per user.

    Returns:
        user_similarity_matrix (numpy.ndarray): A matrix of user-user similarity scores.
    """
    # Row-wise L2 norms, kept as a column so they broadcast over each row
    norms = np.linalg.norm(user_factors, axis=1, keepdims=True)
    # Replace zero norms by 1 so all-zero rows stay zero instead of becoming NaN
    safe_norms = np.where(norms == 0, 1.0, norms)
    normalized_user_factors = user_factors / safe_norms
    # Dot products of unit vectors are cosine similarities
    user_similarity_matrix = np.dot(normalized_user_factors, normalized_user_factors.T)
    return user_similarity_matrix


# Similarity Scores for Items
def calculate_MF_similarity_item(item_factors):
    """
    Calculate the cosine similarity between items based on their latent factors from MF.

    Rows with zero norm are left as zero vectors instead of being divided by
    zero, so their similarity to every item (including themselves) is 0
    rather than NaN.

    Args:
        item_factors (numpy.ndarray): The matrix of item latent factors, one row per item.

    Returns:
        item_similarity_matrix (numpy.ndarray): A matrix of item-item similarity scores.
    """
    # Row-wise L2 norms, kept as a column so they broadcast over each row
    norms = np.linalg.norm(item_factors, axis=1, keepdims=True)
    # Replace zero norms by 1 so all-zero rows stay zero instead of becoming NaN
    safe_norms = np.where(norms == 0, 1.0, norms)
    normalized_item_factors = item_factors / safe_norms
    # Dot products of unit vectors are cosine similarities
    item_similarity_matrix = np.dot(normalized_item_factors, normalized_item_factors.T)
    return item_similarity_matrix


In [None]:
# Pull the learned embedding tables out of the MF model. This requires
# `model` to still be the MFModel instance, not the Keras model a later cell
# binds to the same name.
user_factors = model.user_embedding
item_factors = model.item_embedding

# Cosine similarity between embedding rows; the bias terms are not used.
user_similarity_matrix = calculate_MF_similarity_user(user_factors)
item_similarity_matrix = calculate_MF_similarity_item(item_factors)

# Display shapes to verify outputs
(user_factors.shape, item_factors.shape, user_similarity_matrix.shape, item_similarity_matrix.shape)

# Sequential

In [None]:
# Output locations for the MF-similarity run on the timestamp split.
# Hidden-state hazard: this rebinds CF_OUTPUT_TIMESTAMP_PATH and
# CF_RERUN_TIMESTAMP_PATH, which an earlier cell pointed at the PCC output
# files; cells above that read these names will see the new values if re-run.
CF_OUTPUT_TIMESTAMP_PATH = os.path.join(DATA_DIR, 'amazon-beauty/output/split_timestamp/sequential_MF_CF_fewshot_.csv')
print(f'Data path: {CF_OUTPUT_TIMESTAMP_PATH}')

CF_RERUN_TIMESTAMP_PATH = os.path.join(DATA_DIR, 'amazon-beauty/output/split_timestamp/rerun_sequential_MF_CF_fewshot.csv')
print(f'Data path: {CF_RERUN_TIMESTAMP_PATH}')

In [None]:
%%time

# Same helper as the PCC run, but the MF cosine-similarity matrices are passed
# through the *_pcc_matrix parameters.
# By this point `data` also carries the mapped_user_id / mapped_item_id
# columns added by the MF training cell.
results_df = predict_ratings_with_CF_item_PCC_and_save_sequential(
    data=data, 
    user_pcc_matrix=user_similarity_matrix, 
    item_pcc_matrix=item_similarity_matrix,
    user_column_name=USER_COLUMN_NAME, 
    movie_column_name=TITLE_COLUMN_NAME, 
    timestamp_column_name=TIME_STAMP_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME, 
    num_ratings_per_user=NUM_RATINGS_PER_USER, 
    num_similar_users=NUM_SIMILAR_USERS,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    test_selection_method='sequential',
    save_path=CF_OUTPUT_TIMESTAMP_PATH, 
    seed=RANDOM_STATE
)



In [None]:
# Evaluate updated CF model predictions
# Scores the MF-similarity predictions for the timestamp split (the file
# written by the cell above).
evaluate_model_predictions_rmse_mae(
    data_path=CF_OUTPUT_TIMESTAMP_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)


# Random Split

In [None]:
# Output locations for the MF-similarity run on the random split.
# Each print shows the path that was just built (previously the
# timestamp-split variables were printed by mistake).
CF_OUTPUT_RANDOM_MF_PATH = os.path.join(DATA_DIR, 'amazon-beauty/output/split_random/random_MF_CF_fewshot_.csv')
print(f'Data path: {CF_OUTPUT_RANDOM_MF_PATH}')

CF_RERUN_RANDOM_MF_PATH = os.path.join(DATA_DIR, 'amazon-beauty/output/split_random/rerun_random_MF_CF_fewshot.csv')
print(f'Data path: {CF_RERUN_RANDOM_MF_PATH}')

In [None]:
%%time

# MF-similarity predictions for the random split.
# NOTE(review): this calls the *_sequential helper with
# test_selection_method='random', whereas the PCC random-split cell uses
# predict_ratings_with_CF_item_PCC_and_save. Confirm that the sequential
# helper honours 'random', otherwise this run is not a random split.
results_df = predict_ratings_with_CF_item_PCC_and_save_sequential(
    data=data, 
    user_pcc_matrix=user_similarity_matrix, 
    item_pcc_matrix=item_similarity_matrix,
    user_column_name=USER_COLUMN_NAME, 
    movie_column_name=TITLE_COLUMN_NAME, 
    timestamp_column_name=TIME_STAMP_COLUMN_NAME,
    movie_id_column=ITEM_ID_COLUMN,
    rating_column_name=RATING_COLUMN_NAME, 
    num_ratings_per_user=NUM_RATINGS_PER_USER, 
    num_similar_users=NUM_SIMILAR_USERS,
    num_main_user_ratings=NUM_MAIN_USER_RATINGS,
    test_selection_method='random',
    save_path=CF_OUTPUT_RANDOM_MF_PATH, 
    seed=RANDOM_STATE
)



In [None]:
# Evaluate updated CF model predictions
# Scores the MF-similarity predictions for the random split.
evaluate_model_predictions_rmse_mae(
    data_path=CF_OUTPUT_RANDOM_MF_PATH,
    num_examples=NUM_EXAMPLES,
    actual_ratings_column='actual_rating',
    predicted_ratings_column='predicted_rating'
)

In [None]:
# 

# Neural Collaborative Filtering (NCF) 

NCF combines Matrix Factorization (MF) and Multi-Layer Perceptron (MLP) to learn user-item interaction patterns. 

In [None]:
import numpy as np

# import theano
# import theano.tensor as T
import keras
from keras import backend as K
from keras import initializations
from keras.regularizers import l1, l2, l1l2
from keras.models import Sequential, Model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape, Merge, Flatten, Dropout
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from time import time
import sys
import argparse


def get_NCF_model(num_users, num_items, mf_dim=10, layers=[10], reg_layers=[0], reg_mf=0):
    """Build the NCF network: an element-wise MF branch fused with an MLP branch.

    This redefines (and therefore shadows) the `get_NCF_model` declared in an
    earlier cell of this notebook. It is written against the same Keras 2+
    API as that earlier definition; the previous body relied on Keras 1
    names (`merge`, `init=`, `W_regularizer=`, `Model(input=...)`) and on
    Python 2's `xrange`.

    Args:
        num_users: size of the user vocabulary (embedding `input_dim`).
        num_items: size of the item vocabulary (embedding `input_dim`).
        mf_dim: width of the MF-branch embeddings.
        layers: MLP layer widths; `layers[0]` is the width of the concatenated
            user+item MLP embedding, so each side gets `layers[0] // 2`.
        reg_layers: L2 coefficients, one per entry of `layers`.
        reg_mf: L2 coefficient for the MF-branch embeddings.

    Returns:
        An uncompiled Keras `Model` taking `[user_input, item_input]` and
        producing a single sigmoid output named "prediction".
    """
    # Local imports keep this function usable even though the cell-level
    # imports above it target the legacy Keras 1 module layout.
    from keras.initializers import RandomNormal
    from keras.layers import Concatenate, Dense, Embedding, Flatten, Input, Multiply
    from keras.models import Model
    from keras.regularizers import l2

    assert len(layers) == len(reg_layers)
    num_layer = len(layers)  # Number of layers in the MLP

    # Input variables
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    # Embedding layers
    # NOTE(review): the legacy code referenced an `init_normal` initializer
    # that is not defined anywhere in this notebook; RandomNormal() is used
    # here to match the earlier get_NCF_model definition — confirm the
    # intended standard deviation.
    # Integer division: Embedding needs an integer output_dim.
    mlp_half_dim = layers[0] // 2
    MF_Embedding_User = Embedding(input_dim=num_users, output_dim=mf_dim, name='mf_embedding_user',
                                  embeddings_initializer=RandomNormal(), embeddings_regularizer=l2(reg_mf))
    MF_Embedding_Item = Embedding(input_dim=num_items, output_dim=mf_dim, name='mf_embedding_item',
                                  embeddings_initializer=RandomNormal(), embeddings_regularizer=l2(reg_mf))

    MLP_Embedding_User = Embedding(input_dim=num_users, output_dim=mlp_half_dim, name="mlp_embedding_user",
                                   embeddings_initializer=RandomNormal(), embeddings_regularizer=l2(reg_layers[0]))
    MLP_Embedding_Item = Embedding(input_dim=num_items, output_dim=mlp_half_dim, name='mlp_embedding_item',
                                   embeddings_initializer=RandomNormal(), embeddings_regularizer=l2(reg_layers[0]))

    # MF part
    mf_user_latent = Flatten()(MF_Embedding_User(user_input))
    mf_item_latent = Flatten()(MF_Embedding_Item(item_input))
    mf_vector = Multiply()([mf_user_latent, mf_item_latent])  # element-wise multiply

    # MLP part
    mlp_user_latent = Flatten()(MLP_Embedding_User(user_input))
    mlp_item_latent = Flatten()(MLP_Embedding_Item(item_input))
    mlp_vector = Concatenate()([mlp_user_latent, mlp_item_latent])
    # layers[0] is consumed by the embeddings, so hidden layers start at index 1.
    for idx in range(1, num_layer):
        layer = Dense(layers[idx], kernel_regularizer=l2(reg_layers[idx]), activation='relu', name="layer%d" % idx)
        mlp_vector = layer(mlp_vector)

    # Concatenate MF and MLP parts
    predict_vector = Concatenate()([mf_vector, mlp_vector])

    # Final prediction layer
    prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name="prediction")(predict_vector)

    model = Model(inputs=[user_input, item_input], outputs=prediction)

    return model

In [None]:

# NCF hyperparameters. layers[0] (64) is split evenly between the user and
# item MLP embeddings; the remaining entries are hidden-layer widths.
mf_dim = 8  # Dimensionality of MF embeddings
layers = [64, 32, 16, 8]  # MLP layers
reg_layers = [0, 0, 0, 0]  # Regularization for MLP layers
reg_mf = 0  # Regularization for MF embeddings

# Hidden-state hazard: this rebinds `model`, which an earlier cell bound to
# the MFModel instance whose embeddings feed the MF similarity matrices.
model = get_NCF_model(num_users, num_items, mf_dim, layers, reg_layers, reg_mf)


In [None]:
# compile the model
learning_rate = 0.001  # Learning rate
# `learning_rate=` is the supported keyword; the short alias `lr=` is
# deprecated in Keras 2 and rejected by newer Keras releases.
optimizer = Adam(learning_rate=learning_rate)
# Binary cross-entropy matches the single sigmoid output of the NCF model.
model.compile(optimizer=optimizer, loss='binary_crossentropy')


In [None]:
# train the model
# NOTE(review): get_train_instances, train, num_negatives, num_epochs and
# batch_size are not defined in any cell visible in this notebook; unless a
# star import provides them, this cell raises NameError on a fresh
# Restart & Run All.
user_input, item_input, labels = get_train_instances(train, num_negatives)
model.fit([np.array(user_input), np.array(item_input)], np.array(labels), epochs=num_epochs, batch_size=batch_size, verbose=1)


# Deep Learning

In [None]:
%conda install tensorflow --quiet
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from tensorflow.keras.optimizers import Adam


def NCF(num_users, num_items, embedding_size, layers=[64, 32, 16, 8]):
    """Build and compile a two-branch NCF-style model (GMF branch + MLP branch).

    Args:
        num_users: size of the user vocabulary (embedding `input_dim`).
        num_items: size of the item vocabulary (embedding `input_dim`).
        embedding_size: width of the GMF-branch embeddings.
        layers: MLP widths; `layers[0]` is the width of the concatenated
            user+item MLP embedding (each side gets `layers[0] // 2`), and
            `layers[1:]` are the hidden Dense widths. A single-element list
            is allowed and yields an MLP branch with no hidden layers.

    Returns:
        A Keras `Model` compiled with Adam and binary cross-entropy, taking
        `[user_input, item_input]` and producing one sigmoid output.
    """
    # GMF part
    user_input = Input(shape=(1,), name='user_input')
    item_input = Input(shape=(1,), name='item_input')

    GMF_user_embedding = Embedding(output_dim=embedding_size, input_dim=num_users, input_length=1, name='gmf_user_embedding')(user_input)
    GMF_item_embedding = Embedding(output_dim=embedding_size, input_dim=num_items, input_length=1, name='gmf_item_embedding')(item_input)

    GMF_user_flatten = Flatten()(GMF_user_embedding)
    GMF_item_flatten = Flatten()(GMF_item_embedding)
    # NOTE(review): Dot(axes=1) reduces the two vectors to a single scalar per
    # example, whereas the variable name (and get_NCF_model in this notebook)
    # suggests an element-wise product — confirm which is intended.
    GMF_multiply = Dot(axes=1)([GMF_user_flatten, GMF_item_flatten])

    # MLP part
    MLP_user_embedding = Embedding(output_dim=layers[0] // 2, input_dim=num_users, input_length=1, name='mlp_user_embedding')(user_input)
    MLP_item_embedding = Embedding(output_dim=layers[0] // 2, input_dim=num_items, input_length=1, name='mlp_item_embedding')(item_input)

    MLP_user_flatten = Flatten()(MLP_user_embedding)
    MLP_item_flatten = Flatten()(MLP_item_embedding)
    MLP_concat = Concatenate()([MLP_user_flatten, MLP_item_flatten])

    # Start from the concatenated embeddings so the tensor is defined even
    # when `layers` has a single entry and the loop body never runs.
    mlp_vector = MLP_concat
    for idx in range(1, len(layers)):
        mlp_vector = Dense(layers[idx], activation='relu', name=f'layer{idx}')(mlp_vector)

    # Concatenate GMF and MLP parts
    concat = Concatenate()([GMF_multiply, mlp_vector])
    output = Dense(1, activation='sigmoid', name='output')(concat)

    model = Model(inputs=[user_input, item_input], outputs=output)
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

    return model


def get_embeddings_deep_learning(user_item_pairs, num_users, num_items, embedding_size=8):
    """Train an NCF model on the given triples and return its GMF embedding outputs.

    Args:
        user_item_pairs: list of (user_index, item_index, label) tuples.
        num_users: number of distinct user indices.
        num_items: number of distinct item indices.
        embedding_size: width of the GMF embeddings.

    Returns:
        (user_embeddings, item_embeddings): the outputs of the
        'gmf_user_embedding' and 'gmf_item_embedding' layers for every index
        in range(num_users) and range(num_items) respectively.
    """
    # Split the triples column-wise.
    user_column, item_column, label_column = [], [], []
    for user, item, flag in user_item_pairs:
        user_column.append(user)
        item_column.append(item)
        label_column.append(flag)
    user_ids = np.array(user_column)
    item_ids = np.array(item_column)
    labels = np.array(label_column)

    ncf_model = NCF(num_users, num_items, embedding_size)

    # Short fixed training schedule: 5 epochs, batches of 64.
    ncf_model.fit([user_ids, item_ids], labels, epochs=5, batch_size=64, verbose=1)

    # Sub-models that stop at the GMF embedding layers, one per input.
    user_tower = Model(inputs=ncf_model.input[0],
                       outputs=ncf_model.get_layer('gmf_user_embedding').output)
    item_tower = Model(inputs=ncf_model.input[1],
                       outputs=ncf_model.get_layer('gmf_item_embedding').output)

    # Look up every index once to materialise the embeddings.
    all_user_indices = np.arange(num_users)
    all_item_indices = np.arange(num_items)
    user_embeddings = user_tower.predict(all_user_indices)
    item_embeddings = item_tower.predict(all_item_indices)

    return user_embeddings, item_embeddings


In [None]:

# Extract user and item IDs
# This rebinds user_ids, item_ids, num_users and num_items, which earlier
# cells also assign.
user_ids = data[USER_COLUMN_NAME].unique()
item_ids = data[ITEM_ID_COLUMN].unique()
num_users = len(user_ids)
num_items = len(item_ids)

# Map user and item IDs to their corresponding matrix indices
user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
item_id_to_index = {item_id: index for index, item_id in enumerate(item_ids)}

# Generate interaction pairs for the get_embeddings_deep_learning function
# NOTE(review): the third tuple element is the raw rating, but
# get_embeddings_deep_learning feeds it as the label of a sigmoid output
# trained with binary cross-entropy. Confirm whether ratings should first be
# converted to a 0/1 interaction flag.
interaction_pairs = []
for _, row in data.iterrows():
    user_index = user_id_to_index[row[USER_COLUMN_NAME]]
    item_index = item_id_to_index[row[ITEM_ID_COLUMN]]
    rating = row[RATING_COLUMN_NAME]
    interaction_pairs.append((user_index, item_index, rating))

user_embeddings, item_embeddings = get_embeddings_deep_learning(interaction_pairs, num_users, num_items)


# Explainable Embeddings