In [1]:
import numpy as np
import openai
import pandas as pd
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error
import sys
import re
import time
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Add the path to the constants file to the system path
sys.path.append('../../')
from constants import RANDOM_STATE, OPENAI_API_KEY

# OpenAI API Key
openai.api_key = OPENAI_API_KEY
# OpenAI GPT Model parameters
GPT_MODEL_NAME = "gpt-3.5-turbo"
TEMPERATURE = 0

# Get the current directory of the notebook
current_dir = os.path.dirname(os.path.abspath("../../data/amazon-beauty/rating_prediction.ipynb"))
print(f"current directory: {current_dir}")

current directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty


# RMSE & MAE evaluation metrics

In [2]:
# calculate RMSE and MAE manually
def calculate_rmse_and_mae(actual_ratings, predicted_ratings):
    differences = [actual - predicted for actual, predicted in zip(actual_ratings, predicted_ratings)]
    
    # RMSE
    squared_differences = [diff ** 2 for diff in differences]
    mean_squared_difference = sum(squared_differences) / len(squared_differences)
    rmse = mean_squared_difference ** 0.5

    # MAE
    absolute_differences = [abs(diff) for diff in differences]
    mae = sum(absolute_differences) / len(absolute_differences)

    return rmse, mae

# Test
actual_ratings = [4, 4]  # Ground truth ratings
predicted_ratings = [3, 5]  # Predicted ratings
rmse, mae = calculate_rmse_and_mae(actual_ratings, predicted_ratings)
print("RMSE: ", rmse)
print("MAE: ", mae)


RMSE:  1.0
MAE:  1.0


# Data Overview

In [3]:
# Construct the path to data file
data_path = os.path.join(current_dir, 'large_merged_data.csv')
print(f'data path: {data_path}')

data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/large_merged_data.csv


In [4]:
# Read the data
amazon_data = pd.read_csv(data_path)
# get sample data of NUM_SAMPLES rows
amazon_data.info()
# get neccessary columns
amazon_data = amazon_data[['title', 'rating', 'reviewText', 'reviewerID']]
amazon_data.head(3)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9767 entries, 0 to 9766
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   rating      9767 non-null   float64
 1   reviewerID  9767 non-null   object 
 2   asin        9767 non-null   object 
 3   reviewText  9759 non-null   object 
 4   summary     9759 non-null   object 
 5   title       9767 non-null   object 
dtypes: float64(1), object(5)
memory usage: 458.0+ KB


Unnamed: 0,title,rating,reviewText,reviewerID
0,Jenna Jameson Heartbreaker Perfume for women 3...,1.0,"I use a lot of perfume, I go through a new bot...",A2RYSCZOPEXOCQ
1,Norelco 6885XL Deluxe Quadra Action Cord/Cordl...,5.0,"First, a little background. I've switched bet...",A141OPVE376YFI
2,Norelco 6885XL Deluxe Quadra Action Cord/Cordl...,5.0,"First, a little background. I've switched bet...",A141OPVE376YFI


# Simple statistical methods (mean calculations) 

+ **Zero-Shot Prediction (zero_shot_predict function):**
This method calculates the average rating for a given product title from the `amazon_data` DataFrame.
It does not take into account any user-specific information and predicts the rating based on the average rating of the product across all users.

+ **Few-Shot Prediction (few_shot_predict function):**
This method calculates the average rating for a given product title by a specific user from the `amazon_data` DataFrame.
It predicts the rating based on the average rating of the product by that specific user, thus incorporating user-specific information.

In [5]:
%%time 

def zero_shot_predict(product_title):
    # Get the average rating for the given title
    avg_rating = amazon_data.loc[amazon_data['title'] == product_title, 'rating'].mean()
    return avg_rating

def few_shot_predict(product_title, user_id):
    # Get the average rating for the given title by the specific user
    avg_rating = amazon_data.loc[(amazon_data['title'] == product_title) & (amazon_data['reviewerID'] == user_id), 'rating'].mean()
    return avg_rating

# Applying predictions
amazon_data['predicted_rating_zero_shot'] = amazon_data['title'].apply(zero_shot_predict)
amazon_data['predicted_rating_few_shot'] = [few_shot_predict(row['title'], row['reviewerID']) for _, row in amazon_data.iterrows()]

# Calculating RMSE and MAE for Zero-Shot
rmse_zero_shot = mean_squared_error(amazon_data['rating'], amazon_data['predicted_rating_zero_shot'], squared=False)
mae_zero_shot = mean_absolute_error(amazon_data['rating'], amazon_data['predicted_rating_zero_shot'])

# Calculating RMSE and MAE for Few-Shot
rmse_few_shot = mean_squared_error(amazon_data['rating'], amazon_data['predicted_rating_few_shot'], squared=False)
mae_few_shot = mean_absolute_error(amazon_data['rating'], amazon_data['predicted_rating_few_shot'])

print(f"Zero-Shot RMSE: {rmse_zero_shot}, MAE: {mae_zero_shot}")
print(f"Few-Shot RMSE: {rmse_few_shot}, MAE: {mae_few_shot}")

amazon_data.head(3)



Zero-Shot RMSE: 0.6822555528345602, MAE: 0.33366073923025447
Few-Shot RMSE: 0.08738080415620557, MAE: 0.007011706085116549
CPU times: user 20.9 s, sys: 79.4 ms, total: 21 s
Wall time: 21 s


Unnamed: 0,title,rating,reviewText,reviewerID,predicted_rating_zero_shot,predicted_rating_few_shot
0,Jenna Jameson Heartbreaker Perfume for women 3...,1.0,"I use a lot of perfume, I go through a new bot...",A2RYSCZOPEXOCQ,1.0,1.0
1,Norelco 6885XL Deluxe Quadra Action Cord/Cordl...,5.0,"First, a little background. I've switched bet...",A141OPVE376YFI,5.0,5.0
2,Norelco 6885XL Deluxe Quadra Action Cord/Cordl...,5.0,"First, a little background. I've switched bet...",A141OPVE376YFI,5.0,5.0


# Zero-shot (GPT-3.5-turbo)

In [6]:
data = amazon_data

In [7]:
%%time
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def predict_rating(title, model=GPT_MODEL_NAME, temperature=TEMPERATURE):
    prompt = f"How will users rate this product title: '{title}'? (1 being lowest and 5 being highest) Attention! Just give me back the exact whole number as a result, and you don't need a lot of text."

    response = openai.ChatCompletion.create(
        model=model,
        temperature=temperature,
        messages=[
            {
                "role": "system",
                "content": "You are an Amazon Beauty products critic."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]
    )

    rating_text = response.choices[0].message['content'].strip()
    try:
        # Extract the first numerical value from the response
        rating = float(re.search(r'\d+', rating_text).group())  # Only capture whole numbers
        if not (1 <= rating <= 5):
            raise ValueError("Rating out of bounds")
    except (ValueError, AttributeError):
        print(f"Unexpected response for '{product_title}': {rating_text}")
        rating = 0  # Set default value to 0 for unexpected responses

    return rating

# Parameters
PAUSE_EVERY_N = 20
SLEEP_TIME = 60 # Sleep for 60 seconds

# Iterate through the dataset and predict ratings
predicted_ratings = []
titles = data['title'].unique()
for idx, title in enumerate(titles):
    predicted_rating = predict_rating(title)
    print(f"Predicted rating for {title}: {predicted_rating}")
    predicted_ratings.append(predicted_rating)
    
    # Pause every PAUSE_EVERY_N rows
    if (idx + 1) % PAUSE_EVERY_N == 0:
        print(f"Pausing for {SLEEP_TIME} seconds...")
        time.sleep(SLEEP_TIME)

# Create a DataFrame with titles and predicted ratings
predicted_ratings_df = pd.DataFrame({
    'title': titles,
    'zero_shot_predicted_rating': predicted_ratings
})

# Merge the predicted ratings with the original data
merged_data_with_predictions = pd.merge(data, predicted_ratings_df, on='title')

# Save the merged data with predictions to a new CSV file
merged_data_with_predictions.to_csv('../../data/amazon-beauty/predictions_zero_shot.csv', index=False)


Predicted rating for Jenna Jameson Heartbreaker Perfume for women 3.4 oz Eau De Parfum Spray: 4.0
Predicted rating for Norelco 6885XL Deluxe Quadra Action Cord/Cordless Rechargeable Men's Shaver: 4.0
Predicted rating for Philips Norelco HQ5 Shaving Heads: 4.0
Predicted rating for Braun Clean &amp; Renew Refill Cartridges CCR - 2 Count (Packaging May Vary): 4.0
Predicted rating for Braun 3000 Series InterFace/Interface Excel Replacement Pack: 4.0
Predicted rating for Philips Norelco HQ167 Cool Skin Replacement Heads for 6700 Series: 4.0
Predicted rating for Scope Original Mint Mouthwash 50.7 Fl Oz: 4.0
Predicted rating for Reach Dentotape Waxed Dental Floss with Extra Wide Cleaning Surface for Large Spaces between Teeth, Unflavored, 100 Yards: 4.0
Predicted rating for Aqua Velva After Shave, Classic Ice Blue, 7 Ounce: 4.0
Predicted rating for L'Oreal Studio Line Lasting Curls, Curl Enhancing , 6 oz (170 g): 4.0
Predicted rating for Panasonic Bikini Shaper and Trimmer for Women ES246AC; 

In [8]:
# evaluate the rating prediction model

product_titles = merged_data_with_predictions['title']
actual_ratings = merged_data_with_predictions['rating']

# Remove None predictions if any
actual_ratings_filtered, predicted_ratings_filtered = zip(*[(actual, predicted) for actual, predicted in zip(actual_ratings, predicted_ratings) if predicted is not None])

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(actual_ratings_filtered, predicted_ratings_filtered))
print(f'Root Mean Squared Error (RMSE): {rmse}')

# Calculate MAE
mae = mean_absolute_error(actual_ratings_filtered, predicted_ratings_filtered)
print(f'Mean Absolute Error (MAE): {mae}')


Root Mean Squared Error (RMSE): 1.1343651866449433
Mean Absolute Error (MAE): 1.0117710005350455


In [9]:
# calculate RMSE and MAE manually using calculate_rmse_and_mae function
rmse, mae = calculate_rmse_and_mae(
    actual_ratings_filtered, predicted_ratings_filtered
)
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")


RMSE: 1.1343651866449433
MAE: 1.0117710005350455


# Few-shot


For each user, we'll use 4 of their ratings as training data to predict ratings for the rest of their products. Finally, we'll evaluate the predictions against the actual ratings to calculate the overall RMSE and MAE.

In [10]:
%%time
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def predict_rating_few_shot(product_title, rating_history, model=GPT_MODEL_NAME, temperature=TEMPERATURE):
    """
    Predict the rating of a product based on user's past rating history using the GPT model.
    
    Parameters:
    - product_title (str): The title of the product for which rating needs to be predicted.
    - rating_history (str): A string representation of user's past product ratings.
    - model (str): The GPT model version to use.
    - temperature (float): Sampling temperature for the model response. 
    
    Returns:
    - float: Predicted rating for the product or None if the response is not valid.
    """
    # Construct the prompt to ask the model
    prompt = (f"Here is user rating history: {rating_history}; "
              f"Based on the above rating history, please predict user's rating for the product: '{product_title}', "
              "(1 being lowest and 5 being highest.). "
              "Attention! Just give me back the exact whole number as a result, and you don't need a lot of text.")
    
    # Make the API call
    response = openai.ChatCompletion.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": "You are a product critic."},
            {"role": "user", "content": prompt}
        ]
    )
    
    rating_text = response.choices[0].message['content'].strip()
    try:
        # Extract the first numerical value from the response
        rating = float(re.search(r'\d+?', rating_text).group())
        if not (0.5 <= rating <= 5.0):
            raise ValueError("Rating out of bounds")
    except (ValueError, AttributeError):
        print(f"Unexpected response for '{product_title}': {rating_text}")
        rating = 0  # Set default value to 0 for unexpected responses

    return rating

# Parameters for pausing
PAUSE_EVERY_N_USERS = 10
SLEEP_TIME = 60  # Sleep for 10 seconds
predicted_ratings = []
actual_ratings = []

# For each user in the dataset
users = data['reviewerID'].unique()
for idx, reviewerID in enumerate(users):
    user_data = data[data['reviewerID'] == reviewerID]
    for reviewerID in data['reviewerID'].unique():
    user_data = data[data['reviewerID'] == reviewerID]
    
    # Check if the user has at least 5 ratings
    if len(user_data) >= 5:
        train_data = user_data.sample(4, random_state=RANDOM_STATE)
        test_data = user_data.drop(train_data.index)

        # For each product in the testing set, use the training data to predict a rating
        for _, test_row in test_data.iterrows():
            rating_history_str = ', '.join([f"{row['title']} ({row['rating']} stars)" for _, row in train_data.iterrows()])
            predicted_rating = predict_rating_few_shot(test_row['title'], rating_history_str)
            
            predicted_ratings.append(predicted_rating)
            actual_ratings.append(test_row['rating'])
    
    # Introduce a pause after processing every PAUSE_EVERY_N_USERS
    if (idx + 1) % PAUSE_EVERY_N_USERS == 0:
        print(f"Processed {idx + 1} users. Pausing for {SLEEP_TIME} seconds...")
        time.sleep(SLEEP_TIME)

# Save the predicted ratings to a new CSV file
predicted_ratings_df = pd.DataFrame({
    'few_shot_predicted_rating': predicted_ratings,
    'actual_rating': actual_ratings
})
predicted_ratings_df.to_csv('../../data/amazon-beauty/large_predictions_few_shot.csv', index=False)

predicted_ratings_df.head(3)

IndentationError: expected an indented block after 'for' statement on line 53 (<unknown>, line 54)

In [None]:
predicted_ratings

[1.0, 1.0, 2.0, 2.0, 2.0, 5.0, 1.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0]

In [None]:
filtered_list = [(actual, predicted) for actual, predicted in zip(actual_ratings, predicted_ratings) if predicted is not None]

if not filtered_list:
    print("No valid predictions available for evaluation.")
else:
    actual_ratings_filtered, predicted_ratings_filtered = zip(*filtered_list)
    # Evaluate the model's performance
    rmse = np.sqrt(mean_squared_error(actual_ratings_filtered, predicted_ratings_filtered))
    print(f'Root Mean Squared Error (RMSE): {rmse}')

    mae = mean_absolute_error(actual_ratings_filtered, predicted_ratings_filtered)
    print(f'Mean Absolute Error (MAE): {mae}')


Root Mean Squared Error (RMSE): 1.9086270308410553
Mean Absolute Error (MAE): 1.0714285714285714


In [None]:
# demonstrate the output of 4 random historical ratings
predicted_ratings = []
for title in data['title'].unique():
    # Randomly sample 4 rows from the entire dataset
    rating_history_samples = data.sample(4)
    rating_history_str = ', '.join([f"{row['title']} ({row['rating']} stars)" for _, row in rating_history_samples.iterrows()])

rating_history_str


'Bath &amp; Body Works Ile De Tahiti Moana Coconut Vanille Moana Body Wash with Tamanoi 8.5 oz (5.0 stars), Citre Shine Moisture Burst Shampoo - 16 fl oz (2.0 stars), Oud Wood Oil. IMPRESSION of Tom Ford Oud Wood* Cologne for Men with SIMILAR Fragrance Accords, 10ml Amber Glass Roller, Black Cap; 100% Pure (Perfume Studio Oud Wood VERSION/TYPE; Not Original Brand) (2.0 stars), Astra Platinum Double Edge Safety Razor Blades ,100 Blades (20 x 5) (5.0 stars)'

# References

+ https://platform.openai.com/docs/api-reference/authentication