In [16]:
import numpy as np
from sklearn.model_selection import train_test_split
import sys
import os
import openai
import pandas as pd

# Make the directory two levels above the working directory importable so the
# project-level `constants` module can be found. The path is relative, so it
# is resolved against the process's current working directory.
sys.path.append('../../')
# Wildcard import: names such as OPENAI_API_KEY and RANDOM_STATE used below are
# not defined in this notebook and presumably come from here — TODO confirm,
# and consider importing them explicitly.
from constants import *

# Configure the OpenAI client with the API key.
openai.api_key = OPENAI_API_KEY

# Derive the data directory: take the absolute form of a relative notebook path
# and keep only its directory part. The result depends on the current working
# directory, not on where this notebook file actually lives.
current_dir = os.path.dirname(os.path.abspath("../../data/amazon-beauty/rating_prediction.ipynb"))
print(f"current directory: {current_dir}")
# Build the path to the merged CSV inside that directory.
data_path = os.path.join(current_dir, 'large_merged_data.csv')
print(f'data path: {data_path}')

def split_data_by_rated_items(df, user_col, test_size, given_n, random_state=RANDOM_STATE):
    """
    Stratified train/test split with a per-user cap on the test rows.

    The frame is first split with ``train_test_split``, stratifying on
    ``user_col``. The test part is then grouped by ``user_col`` and each
    group is down-sampled to at most ``given_n`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating data containing ``user_col``.
    user_col : str
        Column identifying the user; used for stratification and grouping.
    test_size : float or int
        Passed straight through to ``train_test_split``.
    given_n : int
        Maximum number of test rows kept per user.
    random_state : int, optional
        Seed used for both the split and the per-user sampling.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``. ``train_df`` keeps its original index;
        ``test_df`` gets a fresh index via ``reset_index(drop=True)``.

    Notes
    -----
    NOTE(review): scikit-learn's stratified split is expected to reject
    users that have fewer than two rows — confirm against the data.
    """
    def _cap_user_rows(user_rows):
        # Never ask for more rows than the user actually has.
        n_keep = min(len(user_rows), given_n)
        return user_rows.sample(n_keep, random_state=random_state)

    train_part, held_out = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df[user_col],
    )

    capped = held_out.groupby(user_col).apply(_cap_user_rows)
    capped = capped.reset_index(drop=True)
    return train_part, capped


def split_data_by_user_percentage(df, user_col, percentages, random_state=None):
    """
    Partition the rows of ``df`` into consecutive, user-disjoint sets.

    The unique values of ``user_col`` are shuffled and then cut at the
    cumulative fractions given in ``percentages``. Every row of a user goes
    to the set that user was assigned to.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating data containing ``user_col``.
    user_col : str
        Column identifying the user.
    percentages : sequence of float
        Cumulative boundaries (fractions of the number of unique users),
        NOT per-set sizes. For example ``[0, 0.05, 0.15, 0.55, 1]`` yields
        four sets holding the first 5%, the next 10%, the next 40% and the
        remaining 45% of the shuffled users. Each boundary is turned into a
        user count with ``int()``, i.e. truncated toward zero.
    random_state : int or None, optional
        Seed for the shuffle. ``None`` gives a non-reproducible shuffle.

    Returns
    -------
    list of pandas.DataFrame
        ``len(percentages) - 1`` frames, in boundary order (an empty list
        when fewer than two boundaries are given).
    """
    unique_users = df[user_col].unique()

    # Shuffle with a private generator rather than np.random.seed(): this
    # produces the same permutation for a given seed but does not reseed the
    # process-wide NumPy RNG behind the caller's back.
    rng = np.random.RandomState(random_state)
    rng.shuffle(unique_users)

    total_users = len(unique_users)
    cut_points = [int(p * total_users) for p in percentages]

    # Pair each boundary with the next one and select the rows of the users
    # that fall between them.
    return [
        df[df[user_col].isin(unique_users[start:stop])]
        for start, stop in zip(cut_points, cut_points[1:])
    ]


def all_but_one(df, user_col, random_state=None):
    """
    Leave-one-out split: hold out one randomly chosen rating per user.

    Rows are selected by position rather than by index label, so the split
    stays correct when ``df.index`` contains duplicate labels (dropping by
    label would remove every row that shares a held-out label).

    Parameters
    ----------
    df : pandas.DataFrame
        Rating data containing ``user_col``.
    user_col : str
        Column identifying the user; one row is sampled per group of it.
    random_state : int or None, optional
        Seed forwarded to the per-group sampling.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``. ``test_df`` holds one row per user group;
        ``train_df`` holds all remaining rows in their original order. Both
        keep the original index labels. A user with a single rating ends up
        only in ``test_df``.
    """
    # Sample row *positions* per user. The user column is re-indexed to a
    # plain 0..n-1 index so it lines up with the positions series.
    positions = pd.Series(np.arange(len(df)))
    user_keys = df[user_col].reset_index(drop=True)
    held_out = (
        positions.groupby(user_keys)
        .sample(n=1, random_state=random_state)
        .to_numpy()
    )

    # Everything that was not held out belongs to the training set.
    in_train = np.ones(len(df), dtype=bool)
    in_train[held_out] = False

    test_df = df.iloc[held_out]
    train_df = df.iloc[in_train]
    return train_df, test_df


current directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty
data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/large_merged_data.csv


In [9]:
# Load the merged review data from the CSV located in the setup cell.
amazon_data = pd.read_csv(data_path)
# Print a summary of the columns, non-null counts and dtypes (no sampling is
# performed here; the full file is kept).
amazon_data.info()
# Select the columns of interest into a separate frame.
df = amazon_data[['title', 'rating', 'reviewText', 'reviewerID', 'category']]
# Preview the first three rows.
df.head(3)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9767 entries, 0 to 9766
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   rating      9767 non-null   float64
 1   reviewerID  9767 non-null   object 
 2   asin        9767 non-null   object 
 3   reviewText  9759 non-null   object 
 4   summary     9759 non-null   object 
 5   category    9767 non-null   object 
 6   title       9767 non-null   object 
dtypes: float64(1), object(6)
memory usage: 534.3+ KB


Unnamed: 0,title,rating,reviewText,reviewerID,category
0,Jenna Jameson Heartbreaker Perfume for women 3...,1.0,"I use a lot of perfume, I go through a new bot...",A2RYSCZOPEXOCQ,[]
1,Norelco 6885XL Deluxe Quadra Action Cord/Cordl...,5.0,"First, a little background. I've switched bet...",A141OPVE376YFI,[]
2,Norelco 6885XL Deluxe Quadra Action Cord/Cordl...,5.0,"First, a little background. I've switched bet...",A141OPVE376YFI,[]


In [15]:
# The three split helpers used below are defined in the first cell of this
# notebook, so no separate utils import is needed here.

# Split data such that for each user in the test set, we only retain a given number of rated items.
train_df_given_10, test_df_given_10 = split_data_by_rated_items(df, user_col='reviewerID', test_size=0.2, given_n=10, random_state=RANDOM_STATE)

# Split data based on percentages of unique users.
# The `percentages` list holds cumulative boundaries: with [0, 0.05, 0.15, 0.55, 1],
# 5% of users go to the first set, the next 10% to the second, the next 40% to the
# third, and the remaining 45% to the test set.
sets = split_data_by_user_percentage(df, user_col='reviewerID', percentages=[0, 0.05, 0.15, 0.55, 1], random_state=RANDOM_STATE)
M5_pct_df, M10_pct_df, M40_pct_df, test_df = sets

# Leave-one-out split: one rating per user is held out as the test set.
train_df, test_df_all_but_one = all_but_one(df, user_col='reviewerID', random_state=RANDOM_STATE)

# Print the shapes of the "given 10" split.
print(f"train_df_given_10: {train_df_given_10.shape}")
print(f"test_df_given_10: {test_df_given_10.shape}")

# Print the shapes of the user-percentage split (the leave-one-out shapes are
# not printed in this cell).
print(f"M5_pct_df: {M5_pct_df.shape}")
print(f"M10_pct_df: {M10_pct_df.shape}")
print(f"M40_pct_df: {M40_pct_df.shape}")
print(f"test_df: {test_df.shape}")


train_df_given_10: (7813, 5)
test_df_given_10: (1954, 5)
M5_pct_df: (488, 5)
M10_pct_df: (960, 5)
M40_pct_df: (3956, 5)
test_df: (4363, 5)
