In [14]:
import openai
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html
# https://github.com/scikit-learn/scikit-learn/blob/d99b728b3/sklearn/metrics/pairwise.py#L1534
from sklearn.metrics.pairwise import cosine_similarity
import sys
import os
# Add the path to the constants file to the system path
sys.path.append('../../')
from constants import *

# Hand the API key (star-imported from the shared constants module) to the OpenAI client
openai.api_key = OPENAI_API_KEY

# Resolve the data folder: absolute parent directory of the notebook path below.
# The relative path is interpreted against the kernel's current working directory.
current_dir = os.path.split(
    os.path.abspath("../../data/amazon-beauty/rating_prediction.ipynb")
)[0]
print(f"current directory: {current_dir}")

# The merged CSV lives in that same folder
data_path = os.path.join(current_dir, 'large_merged_data.csv')
print(f'data path: {data_path}')

current directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty
data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/large_merged_data.csv


### Steps
+ Loading and preprocessing data from a CSV file.
+ Feature extraction using TF-IDF (Term Frequency-Inverse Document Frequency) for product titles.
+ Computing pairwise similarities between product titles using cosine similarity.
+ Generating recommendations based on these similarities.
+ Computing precision, recall, and F1 score for the recommendations.

In [15]:
# Load the merged review/metadata table from the CSV resolved in the setup cell
amazon_data = pd.read_csv(data_path)
# Print the schema summary (columns, non-null counts, dtypes) of the full table;
# no sampling happens here, every row of the file is kept
amazon_data.info()
# Narrow the frame to the columns the rest of the notebook works with
amazon_data = amazon_data.loc[:, ['title', 'rating', 'reviewText', 'reviewerID', 'category']]
# Preview the first few rows
amazon_data.head(3)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9767 entries, 0 to 9766
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   rating      9767 non-null   float64
 1   reviewerID  9767 non-null   object 
 2   asin        9767 non-null   object 
 3   reviewText  9759 non-null   object 
 4   summary     9759 non-null   object 
 5   category    9767 non-null   object 
 6   title       9767 non-null   object 
dtypes: float64(1), object(6)
memory usage: 534.3+ KB


Unnamed: 0,title,rating,reviewText,reviewerID,category
0,Jenna Jameson Heartbreaker Perfume for women 3...,1.0,"I use a lot of perfume, I go through a new bot...",A2RYSCZOPEXOCQ,[]
1,Norelco 6885XL Deluxe Quadra Action Cord/Cordl...,5.0,"First, a little background. I've switched bet...",A141OPVE376YFI,[]
2,Norelco 6885XL Deluxe Quadra Action Cord/Cordl...,5.0,"First, a little background. I've switched bet...",A141OPVE376YFI,[]


In [16]:
%%time

# Load and preprocess data
data = amazon_data.dropna()
corpus = data["title"].tolist()
X_train, X_test = train_test_split(data, test_size=0.2)

# Extract features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_features = vectorizer.fit_transform(X_train["title"].tolist())
X_test_features = vectorizer.transform(X_test["title"].tolist())

# Compute pairwise similarities using cosine similarity
similarity_matrix = cosine_similarity(X_test_features, X_train_features)

# Generate recommendations for each test user
recommendations = []
for i in range(len(X_test)):
    # Get the index of the user input in the corpus
    user_input = X_test.iloc[i]["title"]
    user_index = corpus.index(user_input)
    # Sort the similarity matrix in descending order and get the top 5 most similar items
    recommended_items = similarity_matrix[i].argsort()[:-6:-1]
    recommendations.append(recommended_items)

recommendations

CPU times: user 910 ms, sys: 73.3 ms, total: 984 ms
Wall time: 1.01 s


[array([   0, 4386, 6715, 6716, 4384]),
 array([7251, 5046, 3199, 3203, 1507]),
 array([5388, 5434, 1370, 7447,  923]),
 array([   0, 4386, 6715, 6716, 4384]),
 array([ 153, 6727, 4221, 3488, 7454]),
 array([6352, 3825, 1701, 5523, 3848]),
 array([ 273, 5764, 3506, 2856,  666]),
 array([6352, 3825, 1701, 5523, 3848]),
 array([2918, 2377, 3785, 1412, 3780]),
 array([2953, 7313,  545, 7142, 7056]),
 array([3390, 2723, 2801, 1595,  491]),
 array([   0, 4386, 6715, 6716, 4384]),
 array([6352, 3825, 1701, 5523, 3848]),
 array([1480, 7767,  191, 3555, 2165]),
 array([6737, 3842, 1408, 5869, 4098]),
 array([5173, 2574, 2313,  256, 1013]),
 array([ 862, 6907, 1008,  237, 4985]),
 array([2156, 4035, 7152, 4013,  365]),
 array([2156, 4035, 7152, 4013,  365]),
 array([6352, 3825, 1701, 5523, 3848]),
 array([2156, 4035, 7152, 4013,  365]),
 array([6179, 6670, 4422,  627, 7613]),
 array([6352, 3825, 1701, 5523, 3848]),
 array([1483, 6768, 7273,  378, 6398]),
 array([   0, 4386, 6715, 6716, 4384]),


In [17]:
# Compute precision, recall and F1, micro-averaged over all test rows.
# NOTE(review): as written, the "relevant" set of a test row is the reviewerIDs of all
# training rows sharing its category, and the "recommended" set is the reviewerIDs of its
# top-k training rows. Confirm this is the intended relevance definition.

# Index training reviewer IDs by category once, instead of re-filtering X_train per test row
reviewers_by_category = {}
for category, reviewer in zip(X_train["category"], X_train["reviewerID"]):
    reviewers_by_category.setdefault(category, set()).add(reviewer)

# One relevant set per test row; a category absent from the training split has no relevant reviewers
relevant_items = [reviewers_by_category.get(category, set()) for category in X_test["category"]]

# `recommendations` holds positional indices into X_train, so index a plain array of
# reviewer IDs rather than building a DataFrame slice per test row
train_reviewers = X_train["reviewerID"].to_numpy()
recommended_items = [set(train_reviewers[rec]) for rec in recommendations]

true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]

tp_total = sum(true_positives)
fp_total = sum(false_positives)
fn_total = sum(false_negatives)

# Guard every ratio: an empty test set or zero overlap would otherwise raise ZeroDivisionError
precision = tp_total / (tp_total + fp_total) if (tp_total + fp_total) else 0.0
recall = tp_total / (tp_total + fn_total) if (tp_total + fn_total) else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

# Print evaluation metrics
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")

Precision: 1.000
Recall: 0.003
F1 Score: 0.006


### -------> OBSERVATION

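+ The high precision indicates that the recommended items are almost always counted as relevant. In the context of this code, "relevant" means that the reviewer IDs of the recommended training rows appear among the reviewer IDs of training rows that belong to the same category as the test instance. Caveat: the sample rows displayed earlier all have the category value `[]`; if most rows share that value, nearly every recommendation is trivially "relevant", so this precision may be inflated rather than a sign of recommendation quality.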

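+ The low recall suggests that the system is very conservative in its recommendations: it is not recommending a large number of items that it should be. This is largely structural: only the top 5 most similar training rows are recommended per test instance, while the relevant set contains every reviewer ID in the same category, so recall stays low even when all 5 recommendations are hits.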