In [1]:
import pandas as pd
from datetime import datetime

# Load the dataset (update with your file path)
file_path = 'Reviews.csv'  # Replace with the path to your downloaded file
data = pd.read_csv(file_path)

# Inspect the dataset
print("Dataset Info:")
print(data.info())

print("\nFirst 5 Rows:")
print(data.head())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB
None

First 5 Rows:
   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZC

In [2]:
# Check for missing values
print("Missing Values:\n", data.isnull().sum())

# Drop rows with missing UserId, ProductId, or Score
data = data.dropna(subset=['UserId', 'ProductId', 'Score'])

# Fill missing text columns with an empty string
data['Summary'] = data['Summary'].fillna('')
data['Text'] = data['Text'].fillna('')

Missing Values:
 Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64


In [3]:
from datetime import datetime

# Convert 'Time' column to datetime
data['ReviewTime'] = data['Time'].apply(lambda x: datetime.fromtimestamp(x))

# Extract year and month from the review time
data['Year'] = data['ReviewTime'].dt.year
data['Month'] = data['ReviewTime'].dt.month

In [4]:
# Filter data for the years 2010 to 2012
recent_data = data[data['Year'].isin([2010, 2011, 2012])]

print("Filtered Data Shape:", recent_data.shape)

Filtered Data Shape: (447842, 13)


In [5]:
# Randomly sample 10,000 rows if the filtered data is still large
subset_data = recent_data.sample(n=10000, random_state=42)

print("Subset Shape:", subset_data.shape)

Subset Shape: (10000, 13)


In [6]:
from sklearn.decomposition import TruncatedSVD
import numpy as np

# Apply SVD to the interaction matrix
interaction_matrix = subset_data.pivot_table(
    index='UserId',
    columns='ProductId',
    values='Score'
).fillna(0)

# Convert the matrix to a NumPy array
interaction_matrix_np = interaction_matrix.values

# Apply SVD
svd = TruncatedSVD(n_components=50, random_state=42)
svd_matrix = svd.fit_transform(interaction_matrix_np)

# Function to recommend top N products for a user
def recommend_products(user_index, svd_matrix, interaction_matrix, top_n=10):
    user_ratings = svd_matrix[user_index]
    product_indices = np.argsort(-user_ratings)[:top_n]  # Get top N indices
    recommended_products = interaction_matrix.columns[product_indices]
    return recommended_products

# Example: Recommend products for the first user in the matrix
user_index = 0  # Adjust as needed
recommended_products = recommend_products(user_index, svd_matrix, interaction_matrix, top_n=10)
print("Recommended Products for User:", recommended_products)

Recommended Products for User: Index(['B0000E5JR0', 'B0000E65WO', 'B0000DJDJZ', 'B0000DK4G4', 'B0000DIYKE',
       'B0000DID60', 'B0000D9MYO', 'B0000DJ4BQ', 'B0000CER9K', 'B0000D9MTS'],
      dtype='object', name='ProductId')


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between products
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to recommend similar products
def recommend_similar_products(product_index, cosine_sim, product_ids, top_n=10):
    similar_indices = np.argsort(-cosine_sim[product_index])[:top_n]  # Get top N indices
    similar_products = [product_ids[i] for i in similar_indices]
    return similar_products

# Example: Recommend products similar to the first product in the dataset
product_index = 0  # Adjust as needed
product_ids = subset_data['ProductId'].unique()
similar_products = recommend_similar_products(product_index, cosine_sim, product_ids, top_n=10)
print("Similar Products to Product:", product_ids[product_index])
print(similar_products)

NameError: name 'tfidf_matrix' is not defined

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine text data to create a richer product description
subset_data['combined_text'] = subset_data['Summary'] + " " + subset_data['Text']

# Create a TF-IDF vectorizer and fit-transform the combined text
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(subset_data['combined_text'])

print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

TF-IDF Matrix Shape: (10000, 5000)


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity using the TF-IDF matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix, dense_output=False)

print("Cosine similarity computed successfully!")

Cosine similarity computed successfully!


In [10]:
import numpy as np

# Function to recommend top N similar products
def recommend_similar_products(product_index, cosine_sim, product_ids, top_n=10):
    # Get similarity scores for the given product
    similarity_scores = cosine_sim[product_index].toarray().flatten()
    
    # Get indices of the top N similar products (excluding itself)
    similar_indices = similarity_scores.argsort()[-top_n-1:-1][::-1]
    
    # Get product IDs of similar products
    similar_products = [product_ids[i] for i in similar_indices]
    
    return similar_products

# Example usage
product_index = 0  # Replace with the index of a product in your dataset
product_ids = subset_data['ProductId'].unique()  # Unique product IDs from the dataset
similar_products = recommend_similar_products(product_index, cosine_sim, product_ids, top_n=10)

print(f"Products similar to product {product_ids[product_index]}: {similar_products}")

IndexError: index 7854 is out of bounds for axis 0 with size 6497

In [11]:
# Updated function to recommend top N similar products
def recommend_similar_products(product_index, cosine_sim, product_ids, top_n=10):
    # Get similarity scores for the given product
    similarity_scores = cosine_sim[product_index].toarray().flatten()
    
    # Exclude the product itself and get indices of the top N similar products
    similar_indices = np.argsort(-similarity_scores)[1:top_n+1]
    
    # Map the indices to product IDs
    similar_products = product_ids[similar_indices]
    
    return similar_products

In [12]:
# Confirm product IDs correspond to the rows in tfidf_matrix
product_ids = subset_data['ProductId'].reset_index(drop=True)

In [13]:
product_index = 0  # Replace with a valid index in your dataset
similar_products = recommend_similar_products(product_index, cosine_sim, product_ids, top_n=10)

print(f"Products similar to product {product_ids[product_index]}: {similar_products}")

Products similar to product B0029NS7BU: 7854    B000F9Z29U
7624    B003QNJYXM
7851    B00451ZJB0
4001    B0033GZMXS
1739    B000WFRUP6
3652    B005HGAV0G
8567    B000YSS7EO
9495    B003VXHGPK
2027    B000ILEITA
8594    B000MXGMIE
Name: ProductId, dtype: object


In [14]:
# Example: Assuming we have a list of actual purchased items per user for evaluation
actual_items = {
    "User1": ["ProductA", "ProductB"],
    "User2": ["ProductC"],
    # Add more users and their purchased products for evaluation
}

# Function to calculate Precision@K
def precision_at_k(recommended, actual, k):
    recommended_at_k = recommended[:k]
    relevant_and_recommended = set(recommended_at_k).intersection(set(actual))
    precision = len(relevant_and_recommended) / k
    return precision

# Example usage
recommended_items = ["ProductA", "ProductD", "ProductE"]  # Top K recommended items
actual_items_user1 = actual_items["User1"]
precision = precision_at_k(recommended_items, actual_items_user1, k=3)
print(f"Precision@3: {precision}")

Precision@3: 0.3333333333333333


In [15]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[K     |████████████████████████████████| 255 kB 6.1 MB/s eta 0:00:01
[?25hCollecting huggingface-hub>=0.20.0
  Downloading huggingface_hub-0.27.0-py3-none-any.whl (450 kB)
[K     |████████████████████████████████| 450 kB 49.8 MB/s eta 0:00:01
Collecting transformers<5.0.0,>=4.41.0
  Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[K     |████████████████████████████████| 10.0 MB 120.0 MB/s eta 0:00:01
Collecting fsspec>=2023.5.0
  Downloading fsspec-2024.10.0-py3-none-any.whl (179 kB)
[K     |████████████████████████████████| 179 kB 58.7 MB/s eta 0:00:01
Collecting typing-extensions>=3.7.4.3
  Using cached typing_extensions-4.12.2-py3-none-any.whl (37 kB)
Collecting tokenizers<0.21,>=0.20
  Downloading tokenizers-0.20.3-cp38-cp38-macosx_10_12_x86_64.whl (2.7 MB)
[K     |████████████████████████████████| 2.7 MB 26.0 MB/s eta 0:00:01
Collecting safetensors>=0.4.1
  Downl

In [16]:
from sentence_transformers import SentenceTransformer

# Initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Combine text data for richer descriptions (if not done already)
subset_data['combined_text'] = subset_data['Summary'].fillna('') + " " + subset_data['Text'].fillna('')

# Generate embeddings for the combined text
embeddings = model.encode(subset_data['combined_text'].tolist(), show_progress_bar=True)

print("Embeddings shape:", embeddings.shape)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Embeddings shape: (10000, 384)


In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between embeddings
cosine_sim = cosine_similarity(embeddings, embeddings)

print("Cosine similarity matrix computed!")

Cosine similarity matrix computed!


In [18]:
# Function to recommend top N similar products using embeddings
def recommend_similar_products(product_index, cosine_sim, product_ids, top_n=10):
    similarity_scores = cosine_sim[product_index]
    similar_indices = similarity_scores.argsort()[-top_n-1:-1][::-1]  # Top N similar products
    similar_products = [product_ids[i] for i in similar_indices]
    return similar_products

# Example usage
product_index = 0  # Replace with a valid product index
product_ids = subset_data['ProductId'].reset_index(drop=True)  # Ensure alignment
similar_products = recommend_similar_products(product_index, cosine_sim, product_ids, top_n=10)

print(f"Products similar to product {product_ids[product_index]}: {similar_products}")

Products similar to product B0029NS7BU: ['B003VMY488', 'B001E5116C', 'B001BOVDNC', 'B0081XPTBS', 'B004QQ82L8', 'B001BDEI6W', 'B0009X0RA6', 'B001E6IUMY', 'B001M08YZA', 'B005CUU23S']


In [20]:
# Evaluate Precision@3 for a sample user
recommended_items = recommend_similar_products(0, cosine_sim, product_ids, top_n=3)
actual_items_user1 = actual_items["User1"]  # Replace with your actual items
precision = precision_at_k(recommended_items, actual_items_user1, k=3)
print(f"Improved Precision@3: {precision}")

Improved Precision@3: 0.0


In [21]:
# Specify the file path where you want to save the subset data
output_file_path = "subset_data.csv"

# Save the subset data to a CSV file
subset_data.to_csv(output_file_path, index=False)

print(f"Subset data saved successfully to {output_file_path}")

Subset data saved successfully to subset_data.csv
