In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here are several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load the raw ratings.
data = pd.read_csv('/kaggle/input/amazon-ratings/ratings_Beauty.csv')

# DataFrame.drop returns a new frame, so the result has to be assigned;
# calling it as a bare statement leaves the Timestamp column in place.
data = data.drop(columns=['Timestamp'])

# Work on a fixed positional slice of rows. The explicit copy makes the slice
# an independent frame, so adding the index columns below does not raise
# SettingWithCopyWarning or write through to a temporary.
data = data.iloc[2500:15000].copy()

# Map user and product IDs to contiguous integer indices, numbered in order of
# first appearance within the slice.
user_mapping = {user_id: idx for idx, user_id in enumerate(data['UserId'].unique())}
product_mapping = {product_id: idx for idx, product_id in enumerate(data['ProductId'].unique())}

# Attach the integer indices to the frame.
data['UserIndex'] = data['UserId'].map(user_mapping)
data['ProductIndex'] = data['ProductId'].map(product_mapping)

# Keep a second reference to the ratings frame under the name `testing`.
testing = data

# User x product rating matrix. pivot_table aggregates duplicate
# (user, product) pairs with its default aggfunc (mean), and pairs with no
# rating are filled with 0.
interaction_matrix = pd.pivot_table(
    data,
    values='Rating',
    index='UserIndex',
    columns='ProductIndex',
    fill_value=0,
)
interaction_matrix.shape

(11999, 476)

In [10]:
import pickle

# Persist the UserId -> integer index lookup so it can be reloaded later
# without rebuilding it from the ratings file.
with open('user_mapping.pkl', 'wb') as user_mapping_file:
    pickle.dump(user_mapping, user_mapping_file)

In [12]:
import pickle

# Persist the ProductId -> integer index lookup alongside the user mapping.
with open('product_mapping.pkl', 'wb') as product_mapping_file:
    pickle.dump(product_mapping, product_mapping_file)

In [2]:
import pandas as pd

from surprise import SVD
from surprise import dataset
from surprise import Reader

# Apply SVD to get initial embeddings.
# full_matrices=False requests the reduced decomposition: U has
# min(n_users, n_products) columns instead of being a square
# n_users x n_users matrix. Only the leading `latent_features` columns are
# kept below, so the full-size U would be computed and then thrown away.
U, sigma, Vt = np.linalg.svd(interaction_matrix, full_matrices=False)

# Number of latent features
latent_features = 10

# Truncate U, sigma, and Vt to the desired number of latent features.
U = U[:, :latent_features]
sigma = np.diag(sigma[:latent_features])
Vt = Vt[:latent_features, :]

# Initialize user and product embeddings from U and Vt. The copies give the
# embeddings their own contiguous storage, so in-place updates applied to them
# later do not write back into U / Vt.
# NOTE(review): `sigma` is not folded into either embedding matrix, so the
# initial dot products do not reproduce the scale of the ratings -- confirm
# that this is the intended starting point.
user_embeddings = U.copy()
product_embeddings = Vt.T.copy()

# Hyperparameters
learning_rate = 0.01
reg_lambda = 0.01  # Regularization parameter
num_epochs = 10

In [3]:
# Reader describing the rating records: (user, item, rating) fields with
# ratings on a 1-5 scale.
reader = Reader(
    rating_scale=(1, 5),
    line_format='user item rating',
)

# Thin Dataset subclass that is populated straight from a DataFrame.
class MyDataset(dataset.DatasetAutoFolds):
    """Dataset whose raw ratings are taken from DataFrame columns.

    The parent initialiser is not called; only ``raw_ratings`` and ``reader``
    are set on the instance.

    Parameters
    ----------
    df : DataFrame with ``UserIndex``, ``ProductIndex`` and ``Rating`` columns.
    reader : the Reader to attach to this dataset.
    """

    def __init__(self, df, reader):
        users = df['UserIndex']
        items = df['ProductIndex']
        ratings = df['Rating']

        self.reader = reader
        # Each record is (user, item, rating, timestamp); the timestamp slot
        # is filled with None.
        self.raw_ratings = [
            (user, item, rating, None)
            for user, item, rating in zip(users, items, ratings)
        ]

# Wrap the ratings frame for Surprise. The frame is read through `testing`
# (bound to the ratings DataFrame in the loading cell) rather than through
# `data`, because this statement rebinds `data` to the wrapper: building from
# `data` itself would fail on a second run of this cell, when `data` is no
# longer a DataFrame.
data = MyDataset(testing, reader)

# Hold out 30% of the ratings for evaluation; the fixed random_state keeps the
# split reproducible.
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

In [None]:
# Training loop: refine the embeddings with one regularised SGD step per
# training rating.
# NOTE(review): the epoch count is hard-coded to 51, while `num_epochs` is
# defined with the other hyperparameters and never read -- confirm which
# value is intended.
for epoch in range(51):
    total_loss = 0

    for inner_uid, inner_iid, rating in train_data.all_ratings():
        # all_ratings() yields the trainset's *inner* ids. The embedding rows
        # are addressed by the raw ids (UserIndex / ProductIndex) in the
        # evaluation and recommendation cells, so translate back to raw ids
        # here; otherwise training updates different rows from the ones that
        # are later read.
        user_idx = train_data.to_raw_uid(inner_uid)
        item_idx = train_data.to_raw_iid(inner_iid)

        # Snapshot both vectors so that the two updates below use the values
        # from before this step (the item update must not see the freshly
        # updated user vector).
        user_vec = user_embeddings[user_idx].copy()
        item_vec = product_embeddings[item_idx].copy()

        predicted_rating = np.dot(user_vec, item_vec)
        error = rating - predicted_rating

        # Gradient step on squared error with L2 regularisation.
        user_embeddings[user_idx] += learning_rate * (error * item_vec - reg_lambda * user_vec)
        product_embeddings[item_idx] += learning_rate * (error * user_vec - reg_lambda * item_vec)

        total_loss += error ** 2

    # Calculate and print average loss for the epoch
    average_loss = total_loss / train_data.n_ratings
    print(f"Epoch {epoch + 1}: average squared error = {average_loss:.4f}")

In [8]:
# Write both embedding matrices to .npy files in the working directory.
np.save(file='user_embeddings.npy', arr=user_embeddings)
np.save(file='product_embeddings.npy', arr=product_embeddings)

In [None]:
# Score every held-out (user, item, rating) triple with the dot product of the
# corresponding embedding rows.
test_predictions = np.array([
    np.dot(user_embeddings[user_idx], product_embeddings[item_idx])
    for user_idx, item_idx, _ in test_data
])
actual_ratings = np.array([rating for _, _, rating in test_data])

# Calculate and print RMSE and MAE. The residuals are computed once and
# shared by both metrics.
residuals = test_predictions - actual_ratings
rmse = np.sqrt(np.mean(residuals ** 2))
mae = np.mean(np.abs(residuals))
print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")

In [6]:
# Inspect what `data` is currently bound to. The split cell assigns a
# MyDataset instance to this name.
type(data)

__main__.MyDataset

In [None]:
# Test recommendations for one hard-coded user.
# Raises KeyError if this UserId is not in user_mapping.
user_index_to_test = user_mapping['AT7EYU8AKND5V']
test_products = list(testing[testing['UserIndex'] == user_index_to_test]['ProductIndex'])

# Set for constant-time "already rated" checks inside the loop.
rated_products = set(test_products)

# Invert product_mapping once instead of scanning it for every printed
# recommendation. Each index maps to exactly one ProductId because the
# mapping is built with enumerate over unique ids.
index_to_product = {idx: product_id for product_id, idx in product_mapping.items()}

# Bounds that do not change during the loop: only score products that have an
# embedding row, and only if the user has one as well.
n_scorable_products = min(len(product_mapping), len(product_embeddings))
user_has_embedding = user_index_to_test < len(user_embeddings)

recommendations = []
if user_has_embedding:
    user_vector = user_embeddings[user_index_to_test]
    for product_idx in range(n_scorable_products):
        if product_idx in rated_products:
            continue
        predicted_rating = np.dot(user_vector, product_embeddings[product_idx])
        recommendations.append((product_idx, predicted_rating))

# Rank products based on predicted ratings (highest first; sorted() is stable,
# so ties keep ascending product-index order).
recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)

# Print top N recommendations
top_recommendations = recommendations[:10]
for product_idx, predicted_rating in top_recommendations:
    product_id = index_to_product[product_idx]
    print(f"Recommended Product: {product_id}")
