In [None]:
# Step 1. Data Collection and Preprocessing
#The first step is to load and preprocess the data. 
#We collect product data, customer profiles, purchase history, and browsing behavior from the e-commerce platform.

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime

# File paths
customer_path = 'D:/DATA ANALYSIS/Recommnder/customer.csv'
transactions_path = 'D:/DATA ANALYSIS/Recommnder/transactions.csv'
prod_cat_info_path = 'D:/DATA ANALYSIS/Recommnder/prod_cat_info.csv'

# Load datasets
customers = pd.read_csv(customer_path)
transactions = pd.read_csv(transactions_path)
prod_cat_info = pd.read_csv(prod_cat_info_path)

# Convert date columns to datetime format
customers['DOB'] = pd.to_datetime(customers['DOB'], format='%d-%m-%Y')
transactions['tran_date'] = pd.to_datetime(transactions['tran_date'], dayfirst=True, errors='coerce')

# Handle negative values in 'Qty' and 'total_amt' (treat as returns)
transactions['Qty'] = transactions['Qty'].abs()
transactions['total_amt'] = transactions['total_amt'].abs()

# Merge datasets
merged_df = transactions.merge(customers, left_on='cust_id', right_on='customer_Id')
merged_df = merged_df.merge(prod_cat_info, on='prod_cat_code')

# Display the merged data
print(merged_df.head())


   transaction_id  cust_id  tran_date  prod_subcat_code  prod_cat_code  Qty  \
0     80712190438   270351 2014-02-28                 1              1    5   
1     80712190438   270351 2014-02-28                 1              1    5   
2     80712190438   270351 2014-02-28                 1              1    5   
3     80712190438   270351 2014-02-20                 1              1    5   
4     80712190438   270351 2014-02-20                 1              1    5   

   Rate    Tax  total_amt Store_type  customer_Id        DOB Gender  \
0  -772  405.3     4265.3     e-Shop       270351 1981-09-26      M   
1  -772  405.3     4265.3     e-Shop       270351 1981-09-26      M   
2  -772  405.3     4265.3     e-Shop       270351 1981-09-26      M   
3   772  405.3     4265.3     e-Shop       270351 1981-09-26      M   
4   772  405.3     4265.3     e-Shop       270351 1981-09-26      M   

   city_code  prod_cat  prod_sub_cat_code prod_subcat  
0        5.0  Clothing                  4 

In [5]:
# NOTE- Microsoft Visual C++ Build Tools are required to install the scikit-surprise library on Windows
pip install scikit-surprise

Note: you may need to restart the kernel to use updated packages.
Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml): started
  Building wheel for scikit-surprise (pyproject.toml): finished with status 'done'
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-win_amd64.whl size=1297495 sha256=884feb53b9a89eb45415a340afd2130b694c989ee3758f767057c5818308fe64
  Stored in directory: c:\users\hp\appdata\local\pip\cache\wheels\2a\8f\6e\7e2899163e2d85d826

In [None]:
# Step. 2 2. Recommendation Techniques
#Collaborative Filtering (SVD Model)

#We implement a Collaborative Filtering technique using Singular Value Decomposition (SVD). 
#This method helps predict missing ratings based on the patterns of past behavior between users and products.

In [4]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate

# Prepare data for collaborative filtering
reader = Reader(rating_scale=(0, merged_df['total_amt'].max()))
data = Dataset.load_from_df(merged_df[['cust_id', 'prod_subcat_code', 'total_amt']], reader)

# Split the data into training and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Train SVD model
model = SVD()
model.fit(trainset)

# Evaluate the model using cross-validation
cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    6018.77776021.86785982.02905986.29845992.64146000.322816.7040 
MAE (testset)     5689.12035692.74085646.86675652.68275660.58945668.400018.9395 
Fit time          1.11    1.00    0.97    0.98    1.01    1.01    0.05    
Test time         0.27    0.14    0.14    0.26    0.16    0.19    0.06    


{'test_rmse': array([6018.77765649, 6021.86775691, 5982.0290299 , 5986.29835362,
        5992.64141421]),
 'test_mae': array([5689.12034896, 5692.74076942, 5646.8666602 , 5652.68269992,
        5660.58936499]),
 'fit_time': (1.1141057014465332,
  1.0000813007354736,
  0.9740769863128662,
  0.9750781059265137,
  1.0100939273834229),
 'test_time': (0.2682201862335205,
  0.14301013946533203,
  0.13901090621948242,
  0.2630188465118408,
  0.15599846839904785)}

In [None]:
# Content-Based Filtering (TF-IDF and Cosine Similarity)
#We also implement Content-Based Filtering using TF-IDF vectorization to convert product descriptions into numerical form.
#We then calculate the cosine similarity to recommend similar products.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create product descriptions by combining category and subcategory
prod_cat_info['description'] = prod_cat_info['prod_cat'] + ' ' + prod_cat_info['prod_subcat']

# Vectorize the product descriptions
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(prod_cat_info['description'])

# Compute cosine similarity between products
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to recommend products based on a given product
def recommend_products(product_id, num_recommendations=5):
    idx = prod_cat_info.index[prod_cat_info['prod_cat_code'] == product_id][0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations + 1]
    product_indices = [i[0] for i in sim_scores]
    return prod_cat_info.iloc[product_indices]

# Example: Recommend products similar to product with ID 1
print(recommend_products(1))


    prod_cat_code  prod_cat  prod_sub_cat_code prod_subcat     description
1               1  Clothing                  1       Women  Clothing Women
3               2  Footwear                  1        Mens   Footwear Mens
2               1  Clothing                  3        Kids   Clothing Kids
11              4      Bags                  1        Mens       Bags Mens
4               2  Footwear                  3       Women  Footwear Women


In [None]:
#Step 3. Model Evaluation

#Finally, we evaluate the performance of the recommender system using various evaluation metrics 
#such as precision, recall, F1-score, and mean average precision (MAP).

In [11]:
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score
import numpy as np

# Get predictions from the collaborative filtering model
predictions = model.test(testset)

# Extract the actual and predicted ratings from the predictions
y_true = np.array([pred.r_ui for pred in predictions])  # Actual ratings from the test set
y_pred = np.array([pred.est for pred in predictions])  # Predicted ratings

# Convert ratings to binary values (1 if the predicted rating is above a threshold, otherwise 0)
threshold = 0.5  # You can adjust this threshold as needed based on your dataset
y_true_binary = (y_true > threshold).astype(int)
y_pred_binary = (y_pred > threshold).astype(int)

# Calculate Precision, Recall, F1-score, and Mean Average Precision (MAP)
precision = precision_score(y_true_binary, y_pred_binary)
recall = recall_score(y_true_binary, y_pred_binary)
f1 = f1_score(y_true_binary, y_pred_binary)
map_score = average_precision_score(y_true_binary, y_pred_binary)

# Print the evaluation results
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')
print(f'Mean Average Precision: {map_score:.4f}')


Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Mean Average Precision: 1.0000
