In [6]:
import pandas as pd

# Read the three source tables (customers, transactions, products) from CSV
# files located in the notebook's working directory, in that order.
customers, transactions, products = map(
    pd.read_csv,
    ("Customers.csv", "Transactions.csv", "Products.csv"),
)

# Show the first five rows of each table, one table after another.
print(customers.head(), transactions.head(), products.head(), sep="\n")


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price  
0      300.68  300.68  
1      300.68  300.68  
2      300.68  300.68  
3      601.36  300.68  
4      902.04  300.68  
  ProductID              ProductName     Category   Price
0      P001

In [7]:
# Re-read the same three CSV files that the first cell of this notebook loads,
# rebinding the frames to fresh copies from disk.
customers, transactions, products = (
    pd.read_csv("Customers.csv"),
    pd.read_csv("Transactions.csv"),
    pd.read_csv("Products.csv"),
)


In [8]:
# Aggregate transaction data and build customer_profile

# One row per CustomerID: summed TotalValue and the number of transactions.
customer_spending = transactions.groupby('CustomerID').agg(
    TotalSpending=('TotalValue', 'sum'),
    TransactionCount=('TransactionID', 'count'),
)

# Left-join onto the customer table so every customer is kept, including
# those that have no rows in the transactions table.
customer_profile = customers.merge(customer_spending, on='CustomerID', how='left')

# Customers without transactions come out of the join as NaN; treat them as
# zero spending and zero transactions.
customer_profile = customer_profile.fillna({'TotalSpending': 0, 'TransactionCount': 0})

print(customer_profile.head())


  CustomerID        CustomerName         Region  SignupDate  TotalSpending  \
0      C0001    Lawrence Carroll  South America  2022-07-10        3354.52   
1      C0002      Elizabeth Lutz           Asia  2022-02-13        1862.74   
2      C0003      Michael Rivera  South America  2024-03-07        2725.38   
3      C0004  Kathleen Rodriguez  South America  2022-10-09        5354.88   
4      C0005         Laura Weber           Asia  2022-08-15        2034.24   

   TransactionCount  
0               5.0  
1               4.0  
2               4.0  
3               8.0  
4               3.0  


In [10]:
# Normalization
from sklearn.preprocessing import MinMaxScaler

# Numeric columns that feed the similarity computation
features = ['TotalSpending', 'TransactionCount']

# Learn each column's min and max, then rescale with MinMaxScaler's default
# settings. The scaled values overwrite the raw totals in customer_profile,
# so from here on these two columns hold normalized values.
scaler = MinMaxScaler().fit(customer_profile[features])
customer_profile[features] = scaler.transform(customer_profile[features])

print(customer_profile.head())


  CustomerID        CustomerName         Region  SignupDate  TotalSpending  \
0      C0001    Lawrence Carroll  South America  2022-07-10       0.314274   
1      C0002      Elizabeth Lutz           Asia  2022-02-13       0.174514   
2      C0003      Michael Rivera  South America  2024-03-07       0.255332   
3      C0004  Kathleen Rodriguez  South America  2022-10-09       0.501681   
4      C0005         Laura Weber           Asia  2022-08-15       0.190581   

   TransactionCount  
0          0.454545  
1          0.363636  
2          0.363636  
3          0.727273  
4          0.272727  


In [11]:
# Compute Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Pairwise cosine similarity between the feature rows of all customers.
# NOTE(review): the printed head shows values that are all close to 1; with
# only two non-negative features cosine similarity may separate customers
# poorly -- confirm this metric is the intended one.
customer_features = customer_profile[features].to_numpy()
similarity_matrix = cosine_similarity(customer_features)

# Label both axes with CustomerID so an entry can be looked up by customer pair.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_profile['CustomerID'],
    columns=customer_profile['CustomerID'],
)

print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.987625  0.999974  0.999999  0.999988  0.975210   
C0002       0.987625  1.000000  0.986463  0.987794  0.986831  0.928438   
C0003       0.999974  0.986463  1.000000  0.999965  0.999997  0.976787   
C0004       0.999999  0.987794  0.999965  1.000000  0.999982  0.974971   
C0005       0.999988  0.986831  0.999997  0.999982  1.000000  0.976301   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.992783  0.981957  0.953490  0.982319  ...  0.998677  0.993458   
C0002       0.961689  0.999463  0.988964  0.999524  ...  0.994383  0.999074   
C0003       0.993626  0.980562  0.951282  0.980937  ...  0.998278  0.992605   
C0004       0.992653  0.982161  0.953814  0.982520  ...  0.998732  0.993581   
C0005  