In [131]:
import numpy as np
import pandas as pd

In [132]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [133]:
merged_data = pd.read_csv('merged_data.csv')

In [134]:
merged_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y,Month
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68,2024-08
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68,2024-05
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68,2024-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68,2024-03
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68,2024-03


In [137]:
customer_features = merged_data.groupby('CustomerID').agg({'TotalValue': 'sum',
                                                            'Quantity': 'sum',
                                                            'Category': lambda x: x.mode()[0],
                                                            'Region': 'first'
                                                        }).reset_index()

In [140]:
customer_features = pd.get_dummies(customer_features, columns=['Category', 'Region'], drop_first=True)

In [142]:
customer_features.head()

Unnamed: 0,CustomerID,TotalValue,Quantity,Category_Clothing,Category_Electronics,Category_Home Decor,Region_Europe,Region_North America,Region_South America
0,C0001,3354.52,12,False,True,False,False,False,True
1,C0002,1862.74,10,True,False,False,False,False,False
2,C0003,2725.38,14,False,False,True,False,False,True
3,C0004,5354.88,23,False,False,False,False,False,True
4,C0005,2034.24,7,False,True,False,False,False,False


In [143]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(customer_features.drop(columns=['CustomerID']))
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])

In [148]:
similarity_matrix.shape

(199, 199)

In [154]:
similarity_matrix[0]

array([1.        , 0.99999816, 0.99999867, 0.99999969, 0.99999993,
       0.99999968, 0.99999984, 0.99999925, 0.99999864, 0.99999376,
       0.99999995, 0.99999993, 0.99999977, 0.99998642, 0.99999953,
       0.99999935, 0.99999954, 0.99999987, 0.99999875, 0.99999901,
       0.99999981, 0.99999983, 0.99999952, 0.99999983, 0.99999864,
       0.9999999 , 0.9999993 , 0.99999994, 0.99999755, 0.9999986 ,
       0.99999882, 0.99999183, 0.99987697, 0.9999979 , 0.99999984,
       0.99999954, 0.99999982, 0.999999  , 0.99999977, 0.99999932,
       0.99999988, 0.99999915, 0.99999842, 0.99999887, 0.99999992,
       0.99999988, 0.9999994 , 0.99999989, 0.99999264, 0.99999943,
       0.99999964, 0.99999977, 0.99999986, 0.99999977, 0.9999997 ,
       0.99999984, 0.99999989, 0.99999299, 0.99999971, 0.99963837,
       0.9999994 , 0.99999869, 0.99999965, 0.99999982, 0.99999989,
       0.99999919, 0.99999984, 0.99999993, 0.99999989, 0.99999983,
       0.99999886, 0.99999961, 0.99999898, 0.9999994 , 0.99999

In [167]:
lookalikes = {}
for customer in customer_features['CustomerID'][:20]:
    similar_customers = similarity_df[customer].sort_values(ascending=False).iloc[1:4]
    lookalikes[customer] = list(zip(similar_customers.index, similar_customers.values))

In [155]:
lookalike_df = pd.DataFrame({
    'CustomerID': lookalikes.keys(),
    'Lookalikes': [str(v) for v in lookalikes.values()]
})
lookalike_df.to_csv("Vamsi_Renumala_Lookalike.csv", index=False)