In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Directory that holds the three input CSV files. It is defined once so the
# notebook can be pointed at a different location by editing a single line;
# the default is the location the notebook was originally run against.
DATA_DIR = Path(r"C:\Users\MY\Downloads")

# Raw input tables, read as-is from disk.
customers = pd.read_csv(DATA_DIR / "Customers.csv")
products = pd.read_csv(DATA_DIR / "Products.csv")
transaction = pd.read_csv(DATA_DIR / "Transactions.csv")

In [3]:
# Enrich every transaction row with its customer and product attributes.
# Left joins keep every transaction even when a lookup row is missing.
# validate="many_to_one" makes pandas raise a MergeError if a CustomerID or
# ProductID appears more than once in its lookup table; without the check such
# duplicates would silently duplicate transactions and inflate the
# per-customer sums computed from this frame.
# Default suffixes are kept on purpose: the recorded output shows the
# overlapping Price column as Price_x / Price_y, and later cells use Price_y.
merged_data = (
    transaction
    .merge(customers, on="CustomerID", how="left", validate="many_to_one")
    .merge(products, on="ProductID", how="left", validate="many_to_one")
)


In [5]:
# Preview of the joined table. A Price column exists on both sides of one of
# the merges, so it shows up twice with pandas' default suffixes:
# Price_x (left-hand frame) and Price_y (right-hand frame).
merged_data

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,Jacob Holt,South America,2022-01-22,SoundWave Smartwatch,Electronics,459.86
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,Mrs. Kimberly Wright,North America,2024-04-07,SoundWave Smartwatch,Electronics,459.86
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,Tyler Haynes,North America,2024-09-21,SoundWave Smartwatch,Electronics,459.86
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,Joshua Hamilton,Asia,2024-11-11,SoundWave Smartwatch,Electronics,459.86


In [None]:
# Collapse the merged transaction rows into one feature row per customer.
customer_features = (
    merged_data
    .groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),  # total amount spent
        Quantity=("Quantity", "sum"),      # total number of units bought
        Price_y=("Price_y", "mean"),       # mean Price_y over the customer's rows
    )
    .reset_index()  # turn CustomerID back into an ordinary column
)

In [13]:
# One row per CustomerID: summed TotalValue, summed Quantity and mean Price_y.
customer_features

Unnamed: 0,CustomerID,TotalValue,Quantity,Price_y
0,C0001,3354.52,12,278.334000
1,C0002,1862.74,10,208.920000
2,C0003,2725.38,14,195.707500
3,C0004,5354.88,23,240.636250
4,C0005,2034.24,7,291.603333
...,...,...,...,...
194,C0196,4982.88,12,416.992500
195,C0197,1928.65,9,227.056667
196,C0198,931.83,3,239.705000
197,C0199,1979.28,9,250.610000


In [7]:
# Standardise the aggregated columns before measuring similarity. CustomerID is
# only an identifier, so it is dropped and never reaches the scaler.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(
    customer_features.drop(columns=["CustomerID"])
)

In [8]:
# Pairwise cosine similarity between every pair of rows of scaled_features
# (one row per customer), giving a square customer-by-customer matrix whose
# row/column order follows the row order of customer_features.
similarity_matrix = cosine_similarity(scaled_features)

In [16]:
# Sanity check: the matrix should be square, with one row and one column per
# row of customer_features (199 x 199 in the recorded run).
similarity_matrix.shape

(199, 199)

In [9]:
# Result container: customer ID -> list of (similar customer ID, similarity score).
# It is filled for the first 20 customers by the loop that follows.
recommendations = {}
# Customer IDs in customer_features row order, so position i in this list
# lines up with row/column i of similarity_matrix.
customer_ids = customer_features["CustomerID"].to_list()

In [10]:
# How many customers (taken from the start of customer_ids) get recommendations,
# and how many lookalikes are kept for each of them.
N_TARGET_CUSTOMERS = 20
TOP_K = 3

for idx, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    scores = similarity_matrix[idx]
    # Rank all customers by descending similarity; the stable sort keeps tied
    # scores in a deterministic (index) order.
    ranked = np.argsort(-scores, kind="stable")
    # Remove the customer's own index explicitly instead of assuming it sorts
    # first: another customer with an identical feature vector ties with it,
    # and floating-point rounding can even place another score above the
    # self-similarity, which would put the customer in its own lookalike list.
    similar_indices = ranked[ranked != idx][:TOP_K]
    recommendations[customer_id] = [(customer_ids[i], scores[i]) for i in similar_indices]


In [12]:
# Flatten the recommendations mapping into one row per
# (customer, similar customer, score) triple.
lookalike_data = [
    [source_id, neighbour_id, similarity]
    for source_id, neighbours in recommendations.items()
    for neighbour_id, similarity in neighbours
]

# Persist the table as the lookalike CSV (row index omitted).
lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=["CustomerID", "SimilarCustomerID", "SimilarityScore"],
)
lookalike_df.to_csv("Umang_Agarwal_Lookalike.csv", index=False)

# Show the recommendations (at most the first 60 rows).
print(lookalike_df.head(60))


   CustomerID SimilarCustomerID  SimilarityScore
0       C0001             C0103         0.997573
1       C0001             C0092         0.996879
2       C0001             C0135         0.992736
3       C0002             C0029         0.999854
4       C0002             C0077         0.996104
5       C0002             C0157         0.995478
6       C0003             C0111         0.998487
7       C0003             C0190         0.996656
8       C0003             C0038         0.990133
9       C0004             C0165         0.998390
10      C0004             C0162         0.998087
11      C0004             C0075         0.996932
12      C0005             C0167         0.999972
13      C0005             C0020         0.999714
14      C0005             C0128         0.998762
15      C0006             C0168         0.997612
16      C0006             C0196         0.995025
17      C0006             C0187         0.994752
18      C0007             C0125         0.999849
19      C0007       