In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")


In [2]:
customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [3]:
products.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [4]:
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [5]:
# Merge datasets for analysis
merged_data = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")


In [6]:
# Feature Engineering
# 1. Total spend per customer
customer_spend = merged_data.groupby("CustomerID")["TotalValue"].sum().rename("TotalSpend")

# 2. Average transaction value per customer
avg_transaction_value = merged_data.groupby("CustomerID")["TotalValue"].mean().rename("AvgTransactionValue")

# 3. Purchase frequency (total transactions per customer)
purchase_frequency = merged_data.groupby("CustomerID").size().rename("PurchaseFrequency")


In [7]:
# 4. Preferred product category (most purchased category for each customer)
def preferred_category(group):
    return group.groupby("Category")["Quantity"].sum().idxmax()

In [8]:
preferred_category = merged_data.groupby("CustomerID").apply(preferred_category).rename("PreferredCategory")


In [9]:
# Combine all features into a single DataFrame
customer_features = pd.concat([customer_spend, avg_transaction_value, purchase_frequency, preferred_category], axis=1)

# One-hot encode the preferred category
customer_features = pd.get_dummies(customer_features, columns=["PreferredCategory"], drop_first=True)


In [10]:
# Normalize numerical features
scaler = StandardScaler()
numerical_features = ["TotalSpend", "AvgTransactionValue", "PurchaseFrequency"]
customer_features[numerical_features] = scaler.fit_transform(customer_features[numerical_features])


In [11]:
# Compute Cosine Similarity
similarity_matrix = cosine_similarity(customer_features)

# Convert similarity matrix to a DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features.index, columns=customer_features.index)



In [12]:
# Function to get top 3 similar customers for a given customer
def get_top_3_similar(customers, similarity_df):
    lookalike_map = {}
    for customer_id in customers:
        similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]  # Exclude self
        lookalike_map[customer_id] = list(zip(similar_customers.index, similar_customers.values))
    return lookalike_map

In [13]:
# Get top 3 lookalikes for customers C0001 to C0020
customer_ids = customers.loc[customers["CustomerID"].isin([f"C{str(i).zfill(4)}" for i in range(1, 21)]), "CustomerID"]
lookalike_map = get_top_3_similar(customer_ids, similarity_df)

In [14]:
# Save to Lookalike.csv
lookalike_data = []
for cust_id, lookalikes in lookalike_map.items():
    for similar_cust_id, score in lookalikes:
        lookalike_data.append({"CustomerID": cust_id, "SimilarCustomerID": similar_cust_id, "Score": score})

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv("Lookalike.csv", index=False)

In [15]:
# Output Lookalike Map
print("Lookalike Map:")
for cust_id, lookalikes in lookalike_map.items():
    print(f"{cust_id}: {lookalikes}")


Lookalike Map:
C0001: [('C0072', 0.9461051769363842), ('C0190', 0.9415577271671418), ('C0069', 0.9105496531690128)]
C0002: [('C0199', 0.9982365684360944), ('C0025', 0.9934620809563768), ('C0071', 0.973715079531913)]
C0003: [('C0178', 0.999865879448906), ('C0133', 0.98706169870118), ('C0052', 0.9754103448062152)]
C0004: [('C0108', 0.9827182639674003), ('C0113', 0.9785391786723804), ('C0012', 0.9654988139733864)]
C0005: [('C0197', 0.9953167594730895), ('C0130', 0.9502003454619035), ('C0035', 0.9500727303343954)]
C0006: [('C0117', 0.9984084372142995), ('C0185', 0.9538881367763729), ('C0011', 0.9367227677406286)]
C0007: [('C0085', 0.9998669294762257), ('C0120', 0.9949897643552797), ('C0140', 0.9798101409546568)]
C0008: [('C0109', 0.9698722815670315), ('C0093', 0.9365582499229672), ('C0098', 0.9307192939474349)]
C0009: [('C0077', 0.9997899920904881), ('C0032', 0.9974206107805481), ('C0033', 0.9817031601343669)]
C0010: [('C0029', 0.99957123050574), ('C0031', 0.9904300201174321), ('C0009', 0.