In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Directory holding the three input CSVs. Kept in one place so the notebook can
# be pointed at another location by editing a single line.
DATA_DIR = Path(r"C:\SNU\Sem 7\z_assignment")

# Load the raw input tables. File names (including the lower-case
# "products.csv") are kept exactly as they exist on disk.
customers = pd.read_csv(DATA_DIR / "Customers.csv")
transactions = pd.read_csv(DATA_DIR / "Transactions.csv")
products = pd.read_csv(DATA_DIR / "products.csv")

In [3]:
# Give the two "Price" columns distinct names before merging, so the merged
# frame carries "CostPrice" (from products) and "SellingPrice" (from
# transactions) instead of pandas' automatic _x/_y suffixes.
# NOTE(review): presumes both tables actually have a "Price" column; rename()
# silently does nothing for a missing label.
products = products.rename(columns={"Price": "CostPrice"})
transactions = transactions.rename(columns={"Price": "SellingPrice"})

# One row per transaction, enriched with customer and product attributes.
# Inner joins drop transactions whose CustomerID / ProductID has no match.
merged_data = (
    transactions
    .merge(customers, on="CustomerID", how="inner")
    .merge(products, on="ProductID", how="inner")
)

In [4]:
# Collapse the transaction-level rows into one feature row per customer:
#   TotalValue - sum over all of the customer's transactions
#   Quantity   - sum over all of the customer's transactions
#   Category   - list of every purchased category (duplicates kept)
#   Region     - first value seen (assumes one region per customer)
per_customer = merged_data.groupby('CustomerID')
customer_features = per_customer.agg(
    TotalValue=('TotalValue', 'sum'),
    Quantity=('Quantity', 'sum'),
    Category=('Category', list),
    Region=('Region', 'first'),
).reset_index()

In [5]:
# Step 1: Standardize the numeric features with StandardScaler
numeric_features = ['TotalValue', 'Quantity']
scaler = StandardScaler()
normalized_numeric = scaler.fit_transform(customer_features[numeric_features])

# Step 2: Pairwise Canberra distance between customers on the standardized features
canberra_distance = pairwise_distances(normalized_numeric, metric='canberra')

# Rescale the distance by its maximum and flip it, giving a value in [0, 1]
# where 1 means "identical". Despite its name, `normalized_canberra` therefore
# holds a similarity, which is what the later weighted combination expects.
max_canberra = np.max(canberra_distance)
if max_canberra > 0:
    normalized_canberra = 1 - (canberra_distance / max_canberra)
else:
    # Every pairwise distance is 0 (e.g. a single customer, or all customers
    # share the same numeric profile). Dividing by 0 would fill the matrix
    # with NaN, so treat every pair as fully similar instead.
    normalized_canberra = np.ones_like(canberra_distance)

In [6]:
# Process Categorical Features
# Distinct categories across all customers. Sorted (by their string form) so
# the indicator columns come out in the same order on every run; iterating a
# plain set of strings can yield a different order from one interpreter
# session to the next.
categories = sorted(
    {cat for purchased in customer_features['Category'] for cat in purchased},
    key=str,
)

# One 0/1 indicator column per category: 1 if the customer bought from it at
# least once. The columns are added to customer_features itself.
for category in categories:
    customer_features[category] = customer_features['Category'].apply(
        lambda purchased: 1 if category in purchased else 0
    )

binary_matrix = customer_features[categories]

# Convert to a boolean NumPy array and compute Jaccard similarity
# (1 - Jaccard distance) between every pair of customers.
binary_matrix_np = binary_matrix.to_numpy().astype(bool)
jaccard_similarity = 1 - pairwise_distances(binary_matrix_np, metric='jaccard')

In [7]:
# Combine both similarity matrices with weights
w1, w2 = 0.5, 0.5  # Adjust weights as needed
final_similarity = w1 * normalized_canberra + w2 * jaccard_similarity

# Step 4: Recommend Top 3 Lookalikes for Each Customer
customer_ids = customer_features['CustomerID'].tolist()
lookalike_results = {}

# First 20 customers in customer_features order.
# NOTE(review): expected to be C0001 - C0020, but customers without any
# transaction are dropped by the inner merge, so confirm against the data.
for idx, customer_id in enumerate(customer_ids[:20]):
    # Drop the customer itself by position rather than assuming it sorts first:
    # another customer can tie with the self-similarity score, and the stable
    # sort would then keep whichever of the two has the lower index in front.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(final_similarity[idx])
        if other_idx != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # float(...) turns the NumPy scalar into a plain Python float so that the
    # str() below writes "0.5" rather than "np.float64(0.5)" on NumPy >= 2.
    top_3 = [(customer_ids[i], round(float(score), 3)) for i, score in candidates[:3]]
    lookalike_results[customer_id] = top_3

# Step 5: Create Lookalike CSV
# One row per customer; the lookalike list is stored as its string form,
# e.g. "[('C0002', 0.9), ...]".
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_results.keys()),
    'Lookalikes': [str(value) for value in lookalike_results.values()]
})

In [9]:
# Write the lookalike table to disk (without the index column) and confirm.
output_csv = 'Vaishnavi_Bhuvanagiri_Lookalike.csv'
lookalike_df.to_csv(output_csv, index=False)
print(f"{output_csv} has been created successfully!")

Vaishnavi_Bhuvanagiri_Lookalike.csv has been created successfully!
