# 1. Data Preprocessing

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load datasets
# All three CSVs are read from the same folder. The location is kept in a single
# constant so the notebook can be pointed at another copy of the data by editing
# one line instead of three.
DATA_DIR = 'C:/Users/Tashm/Datasets/Zeotap_DS_Assignment'

customers_df = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products_df = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions_df = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

In [3]:
# Aggregate transaction data: one row per CustomerID with the summed TotalValue,
# the summed Quantity, and the number of distinct ProductIDs purchased.
transaction_agg = transactions_df.groupby('CustomerID').agg(
    TotalSpending=('TotalValue', 'sum'),
    TotalQuantity=('Quantity', 'sum'),
    UniqueProducts=('ProductID', 'nunique')
).reset_index()

# Merge with customer data. The left join keeps customers that have no rows in
# transaction_agg; their aggregate columns come back as NaN and are zero-filled.
# Only the aggregate columns are filled: a frame-wide fillna(0) would also replace
# missing customer attributes (e.g. Region) with the number 0, which would later
# be one-hot encoded as if it were a real category.
customer_data = (
    customers_df
    .merge(transaction_agg, on='CustomerID', how='left')
    .fillna({'TotalSpending': 0, 'TotalQuantity': 0, 'UniqueProducts': 0})
)

# 2. Feature Engineering

In [4]:
# Select features for similarity calculation
# One-hot encode Region; drop_first=True omits the indicator column of the first
# category, so that category is represented by all remaining indicators being 0.
customer_data_encoded = pd.get_dummies(customer_data, columns=['Region'], drop_first=True)

# Pick up the generated indicator columns by prefix. A substring test
# ('Region_' in col) would also match unrelated columns whose name merely
# contains that text.
region_columns = [col for col in customer_data_encoded.columns if col.startswith('Region_')]
features = ['TotalSpending', 'TotalQuantity', 'UniqueProducts'] + region_columns

# Standardize features so each column has zero mean and unit variance before
# similarities are computed.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_data_encoded[features])


# 3. Compute Similarity

In [5]:
# Cosine similarity between every pair of rows in the standardized feature matrix.
similarity_matrix = cosine_similarity(scaled_features)

# Label both axes with CustomerID so a score can be looked up by a pair of IDs.
id_axis = customer_data['CustomerID']
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=id_axis,
    columns=id_axis,
)


# 4. Generate Lookalikes

In [6]:
lookalike_map = {}

# Find top 3 similar customers for each of the first 20 rows of customer_data
# (C0001–C0020, assuming the customer file is ordered by CustomerID — TODO confirm).
for customer_id in customer_data['CustomerID'][:20]:
    # Remove the customer's own entry by label rather than by skipping the first
    # sorted position: when another customer ties with the self-similarity score,
    # the sort order does not guarantee that the customer comes first, and a
    # positional skip would then drop a real lookalike and keep the customer itself.
    similar_customers = (
        similarity_df.loc[customer_id]
        .drop(customer_id)
        .nlargest(3)
    )
    # Store scores as plain Python floats so that str() of the list below renders
    # bare numbers instead of NumPy scalar reprs.
    lookalike_map[customer_id] = [
        (other_id, float(score)) for other_id, score in similar_customers.items()
    ]

# Create Lookalike.csv: one row per customer, with the list of
# (lookalike CustomerID, similarity score) pairs serialized as a string.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_map.keys()),
    'Lookalikes': [str(v) for v in lookalike_map.values()]
})
lookalike_df.to_csv('Lookalike.csv', index=False)
