In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Merge datasets. Both joins are left joins, so every transaction row is kept
# and the customer/product columns are NaN where an ID has no match.
data = transactions.merge(customers, on="CustomerID", how="left").merge(products, on="ProductID", how="left")


def _most_common(values):
    """Return the most frequent non-null value of ``values``, or NaN if none exists.

    ``Series.mode()`` ignores nulls, so it comes back empty when every value in
    the group is missing; indexing it unconditionally would raise. When several
    values tie, the first one ``mode()`` returns is used.
    """
    modes = values.mode()
    if modes.empty:
        return float("nan")
    return modes.iloc[0]


# Feature engineering: Aggregating customer-level data (one row per CustomerID)
customer_features = data.groupby("CustomerID").agg({
    'TotalValue': 'sum',           # Total spend
    'ProductID': 'nunique',        # Unique products purchased
    'Category': _most_common,      # Most common category (NaN if none is known)
    'Region': 'first'              # Region of the customer
}).reset_index()





In [None]:
# Encoding categorical variables (Region and Category); the remaining numeric
# columns (TotalValue, ProductID count) are passed through unchanged.
# sparse_threshold=0 forces a dense result: OneHotEncoder emits a sparse matrix,
# and the default threshold would keep the stacked output sparse when it has few
# non-zeros, which StandardScaler (with its default mean-centering) rejects.
encoder = ColumnTransformer([
    ('onehot', OneHotEncoder(), ['Region', 'Category'])
], remainder='passthrough', sparse_threshold=0)

# Standardizing the encoded matrix. Note this runs after the encoder, so it
# rescales every column, including the one-hot indicator columns.
scaler = StandardScaler()

pipeline = Pipeline([
    ('encoder', encoder),
    ('scaler', scaler)
])

customer_data = pipeline.fit_transform(customer_features[['Region', 'Category', 'TotalValue', 'ProductID']])

# Compute similarity matrix: entry [i, j] is the cosine similarity between the
# feature rows of customer i and customer j (row order follows customer_features).
similarity_matrix = cosine_similarity(customer_data)

In [None]:

# Extract top 3 lookalikes for each customer.
# `lookalikes` maps CustomerID -> list of (other CustomerID, similarity score),
# ordered from most to least similar.
customer_ids = customer_features['CustomerID'].tolist()
lookalikes = {}

for idx, customer_id in enumerate(customer_ids[:20]):  # Limit to first 20 customers
    # Drop the customer's own entry by position instead of assuming it sorts
    # first: another customer with an identical (or rounding-equal) score and a
    # lower index would otherwise sort ahead of it, and the customer would then
    # be reported as its own lookalike.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # Stable sort: ties keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    lookalikes[customer_id] = [
        (customer_ids[other_idx], score) for other_idx, score in candidates[:3]
    ]



In [None]:
# Create Lookalike.csv: one row per customer, with a 'Lookalikes' column holding
# a list of {'cust_id', 'score'} dicts. pandas writes that list as its string
# form, so scores are cast to built-in floats first; NumPy scalars would
# otherwise be rendered as "np.float64(...)" under NumPy 2.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalikes.keys()),
    'Lookalikes': [
        [{'cust_id': match_id, 'score': float(score)} for match_id, score in matches]
        for matches in lookalikes.values()
    ]
})

lookalike_df.to_csv("Lookalike.csv", index=False)
print("Lookalike.csv created successfully!")

Lookalike.csv created successfully!
