In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import csv
from datetime import datetime

# Load the three source tables from the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Attach the product columns to every transaction row (inner join on
# ProductID), so later steps can group spending by product attributes.
enriched_transactions = pd.merge(transactions, products, on='ProductID')

# Feature Engineering
# 1. Basic customer features: parse the date columns in place so that
#    datetime arithmetic and min/max aggregation work on them.
for frame, date_column in ((customers, 'SignupDate'), (transactions, 'TransactionDate')):
    frame[date_column] = pd.to_datetime(frame[date_column])

# 2. Customer purchase metrics: one row per CustomerID summarising spend,
#    order count, quantities and the first/last purchase dates.
metric_specs = {
    'TotalSpent': pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    'AvgOrderValue': pd.NamedAgg(column='TotalValue', aggfunc='mean'),
    'TransactionCount': pd.NamedAgg(column='TransactionID', aggfunc='count'),
    'TotalQuantity': pd.NamedAgg(column='Quantity', aggfunc='sum'),
    'AvgQuantityPerOrder': pd.NamedAgg(column='Quantity', aggfunc='mean'),
    'LastPurchaseDate': pd.NamedAgg(column='TransactionDate', aggfunc='max'),
    'FirstPurchaseDate': pd.NamedAgg(column='TransactionDate', aggfunc='min'),
}
customer_metrics = transactions.groupby('CustomerID').agg(**metric_specs)
# Turn the CustomerID group key back into an ordinary column.
customer_metrics = customer_metrics.reset_index()


# 3. Category preferences: share of each customer's spend per category,
#    expressed on a 0-100 scale.
spend_by_category = enriched_transactions.groupby(['CustomerID', 'Category'])['TotalValue'].sum()
# Rows are customers, columns are categories; unseen combinations become 0.
category_pivot = spend_by_category.unstack(fill_value=0)
customer_totals = category_pivot.sum(axis=1)
category_percentages = category_pivot.div(customer_totals, axis=0) * 100

# 4. Time-based features
# Day counts are measured against a fixed reference date. Dates later than
# the reference yield negative counts; the features are standardised before
# the similarity step, so a constant shift of the reference date does not
# change the scaled values.
reference_date = pd.Timestamp('2024-01-01')
customer_metrics['DaysSinceLastPurchase'] = (
    reference_date - customer_metrics['LastPurchaseDate']
).dt.days

# Look up each customer's signup date by CustomerID. Assigning
# `reference_date - customers['SignupDate']` directly would align on the
# positional row index, and `customer_metrics` (sorted by the groupby, and
# holding only customers that have transactions) does not share row order
# with `customers`, so account ages would land on the wrong customers.
signup_by_customer = (
    customers.drop_duplicates(subset='CustomerID')
    .set_index('CustomerID')['SignupDate']
)
signup_dates = pd.to_datetime(customer_metrics['CustomerID'].map(signup_by_customer))
customer_metrics['AccountAge'] = (reference_date - signup_dates).dt.days

# Days between first and last purchase divided by the number of
# transactions; clipped so the value can never be negative.
purchase_span_days = (
    customer_metrics['LastPurchaseDate'] - customer_metrics['FirstPurchaseDate']
).dt.days
customer_metrics['PurchaseFrequencyDays'] = (
    purchase_span_days / customer_metrics['TransactionCount']
).clip(lower=0)

# 5. Create final customer features
# Inner-join the profile table with the purchase metrics, left-join the
# category shares, then replace every remaining NaN in the frame with 0.
customer_features = (
    customers
    .merge(customer_metrics, on='CustomerID')
    .merge(category_percentages, on='CustomerID', how='left')
    .fillna(0)
)

# One-hot encode region (produces one `Region_<value>` column per region).
customer_features = pd.get_dummies(customer_features, columns=['Region'])

# Select features for similarity calculation: the numeric purchase metrics,
# one column per product category, and the one-hot region indicators.
numeric_features = [
    'TotalSpent', 'AvgOrderValue', 'TransactionCount', 'TotalQuantity',
    'AvgQuantityPerOrder', 'DaysSinceLastPurchase', 'AccountAge',
    'PurchaseFrequencyDays'
]
region_features = [name for name in customer_features.columns if name.startswith('Region_')]
feature_columns = [*numeric_features, *category_percentages.columns, *region_features]

# Standardise the selected feature columns so that large-valued features
# do not dominate the similarity computation.
scaler = StandardScaler()
feature_matrix = customer_features[feature_columns]
scaled_features = scaler.fit(feature_matrix).transform(feature_matrix)

# Pairwise cosine similarity between all customers; entry [i, j] compares
# row i with row j of `customer_features`.
similarity_matrix = cosine_similarity(scaled_features)

# Generate recommendations
# For every customer, keep the three most similar *other* customers as
# (CustomerID, score rounded to 2 decimals) pairs.
customer_ids = customer_features['CustomerID'].tolist()
lookalikes = {}
for idx, target_id in enumerate(customer_ids):
    scores = similarity_matrix[idx]
    # Rank by descending similarity; the stable sort keeps tied customers
    # in their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the customer itself by index rather than by dropping the
    # top-ranked entry: when another customer ties with the self-score,
    # self is not guaranteed to be ranked first.
    top_matches = ranked[ranked != idx][:3]
    lookalikes[target_id] = [
        (customer_ids[match], round(float(scores[match]), 2))
        for match in top_matches
    ]

# Write one row per customer: the customer's ID followed by the ID and
# score of each lookalike, in rank order.
header = ['CustomerID', 'Lookalike1', 'Score1', 'Lookalike2', 'Score2', 'Lookalike3', 'Score3']
with open('Lookalike.csv', 'w', newline='') as out_file:
    csv_writer = csv.writer(out_file)
    csv_writer.writerow(header)
    csv_writer.writerows(
        [customer_id, *(value for match in matches for value in (match[0], match[1]))]
        for customer_id, matches in lookalikes.items()
    )

# Report which columns were fed into the similarity calculation.
report_lines = ["Features used in similarity calculation:"]
report_lines.extend(f"- {feature_name}" for feature_name in feature_columns)
print("\n".join(report_lines))


Features used in similarity calculation:
- TotalSpent
- AvgOrderValue
- TransactionCount
- TotalQuantity
- AvgQuantityPerOrder
- DaysSinceLastPurchase
- AccountAge
- PurchaseFrequencyDays
- Books
- Clothing
- Electronics
- Home Decor
- Region_Asia
- Region_Europe
- Region_North America
- Region_South America
