In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

In [3]:
# Read the three input tables from the working directory
# (adjust the file names below if the CSVs live elsewhere).
customers, products, transactions = map(
    pd.read_csv,
    ('Customers.csv', 'Products.csv', 'Transactions.csv'),
)

In [4]:
# Attach customer and product attributes to every transaction row.
# Both joins are inner joins (the pandas default), so transactions whose
# CustomerID or ProductID has no match are dropped.
merged = (
    transactions
    .merge(customers, how='inner', on='CustomerID')
    .merge(products, how='inner', on='ProductID')
)

In [5]:
# Feature engineering: collapse each customer's transaction history into one row.
# 'nunique' is the built-in distinct count: it avoids a Python-level callback per
# group and does not count missing values as a distinct product/category.
customer_features = merged.groupby('CustomerID').agg({
    'TotalValue': 'sum',     # Total spend
    'Quantity': 'sum',       # Total quantity bought
    'ProductID': 'nunique',  # Number of unique products purchased
    'Category': 'nunique',   # Number of unique categories purchased
}).reset_index()

In [6]:
# Give the aggregated columns descriptive names (positional, one per column).
customer_features.columns = [
    "CustomerID",
    "TotalSpend",
    "TotalQuantity",
    "UniqueProducts",
    "UniqueCategories",
]

In [7]:
# Standardize the numeric features before computing similarities.
# The first column (CustomerID) is an identifier, so it is left out.
numeric_feature_names = customer_features.columns[1:]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features[numeric_feature_names])

In [8]:
# Pairwise cosine similarity between the scaled feature rows:
# entry [i, j] compares row i with row j of scaled_features.
similarity_matrix = cosine_similarity(X=scaled_features, Y=None)

In [9]:
# Function to find the most similar customers (top 3 by default) for a given customer ID
def get_top_3_similar(customers_df, similarity_matrix, customer_id, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Frame with a ``CustomerID`` column. Row *i* of this frame must
        correspond to row/column *i* of ``similarity_matrix``.
    similarity_matrix : array-like of shape (n_customers, n_customers)
        Pairwise similarity scores (higher means more similar).
    customer_id : object
        Value to look up in ``customers_df['CustomerID']``.
    top_n : int, default 3
        Maximum number of similar customers to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerID`` and ``Score``, ordered from most to least
        similar. Rows keep their index labels from ``customers_df``. Fewer
        than ``top_n`` rows are returned when there are not enough other
        customers.

    Raises
    ------
    IndexError
        If ``customer_id`` does not occur in ``customers_df['CustomerID']``.
    """
    # Locate the customer by *position*, not by index label, so the lookup
    # stays aligned with the matrix even if the frame has a non-default index.
    id_positions = np.flatnonzero((customers_df['CustomerID'] == customer_id).to_numpy())
    if id_positions.size == 0:
        raise IndexError(
            "CustomerID {!r} not found in customers_df".format(customer_id)
        )
    customer_pos = id_positions[0]

    similarity_scores = np.asarray(similarity_matrix[customer_pos])

    # Rank everyone by descending score (stable, so ties keep row order), then
    # drop the customer itself explicitly. Simply skipping the first ranked
    # entry is wrong when another customer ties with the self-similarity.
    ranked_positions = np.argsort(-similarity_scores, kind='stable')
    similar_positions = ranked_positions[ranked_positions != customer_pos][:top_n]

    # .copy() makes the result an independent frame, so adding a column does
    # not raise SettingWithCopyWarning.
    similar_customers = customers_df.iloc[similar_positions][['CustomerID']].copy()
    similar_customers['Score'] = similarity_scores[similar_positions]
    return similar_customers

In [10]:
# Generate lookalike results for the first 20 customers.
# customer_features is built from an inner merge with the transactions, so a
# customer listed in Customers.csv without any transaction has no feature row.
# Such customers get an empty lookalike list instead of crashing the loop.
customers_with_features = set(customer_features['CustomerID'])
lookalike_results = {}
for customer_id in customers['CustomerID'][:20]:
    if customer_id not in customers_with_features:
        lookalike_results[customer_id] = []
        continue
    top_3 = get_top_3_similar(customer_features, similarity_matrix, customer_id)
    # Each entry is a [CustomerID, Score] pair.
    lookalike_results[customer_id] = top_3.values.tolist()

In [11]:
# Write one row per customer to Lookalike.csv; the Lookalikes column holds
# that customer's list of [CustomerID, Score] pairs.
lookalike_rows = []
for source_id, matches in lookalike_results.items():
    lookalike_rows.append({"CustomerID": source_id, "Lookalikes": matches})

lookalike_df = pd.DataFrame(lookalike_rows)
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike model completed. Results saved to Lookalike.csv.")

Lookalike model completed. Results saved to Lookalike.csv.
