In [75]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load data

In [78]:
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Processing customer data

In [79]:
def process_customers(customers):
    # Calculate customer tenure in days
    customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
    today = datetime.today()
    customers['TenureDays'] = (today - customers['SignupDate']).dt.days

    # One-hot-encoding the region
    customers = pd.get_dummies(customers, columns=['Region'], drop_first=True)
    return customers

customers_df = process_customers(customers_df)

# Aggregating Transactions Data

In [80]:
def aggregate_transactions(transactions, products):
    # Merge transactions with product categories
    transactions = pd.merge(transactions, products[['ProductID', 'Category']], on='ProductID', how='left')

    # Total spend per customer
    spend = transactions.groupby('CustomerID')['TotalValue'].sum().rename('TotalSpend')

    # Total transaction count per customer
    frequency = transactions.groupby('CustomerID')['TransactionID'].count().rename('TransactionCount')

    # Spend breakdown by category
    category_spend = transactions.groupby(['CustomerID', 'Category'])['TotalValue'].sum().unstack(fill_value=0)

    # Combine all transaction-based features
    return pd.concat([spend, frequency, category_spend], axis=1).reset_index()

transaction_features = aggregate_transactions(transactions_df, products_df)

# Merging Customer and Transaction Data

In [81]:
def merge_features(customers, transactions):
    # Merge customer and transaction features
    features = pd.merge(customers, transactions, on='CustomerID', how='left')
    features.fillna(0, inplace=True)  # Handle missing transaction data
    return features

customer_features = merge_features(customers_df, transaction_features)

# Normalize Features for Similarity Calculation

In [82]:
def normalize_features(features):
    # Drop non-numerical or identifier columns for normalization
    numeric_data = features.drop(columns=['CustomerID', 'CustomerName', 'SignupDate'])
    scaler = StandardScaler()
    normalized = scaler.fit_transform(numeric_data)
    return pd.DataFrame(normalized, index=features['CustomerID'], columns=numeric_data.columns)

normalized_features = normalize_features(customer_features)

# Computing Similarity

In [83]:
def compute_similarity(normalized_features):
    # Cosine similarity between customers
    similarity_matrix = cosine_similarity(normalized_features)
    return pd.DataFrame(similarity_matrix, index=normalized_features.index, columns=normalized_features.index)

similarity_df = compute_similarity(normalized_features)

# Extracting Top 3 Lookalikes and Saving Lookalikes to csv file

In [90]:
def get_top_lookalikes(similarity_df, top_n=3):
    lookalike_map = {}
    for customer_id in similarity_df.index:
        # Get top-N most similar customers (exclude the customer itself)
        similar_customers = similarity_df.loc[customer_id].sort_values(ascending=False).iloc[1:top_n+1]
        lookalike_map[customer_id] = list(similar_customers.items())
    return lookalike_map

lookalike_map = get_top_lookalikes(similarity_df)
save_lookalikes(lookalike_map)

Lookalike recommendations saved to Lookalike.csv.


In [85]:
def save_lookalikes(lookalike_map, output_file='Lookalike.csv'):
    # Convert map to a DataFrame and save
    lookalike_df = pd.DataFrame([
        {'cust_id': k, 'lookalikes': v} for k, v in lookalike_map.items()
    ])
    lookalike_df.to_csv(output_file, index=False)
    print(f"Lookalike recommendations saved to {output_file}.")

# Generating Insights

In [87]:
def generate_insights(features):
    # Find top spenders
    top_spenders = features[['CustomerID', 'TotalSpend']].sort_values(by='TotalSpend', ascending=False).head(5)
    print("\nTop 5 Customers by Spend:")
    print(top_spenders)

    # Find most popular categories
    category_columns = [col for col in features.columns if col not in ['CustomerID', 'CustomerName', 'SignupDate', 'TenureDays', 'TotalSpend', 'TransactionCount']]
    popular_categories = features[category_columns].mean().sort_values(ascending=False)
    print("\nMost Popular Categories by Average Spend:")
    print(popular_categories)

In [88]:
generate_insights(customer_features)


Top 5 Customers by Spend:
    CustomerID  TotalSpend
140      C0141    10673.87
53       C0054     8040.39
64       C0065     7663.70
155      C0156     7634.45
81       C0082     7572.91

Most Popular Categories by Average Spend:
Books                   960.73735
Electronics             903.91750
Clothing                830.85330
Home Decor              754.46965
Region_South America      0.29500
Region_Europe             0.25000
Region_North America      0.23000
dtype: float64
