# In [4]:  (Jupyter notebook cell marker left over from export)
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist

def load_data(
    customers_path="C:\\Users\\chatu\\Downloads\\Customers.csv",
    products_path="C:\\Users\\chatu\\Downloads\\Products.csv",
    transactions_path="C:\\Users\\chatu\\Downloads\\Transactions.csv",
):
    """Load the customer, product and transaction datasets.

    Each argument is handed directly to ``pd.read_csv``, so it may be a
    filesystem path or any other source ``pd.read_csv`` accepts (for example
    an open file-like object). The defaults are the original hard-coded
    locations, so calling ``load_data()`` with no arguments is unchanged.

    Args:
        customers_path: Source of the customers CSV; must contain a
            ``SignupDate`` column.
        products_path: Source of the products CSV.
        transactions_path: Source of the transactions CSV; must contain a
            ``TransactionDate`` column.

    Returns:
        A ``(customers_df, products_df, transactions_df)`` tuple of
        DataFrames, with ``SignupDate`` and ``TransactionDate`` converted to
        datetime.
    """
    customers_df = pd.read_csv(customers_path)
    products_df = pd.read_csv(products_path)
    transactions_df = pd.read_csv(transactions_path)

    # Parse the date columns so later date arithmetic (e.g. recency) works.
    customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
    transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

    return customers_df, products_df, transactions_df

def create_customer_features(customers_df, transactions_df):
    """Create a per-customer feature matrix.

    Transactions are aggregated per ``CustomerID`` (count, sum/mean of
    ``TotalValue`` and ``Quantity``, first/last transaction date), a
    ``Recency`` feature is derived, and the result is left-joined onto
    ``customers_df`` so every customer gets exactly one row. ``Region`` is
    one-hot encoded and the non-numeric helper columns are dropped.

    Args:
        customers_df: Customer table with a ``CustomerID`` column and,
            optionally, ``Region``, ``CustomerName`` and ``SignupDate``.
        transactions_df: Transaction table with ``CustomerID``,
            ``TransactionID``, ``TotalValue``, ``Quantity`` and a datetime
            ``TransactionDate`` column.

    Returns:
        A new DataFrame with ``CustomerID`` plus feature columns. Neither
        input frame is modified.
    """
    # Aggregate transactions by customer
    customer_transactions = transactions_df.groupby('CustomerID').agg(
        TransactionCount=('TransactionID', 'count'),
        TotalValueSum=('TotalValue', 'sum'),
        TotalValueMean=('TotalValue', 'mean'),
        QuantitySum=('Quantity', 'sum'),
        QuantityMean=('Quantity', 'mean'),
        FirstTransaction=('TransactionDate', 'min'),
        LastTransaction=('TransactionDate', 'max'),
    ).reset_index()

    # Recency = days between a customer's last purchase and the most recent
    # transaction in the whole dataset.
    current_date = transactions_df['TransactionDate'].max()
    customer_transactions['Recency'] = (
        current_date - customer_transactions['LastTransaction']
    ).dt.days

    # Left join keeps customers that have no transactions at all.
    customer_features = customers_df.merge(
        customer_transactions,
        on='CustomerID',
        how='left',
    )

    # A customer who never purchased must not get Recency 0, which would mean
    # "bought on the latest day". Rank them one day staler than the stalest
    # real purchaser instead (0 only when there are no transactions at all).
    max_recency = customer_transactions['Recency'].max()
    never_purchased_recency = 0 if pd.isna(max_recency) else max_recency + 1
    customer_features['Recency'] = customer_features['Recency'].fillna(
        never_purchased_recency
    )

    # Zero-fill the remaining numeric gaps only (counts/sums/means of
    # customers without transactions). Date and text columns are left alone
    # so a missing Region does not turn into a bogus "Region_0" category.
    numeric_columns = customer_features.select_dtypes(include='number').columns
    customer_features[numeric_columns] = customer_features[numeric_columns].fillna(0)

    # One-hot encode region
    if 'Region' in customer_features.columns:
        region_dummies = pd.get_dummies(customer_features['Region'], prefix='Region')
        customer_features = pd.concat([customer_features, region_dummies], axis=1)

    # Drop non-numeric columns (ignored if any of them is absent)
    customer_features = customer_features.drop(
        columns=['CustomerName', 'SignupDate', 'FirstTransaction', 'LastTransaction', 'Region'],
        errors='ignore',
    )

    return customer_features

def find_similar_customers(customer_features, target_customer_id, top_n=3):
    """Find the customers most similar to a target customer.

    All columns except ``CustomerID`` are standardized with
    ``StandardScaler`` and compared by the default ``cdist`` metric
    (Euclidean distance). Each distance ``d`` is mapped to a similarity
    score ``1 / (1 + d)``, so identical feature vectors score 1.0.

    Args:
        customer_features: DataFrame with a ``CustomerID`` column; every
            other column is used as a numeric feature.
        target_customer_id: ``CustomerID`` value to find lookalikes for.
        top_n: Maximum number of lookalikes to return (default 3).

    Returns:
        A list of ``(customer_id, similarity_score)`` tuples, closest first,
        with scores rounded to 3 decimals. The target itself is never
        included. Fewer than ``top_n`` entries are returned when there are
        not enough other customers.

    Raises:
        ValueError: If ``top_n`` is negative.
        IndexError: If ``target_customer_id`` is not in ``customer_features``.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Every column other than the identifier is treated as a feature.
    feature_columns = [col for col in customer_features.columns if col != 'CustomerID']

    # Normalize features
    scaler = StandardScaler()
    features_normalized = scaler.fit_transform(customer_features[feature_columns])

    # Locate the target by row POSITION. The normalized array is positional,
    # so using the DataFrame's index label here would pick the wrong row
    # whenever the frame does not carry a default 0..n-1 index.
    matches = np.flatnonzero(
        (customer_features['CustomerID'] == target_customer_id).to_numpy()
    )
    if matches.size == 0:
        raise IndexError(
            f"CustomerID {target_customer_id!r} not found in customer_features"
        )
    target_pos = matches[0]

    # Calculate distances from the target to every customer
    distances = cdist([features_normalized[target_pos]], features_normalized)[0]

    # Drop the target explicitly instead of assuming it sorts first: another
    # customer with identical features is also at distance 0 and may precede
    # it. A stable sort keeps tie order deterministic.
    ranked = np.argsort(distances, kind='stable')
    ranked = ranked[ranked != target_pos][:top_n]

    customer_ids = customer_features['CustomerID'].to_numpy()
    # float(...) yields plain Python floats so that str()/repr() of the
    # result stays "0.5" rather than "np.float64(0.5)" under NumPy 2.
    return [
        (customer_ids[pos], round(float(1 / (1 + distances[pos])), 3))
        for pos in ranked
    ]

def main():
    """Run the lookalike pipeline end to end and save the results as CSV.

    Loads the datasets, builds the customer feature matrix, computes the
    lookalikes for customers C0001-C0020 and writes one row per customer to
    ``Lookalike_Customers.csv``.
    """
    customers, _products, transactions = load_data()
    feature_table = create_customer_features(customers, transactions)

    # Inclusive string-range filter on the ID selects C0001 through C0020.
    in_range = customers['CustomerID'].between('C0001', 'C0020')
    target_ids = customers.loc[in_range, 'CustomerID']

    # The lookalike list is stored as its string representation.
    rows = [
        {
            'CustomerID': customer_id,
            'Lookalikes': str(find_similar_customers(feature_table, customer_id)),
        }
        for customer_id in target_ids
    ]

    pd.DataFrame(rows).to_csv('Lookalike_Customers.csv', index=False)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
