In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cdist
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
# Load the pre-merged table (transactions joined with customer and product
# attributes, per the column header shown in the cell output).
# NOTE(review): the 'Unnamed: 0' column in the output suggests the CSV was
# written with its index; confirm whether index_col=0 is wanted here.
merged_data = pd.read_csv('merged_datasets.csv')
# Bare last expression so the notebook renders a preview of the frame.
merged_data

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,Jacob Holt,South America,2022-01-22,SoundWave Smartwatch,Electronics,459.86
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,Mrs. Kimberly Wright,North America,2024-04-07,SoundWave Smartwatch,Electronics,459.86
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,Tyler Haynes,North America,2024-09-21,SoundWave Smartwatch,Electronics,459.86
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,Joshua Hamilton,Asia,2024-11-11,SoundWave Smartwatch,Electronics,459.86


In [4]:
def prepare_features(merged_data):
    """
    Build one row of spending features per customer from the merged dataset.

    Parameters
    ----------
    merged_data : pandas.DataFrame
        Transaction-level data containing at least the columns
        ``CustomerID``, ``TotalValue`` and ``Region``.

    Returns
    -------
    pandas.DataFrame
        Exactly one row per ``CustomerID`` (ordered by ID, as produced by
        ``groupby``) with the columns ``CustomerID``, ``TotalSpending``,
        ``AverageTransactionValue``, ``NumberOfTransactions`` and ``Region``.

    Raises
    ------
    pandas.errors.MergeError
        If the per-customer key is unexpectedly non-unique at merge time.
    """
    # Named aggregation produces flat, self-describing column names directly,
    # so no MultiIndex flattening / positional renaming is needed afterwards.
    customer_features = (
        merged_data
        .groupby('CustomerID')
        .agg(
            TotalSpending=('TotalValue', 'sum'),
            AverageTransactionValue=('TotalValue', 'mean'),
            NumberOfTransactions=('TotalValue', 'count'),
        )
        .reset_index()
    )

    # Region is expected to be a customer-level attribute. De-duplicate on
    # CustomerID only (keeping the first region seen): de-duplicating on the
    # (CustomerID, Region) pair would keep several rows for a customer whose
    # region is recorded inconsistently, and the merge would then duplicate
    # that customer's feature row and break the downstream similarity lookup.
    region_info = (
        merged_data[['CustomerID', 'Region']]
        .drop_duplicates(subset='CustomerID')
    )

    # validate= makes the one-row-per-customer invariant explicit.
    return customer_features.merge(
        region_info,
        on='CustomerID',
        how='left',
        validate='one_to_one',
    )

def calculate_similarity_scores(customer_features, target_customers):
    """
    Compute the pairwise cosine-similarity matrix between all customers.

    The three numeric spending features are standardised with
    ``StandardScaler``; the one-hot encoded ``Region`` columns are appended
    unscaled before the cosine similarity is taken.

    Parameters
    ----------
    customer_features : pandas.DataFrame
        Must contain ``CustomerID``, ``TotalSpending``,
        ``AverageTransactionValue``, ``NumberOfTransactions`` and ``Region``.
    target_customers : object
        Accepted for interface compatibility; not used by this function
        (the full customer-by-customer matrix is always computed).

    Returns
    -------
    pandas.DataFrame
        Square frame of similarity scores whose index and columns are both
        labelled with ``CustomerID``.
    """
    numeric_columns = ['TotalSpending', 'AverageTransactionValue', 'NumberOfTransactions']

    # Standardise the numeric block so no single spending feature dominates.
    standardized = StandardScaler().fit_transform(customer_features[numeric_columns])

    # One-hot region indicators are stacked to the right of the numeric block.
    region_dummies = pd.get_dummies(customer_features['Region'], prefix='Region')
    design_matrix = np.hstack([standardized, region_dummies.values])

    # Label both axes with the customer IDs for direct lookup by ID.
    customer_ids = customer_features['CustomerID']
    return pd.DataFrame(
        cosine_similarity(design_matrix),
        index=customer_ids,
        columns=customer_ids,
    )

def get_top_lookalikes(customer_id, similarity_df, n_recommendations=3):
    """
    Return the N customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Column label of ``similarity_df`` identifying the query customer.
    similarity_df : pandas.DataFrame
        Similarity scores; the column ``customer_id`` holds that customer's
        score against every row label.
    n_recommendations : int, default 3
        Number of entries to keep.

    Returns
    -------
    pandas.Series
        Scores in descending order, indexed by the similar customers' IDs,
        with the query customer itself excluded.
    """
    # Rank every customer against the query customer, best match first.
    ranked = similarity_df[customer_id].sort_values(ascending=False)

    # A customer is trivially most similar to itself, so mask it out.
    is_other_customer = ranked.index != customer_id

    return ranked.loc[is_other_customer].iloc[:n_recommendations]

def generate_lookalike_recommendations(merged_data, target_customers=None,
                                       n_recommendations=3,
                                       output_path='Lookalike.csv'):
    """
    Generate lookalike recommendations and optionally save them to CSV.

    Parameters
    ----------
    merged_data : pandas.DataFrame
        Transaction-level data; passed to ``prepare_features``.
    target_customers : iterable of str, optional
        Customer IDs to produce recommendations for. Defaults to the first
        20 IDs, ``'C0001'`` through ``'C0020'``.
    n_recommendations : int, default 3
        Number of similar customers to keep per target customer.
    output_path : str or None, default 'Lookalike.csv'
        Where to write the result as CSV (without the index). Pass ``None``
        to skip writing.

    Returns
    -------
    pandas.DataFrame
        One row per (target, similar customer) pair with the columns
        ``CustomerID``, ``SimilarCustomerID`` and ``SimilarityScore``.
        The columns are present even when there are no rows.

    Notes
    -----
    Target IDs that do not appear in the similarity matrix (for example a
    customer with no transactions in ``merged_data``) are skipped instead of
    raising ``KeyError``, since no features exist to compare them on.
    """
    # Prepare per-customer features and the full pairwise similarity matrix.
    customer_features = prepare_features(merged_data)
    similarity_df = calculate_similarity_scores(customer_features, None)

    # Default target set: the first 20 customers by ID.
    if target_customers is None:
        target_customers = [f'C{str(i+1).zfill(4)}' for i in range(20)]

    recommendations = []
    for customer_id in target_customers:
        # No feature row means no similarity column; nothing to recommend.
        if customer_id not in similarity_df.columns:
            continue

        top_similar = get_top_lookalikes(customer_id, similarity_df, n_recommendations)
        recommendations.extend(
            {
                'CustomerID': customer_id,
                'SimilarCustomerID': similar_id,
                'SimilarityScore': score,
            }
            for similar_id, score in top_similar.items()
        )

    # Explicit columns keep the schema stable even for an empty result.
    recommendations_df = pd.DataFrame(
        recommendations,
        columns=['CustomerID', 'SimilarCustomerID', 'SimilarityScore'],
    )

    if output_path is not None:
        recommendations_df.to_csv(output_path, index=False)

    return recommendations_df

def main():
    """
    Run the lookalike pipeline on the notebook-level ``merged_data`` frame
    and print a sample of the recommendations plus summary statistics.
    """
    # `merged_data` is read from the enclosing (notebook/module) namespace;
    # it needs the columns CustomerID, TransactionID, TotalValue and Region.
    lookalikes = generate_lookalike_recommendations(merged_data)

    # Preview of the first ten recommendation rows.
    print("\nSample recommendations:")
    print(lookalikes.head(10))

    # Aggregate view of how many targets were covered and how close matches are.
    print("\nSummary statistics:")
    target_count = lookalikes['CustomerID'].nunique()
    print(f"Number of target customers: {target_count}")
    scores = lookalikes['SimilarityScore']
    print(f"Average similarity score: {scores.mean():.3f}")
    print(f"Min similarity score: {scores.min():.3f}")
    print(f"Max similarity score: {scores.max():.3f}")


if __name__ == "__main__":
    main()


Sample recommendations:
  CustomerID SimilarCustomerID  SimilarityScore
0      C0001             C0137         0.999762
1      C0001             C0152         0.999510
2      C0001             C0107         0.964169
3      C0002             C0043         0.978999
4      C0002             C0142         0.975898
5      C0002             C0088         0.954402
6      C0003             C0133         0.987062
7      C0003             C0052         0.975410
8      C0003             C0112         0.941578
9      C0004             C0108         0.982718

Summary statistics:
Number of target customers: 20
Average similarity score: 0.965
Min similarity score: 0.842
Max similarity score: 1.000
