In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load your data (adjust paths accordingly)
customers = pd.read_csv('Project/Customers.csv')
products = pd.read_csv('Project/Products.csv')
transactions = pd.read_csv('Project/Transactions.csv')

In [3]:
# Aggregating transaction data for each customer (already done earlier)
customer_transactions = transactions.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    total_transactions=('TransactionID', 'nunique'),
    avg_quantity=('Quantity', 'mean')
).reset_index()


In [4]:
# Merge the aggregated data with customer demographic data
customer_data = customers[['CustomerID', 'Region']]
customer_data = customer_data.merge(customer_transactions, on='CustomerID', how='left')

In [5]:
# Encode categorical columns (Region) using one-hot encoding
customer_data = pd.get_dummies(customer_data, columns=['Region'], drop_first=True)


In [6]:
# Select only numeric columns for scaling
numeric_columns = customer_data.select_dtypes(include=[np.number]).columns
customer_data[numeric_columns] = customer_data[numeric_columns].fillna(0)  # Handle missing values if any

In [7]:
# Standardizing the data (scaling)
scaler = StandardScaler()
customer_data_scaled = scaler.fit_transform(customer_data[numeric_columns])

In [8]:
# Now compute cosine similarity between customers
cosine_sim = cosine_similarity(customer_data_scaled)

In [9]:
# Check the shape to verify
print(cosine_sim.shape)

(200, 200)


In [10]:
cosine_sim_df = pd.DataFrame(cosine_sim, index=customer_data['CustomerID'], columns=customer_data['CustomerID'])

In [11]:
lookalikes = {}

In [12]:
for customer_id in customer_data['CustomerID']:
    # Get the similarity scores for this customer
    similarity_scores = cosine_sim_df.loc[customer_id]
    
    # Exclude the customer themselves by setting their score to 0
    similarity_scores = similarity_scores.drop(customer_id)
    
    # Sort the customers by similarity score in descending order and get the top 3
    top_3_customers = similarity_scores.nlargest(3)
    
    # Store the top 3 customers and their similarity scores
    lookalikes[customer_id] = list(zip(top_3_customers.index, top_3_customers.values))

In [13]:
for customer_id in list(lookalikes.keys())[:5]:
    print(f"Customer {customer_id} Lookalikes: {lookalikes[customer_id]}")

Customer C0001 Lookalikes: [('C0103', 0.9997508221712416), ('C0164', 0.9994623722507264), ('C0069', 0.9963114139971576)]
Customer C0002 Lookalikes: [('C0029', 0.9998159465033968), ('C0031', 0.9992104284992624), ('C0077', 0.9847869854010873)]
Customer C0003 Lookalikes: [('C0176', 0.996972996652657), ('C0070', 0.9720529437012835), ('C0144', 0.9632379124271557)]
Customer C0004 Lookalikes: [('C0075', 0.9962137508984765), ('C0113', 0.9851817872638582), ('C0012', 0.9826629066609655)]
Customer C0005 Lookalikes: [('C0131', 0.9995372589255568), ('C0186', 0.9961995386096014), ('C0150', 0.9954743267751809)]


In [14]:
lookalike_list = []
for customer_id, similar_customers in lookalikes.items():
    for similar_customer, score in similar_customers:
        lookalike_list.append([customer_id, similar_customer, score])

In [15]:
lookalike_df = pd.DataFrame(lookalike_list, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])

In [16]:
lookalike_df.to_csv('Lookalike.csv', index=False)