In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv("Products.csv")
transactions = pd.read_csv('Transactions.csv')

In [3]:
data = pd.merge(transactions, customers, on='CustomerID')
data = pd.merge(data, products, on='ProductID', suffixes=('_txn', '_product'))

Customer Related Features

In [4]:
data['SignupDate'] = pd.to_datetime(data['SignupDate'])
data['Tenure'] = (datetime.now() - data['SignupDate']).dt.days

data['TransactionDate'] = pd.to_datetime(data['TransactionDate'])
last_transaction = data.groupby('CustomerID')['TransactionDate'].max().reset_index()
last_transaction.columns = ['CustomerID', 'LastTransactionDate']
last_transaction['DaysSinceLastTransaction'] = (datetime.now() - last_transaction['LastTransactionDate']).dt.days

Transaction Related Features - Transaction average, transaction frequency

In [5]:
transaction_behavior = data.groupby('CustomerID').agg({
    'TotalValue': ['sum', 'mean'],  
    'TransactionID': 'count',       
}).reset_index()
transaction_behavior.columns = [
    'CustomerID', 'TotalSpending', 'AvgTransactionValue', 'TransactionFrequency'
]

Product Related Features - Favorite product category, total spent on favorite product category

In [6]:
favorite_category = data.groupby(['CustomerID', 'Category'])['TotalValue'].sum().reset_index()
favorite_category = favorite_category.loc[favorite_category.groupby('CustomerID')['TotalValue'].idxmax()]
favorite_category = favorite_category[['CustomerID', 'Category', 'TotalValue']]
favorite_category.columns = ['CustomerID', 'FavoriteCategory', 'TotalSpentInFavoriteCategory']

Unique product categories purchased

In [7]:
unique_categories = data.groupby('CustomerID')['Category'].nunique().reset_index()
unique_categories.columns = ['CustomerID', 'UniqueCategories']

Favorite day for transaction - Monday - Sunday (0 - 6)

In [8]:
data['TransactionDay'] = data['TransactionDate'].dt.dayofweek  # 0=Monday, 6=Sunday
data['IsWeekend'] = data['TransactionDay'].apply(lambda x: 1 if x >= 5 else 0)  # 5=Saturday, 6=Sunday
favorite_day = data.groupby('CustomerID')['IsWeekend'].agg(lambda x: x.mode()[0]).reset_index()
favorite_day.columns = ['CustomerID', 'IsWeekend']

In [9]:
features = pd.merge(transaction_behavior, last_transaction, on='CustomerID')
features = pd.merge(features, favorite_category, on='CustomerID')
features = pd.merge(features, unique_categories, on='CustomerID')
features = pd.merge(features, favorite_day, on='CustomerID')

features = pd.merge(features, customers[['CustomerID', 'Region']], on='CustomerID')
features = pd.get_dummies(features, columns=['Region'], drop_first=True)

features

Unnamed: 0,CustomerID,TotalSpending,AvgTransactionValue,TransactionFrequency,LastTransactionDate,DaysSinceLastTransaction,FavoriteCategory,TotalSpentInFavoriteCategory,UniqueCategories,IsWeekend,Region_Europe,Region_North America,Region_South America
0,C0001,3354.52,670.904000,5,2024-11-02 17:04:16,86,Electronics,2827.30,3,0,False,False,True
1,C0002,1862.74,465.685000,4,2024-12-03 01:41:41,55,Clothing,1025.46,2,0,False,False,False
2,C0003,2725.38,681.345000,4,2024-08-24 18:54:04,156,Electronics,1385.20,3,1,False,False,True
3,C0004,5354.88,669.360000,8,2024-12-23 14:13:52,35,Home Decor,2110.66,3,0,False,False,True
4,C0005,2034.24,678.080000,3,2024-11-04 00:30:22,84,Electronics,1180.38,2,0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,C0196,4982.88,1245.720000,4,2024-12-15 03:43:35,43,Home Decor,2086.85,3,0,True,False,False
195,C0197,1928.65,642.883333,3,2024-12-27 18:20:31,31,Home Decor,1013.73,2,0,True,False,False
196,C0198,931.83,465.915000,2,2024-10-04 18:31:12,115,Clothing,904.84,2,0,True,False,False
197,C0199,1979.28,494.820000,4,2024-10-26 00:01:58,93,Home Decor,1384.90,2,1,True,False,False


In [10]:
features.columns

Index(['CustomerID', 'TotalSpending', 'AvgTransactionValue',
       'TransactionFrequency', 'LastTransactionDate',
       'DaysSinceLastTransaction', 'FavoriteCategory',
       'TotalSpentInFavoriteCategory', 'UniqueCategories', 'IsWeekend',
       'Region_Europe', 'Region_North America', 'Region_South America'],
      dtype='object')

The model is based on the simple cosine similarity where it calculates a similarity matrix for each customer and every other customer. And returns the top 3 similar customers based on the similarity scores.

Euclidian distance or Jaccard similarity can also be used instead of cosine similarity but cosine similarity is more easier to interpret and is also a good metric as it better handles high dimensional data and also is intuitive for directional similarity.

In [13]:
from sklearn.impute import SimpleImputer

numeric_cols = features.select_dtypes(include=['int64', 'float64']).columns.tolist()
non_numeric_cols = features.select_dtypes(exclude=['int64', 'float64']).columns.tolist()

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
features[numeric_cols] = imp.fit_transform(features[numeric_cols])

features = features[numeric_cols + non_numeric_cols]

scaler = StandardScaler()
features[numeric_cols] = scaler.fit_transform(features[numeric_cols])

similarity_matrix = cosine_similarity(features[numeric_cols])

lookalike_map = {}
for i in range(20):
    cust_id = features.iloc[i]['CustomerID']
    sim_scores = list(enumerate(similarity_matrix[i]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:4]  # Exclude self
    lookalike_map[cust_id] = [
        (features.iloc[j]['CustomerID'], round(score, 3)) 
        for j, score in sim_scores
    ]

lookalike_df = pd.DataFrame.from_dict(lookalike_map, orient='index')
lookalike_df.to_csv('Lookalike.csv', header=False)

In [12]:
similarity_matrix

array([[ 1.        , -0.28519355, -0.53204477, ..., -0.32925152,
        -0.4506261 ,  0.33930039],
       [-0.28519355,  1.        , -0.08629702, ...,  0.86007967,
         0.33892806, -0.56875871],
       [-0.53204477, -0.08629702,  1.        , ...,  0.10890046,
         0.85123942, -0.32511515],
       ...,
       [-0.32925152,  0.86007967,  0.10890046, ...,  1.        ,
         0.36154515, -0.64828193],
       [-0.4506261 ,  0.33892806,  0.85123942, ...,  0.36154515,
         1.        , -0.5755085 ],
       [ 0.33930039, -0.56875871, -0.32511515, ..., -0.64828193,
        -0.5755085 ,  1.        ]], shape=(199, 199))