In [1]:
import pandas as pd

customers = pd.read_csv('Customers.csv')
products= pd.read_csv('Products.csv')
transactions  = pd.read_csv('Transactions.csv')

In [2]:
print(customers.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15


In [3]:
print(transactions.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price  
0      300.68  300.68  
1      300.68  300.68  
2      300.68  300.68  
3      601.36  300.68  
4      902.04  300.68  


In [4]:
print(products.head())

  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31


In [5]:
transactions.columns

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')

In [6]:
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

customers['Tenure'] = (pd.Timestamp.now() - customers['SignupDate']).dt.days

In [7]:
customer_summary = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',  
    'TransactionID': 'count',  
    'ProductID': 'nunique'  
}).reset_index()

customer_summary.columns = ['CustomerID', 'TotalSpend', 'TransactionCount', 'UniqueProducts']
customer_summary = pd.merge(customers, customer_summary, on='CustomerID', how='inner')

print(customer_summary.head())

  CustomerID        CustomerName         Region SignupDate  Tenure  \
0      C0001    Lawrence Carroll  South America 2022-07-10     932   
1      C0002      Elizabeth Lutz           Asia 2022-02-13    1079   
2      C0003      Michael Rivera  South America 2024-03-07     326   
3      C0004  Kathleen Rodriguez  South America 2022-10-09     841   
4      C0005         Laura Weber           Asia 2022-08-15     896   

   TotalSpend  TransactionCount  UniqueProducts  
0     3354.52                 5               5  
1     1862.74                 4               4  
2     2725.38                 4               4  
3     5354.88                 8               8  
4     2034.24                 3               3  


In [8]:
from itertools import combinations

def generate_customer_pairs(customer_summary):
    customer_pairs = []
    for (id1, id2) in combinations(customer_summary['CustomerID'], 2):
        customer1 = customer_summary[customer_summary['CustomerID'] == id1].iloc[0]
        customer2 = customer_summary[customer_summary['CustomerID'] == id2].iloc[0]
        features = {
            'tenure_diff': abs(customer1['Tenure'] - customer2['Tenure']),
            'spend_diff': abs(customer1['TotalSpend'] - customer2['TotalSpend']),
            'transaction_diff': abs(customer1['TransactionCount'] - customer2['TransactionCount']),
            'unique_products_diff': abs(customer1['UniqueProducts'] - customer2['UniqueProducts']),
        }
        customer_pairs.append((id1, id2, features))
    return customer_pairs

pair_features = generate_customer_pairs(customer_summary)
print(pair_features[:5])

[('C0001', 'C0002', {'tenure_diff': 147, 'spend_diff': 1491.7800000000004, 'transaction_diff': 1, 'unique_products_diff': 1}), ('C0001', 'C0003', {'tenure_diff': 606, 'spend_diff': 629.1400000000003, 'transaction_diff': 1, 'unique_products_diff': 1}), ('C0001', 'C0004', {'tenure_diff': 91, 'spend_diff': 2000.3599999999997, 'transaction_diff': 3, 'unique_products_diff': 3}), ('C0001', 'C0005', {'tenure_diff': 36, 'spend_diff': 1320.2800000000004, 'transaction_diff': 2, 'unique_products_diff': 2}), ('C0001', 'C0006', {'tenure_diff': 546, 'spend_diff': 873.0499999999993, 'transaction_diff': 1, 'unique_products_diff': 1})]


In [9]:
def flatten_features(pair_features):
    data = []
    for id1, id2, features in pair_features:
        row = {'Customer1': id1, 'Customer2': id2}
        row.update(features) 
        row['SimilarityScore'] = 1 / (1 + sum(features.values()))
        data.append(row)
    return pd.DataFrame(data)

pairwise_dataset = flatten_features(pair_features)
print(pairwise_dataset.head())


  Customer1 Customer2  tenure_diff  spend_diff  transaction_diff  \
0     C0001     C0002          147     1491.78                 1   
1     C0001     C0003          606      629.14                 1   
2     C0001     C0004           91     2000.36                 3   
3     C0001     C0005           36     1320.28                 2   
4     C0001     C0006          546      873.05                 1   

   unique_products_diff  SimilarityScore  
0                     1         0.000609  
1                     1         0.000808  
2                     3         0.000477  
3                     2         0.000735  
4                     1         0.000703  


In [10]:
from sklearn.model_selection import train_test_split

X = pairwise_dataset.drop(columns=['Customer1', 'Customer2', 'SimilarityScore'])
y = pairwise_dataset['SimilarityScore']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [13]:
def recommend_similar_customers(customer_id, customer_summary, model, top_n=3):
    recommendations = []
    for other_customer_id in customer_summary['CustomerID']:
        if customer_id != other_customer_id:
            customer1 = customer_summary[customer_summary['CustomerID'] == customer_id].iloc[0]
            customer2 = customer_summary[customer_summary['CustomerID'] == other_customer_id].iloc[0]
            features = {
                'tenure_diff': abs(customer1['Tenure'] - customer2['Tenure']),
                'spend_diff': abs(customer1['TotalSpend'] - customer2['TotalSpend']),
                'transaction_diff': abs(customer1['TransactionCount'] - customer2['TransactionCount']),
                'unique_products_diff': abs(customer1['UniqueProducts'] - customer2['UniqueProducts']),
            }
            score = model.predict([list(features.values())])[0]
            recommendations.append((other_customer_id, score))
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
    return recommendations[:top_n]

recommendations = {}
for customer_id in customer_summary['CustomerID'][:20]:
    recommendations[customer_id] = recommend_similar_customers(customer_id, customer_summary, model)

    recommendations_df = pd.DataFrame([
    {'CustomerID': k, 'Recommendations': v} for k, v in recommendations.items()
])
recommendations_df.to_csv('Lookalike.csv', index=False)

print("Recommendations saved to 'Lookalike.csv'")


Recommendations saved to 'Lookalike.csv'
