In [13]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# Read the three source tables from the DataFile/ directory.
# The date columns are parsed on load so later cells can do date arithmetic on them.
customers = pd.read_csv(
    'DataFile/Customers.csv',
    parse_dates=['SignupDate'],
)
products = pd.read_csv('DataFile/Products.csv')
transactions = pd.read_csv(
    'DataFile/Transactions.csv',
    parse_dates=['TransactionDate'],
)
print("dataset succefully loaded")

dataset succefully loaded


In [14]:
# Attach product attributes to every transaction, then attach the customer
# profile. Both joins are inner joins, so transactions with an unknown product
# and customers without any transaction are dropped from merged_data.
transactions_with_products = transactions.merge(products, on='ProductID', how='inner')
merged_data = customers.merge(transactions_with_products, on='CustomerID', how='inner')
print("dataset succefully merged")

dataset succefully merged


In [15]:
# Filter for the first 20 customers (IDs C0001 .. C0020)
target_ids = [f"C{i:04d}" for i in range(1, 21)]
target_customers = customers[customers['CustomerID'].isin(target_ids)]

# Prepare one feature row per customer from the merged transaction data.
# Only customers present in merged_data (i.e. with at least one transaction
# surviving the inner joins) get a row here.
customer_features = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',               # total spend
    'TransactionID': 'count',          # number of transactions
    'Category': lambda x: ','.join(x), # comma-joined so str.get_dummies can split it below
    'Region': 'first',
    'SignupDate': 'first'
}).reset_index()

# One-hot encode the region
encoder = OneHotEncoder()
region_encoded = encoder.fit_transform(customer_features[['Region']]).toarray()

# Express the signup date as days since the earliest signup in the feature set
customer_features['SignupDate'] = (
    customer_features['SignupDate'] - customer_features['SignupDate'].min()
).dt.days

# Indicator columns: 1 if the customer bought from the category at least once
categories = customer_features['Category'].str.get_dummies(sep=',')
customer_features = customer_features.drop('Category', axis=1)
feature_matrix = np.hstack([
    customer_features[['TotalValue', 'TransactionID', 'SignupDate']].values,
    region_encoded,
    categories.values
])

# Scale feature matrix so every column lies in [0, 1]
scaler = MinMaxScaler()
feature_matrix_scaled = scaler.fit_transform(feature_matrix)

# Compute cosine similarity; row/column i corresponds to customer_features row i
similarity_matrix = cosine_similarity(feature_matrix_scaled)

# Map each CustomerID to its row in similarity_matrix. The rows follow
# customer_features, NOT target_customers, so the position of a customer inside
# target_customers must not be used as a matrix index.
feature_ids = customer_features['CustomerID'].tolist()
row_by_customer = {cid: pos for pos, cid in enumerate(feature_ids)}

# recommendations for the first 20 customers
result = {}
for customer_id in target_customers['CustomerID']:
    row = row_by_customer.get(customer_id)
    if row is None:
        # Customer has no feature row (no transactions): nothing to compare against.
        result[customer_id] = []
        continue
    scores = similarity_matrix[row]
    # Rank every other customer by similarity and keep the top 3.
    # Scores are cast to plain floats so they print cleanly downstream.
    similar_customers = sorted(
        [(other_id, float(score)) for other_id, score in zip(feature_ids, scores) if other_id != customer_id],
        key=lambda x: x[1],
        reverse=True
    )[:3]
    result[customer_id] = similar_customers
print("Generate recommendations for the first 20 customers")

Generate recommendations for the first 20 customers


In [16]:
# Rebuild the top-3 lookalikes for the target customers and write them to CSV.
# similarity_matrix rows follow the row order of customer_features, so each
# target customer is located by CustomerID rather than by its position inside
# target_customers.
feature_ids = customer_features['CustomerID'].tolist()
row_by_customer = {cid: pos for pos, cid in enumerate(feature_ids)}

result = {}
for customer_id in target_customers['CustomerID']:
    row = row_by_customer.get(customer_id)
    if row is None:
        # Customer has no feature row (no transactions): emit an empty recommendation list.
        result[customer_id] = []
        continue

    scores = similarity_matrix[row]
    # Cast scores to plain floats so str() below writes "0.93..." rather than
    # a NumPy scalar repr such as "np.float64(0.93...)".
    similar_customers = sorted(
        [(other_id, float(score)) for other_id, score in zip(feature_ids, scores) if other_id != customer_id],
        key=lambda x: x[1],
        reverse=True
    )[:3]
    result[customer_id] = similar_customers

# One row per target customer; the recommendation list is stored as its string form.
lookalike_df = pd.DataFrame({
    "CustomerID": list(result.keys()),
    "Recommendations": [str(rec) for rec in result.values()]
})

output_path = 'Data/Ashutosh_Chauhan_Lookalike.csv'
lookalike_df.to_csv(output_path, index=False)
print(f"Lookalike.csv has been saved at {output_path}")

Lookalike.csv has been saved at Data/Ashutosh_Chauhan_Lookalike.csv
