In [1]:
from google.colab import files
# Ask Colab to upload local files into the runtime's working directory.
# The captured output shows Transactions.csv, Products.csv and Customers.csv
# being saved; later cells read those files by name. The return value is kept
# in `uploaded` (presumably a mapping of file name -> contents; confirm against
# the Colab docs) but is not used by the cells visible in this notebook.
uploaded = files.upload()

Saving Transactions.csv to Transactions.csv
Saving Products.csv to Products.csv
Saving Customers.csv to Customers.csv


In [2]:
import pandas as pd

# Read the three uploaded CSV files into DataFrames (same order as before:
# customers, then products, then transactions).
customers, products, transactions = map(
    pd.read_csv,
    ("Customers.csv", "Products.csv", "Transactions.csv"),
)

In [3]:
# Parse the date columns in place so they support datetime arithmetic later.
for frame, date_col in ((customers, 'SignupDate'), (transactions, 'TransactionDate')):
    frame[date_col] = pd.to_datetime(frame[date_col])

# Attach customer attributes, then product attributes, to every transaction.
# Both joins use pandas' default inner join, so a transaction whose CustomerID
# or ProductID has no match in the lookup table is dropped from `df`.
transactions_with_customers = pd.merge(transactions, customers, on='CustomerID')
df = pd.merge(transactions_with_customers, products, on='ProductID')

In [4]:
# Tenure feature: whole days between each customer's signup and the most
# recent transaction date present in the merged data.
max_date = df['TransactionDate'].max()
customers['SignupDays'] = (max_date - customers['SignupDate']).dt.days

# Per-customer spend summary: total spend, number of distinct transactions,
# and mean value per transaction row.
per_customer = df.groupby('CustomerID')
agg = per_customer.agg(
    TotalSpend=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    NumTransactions=pd.NamedAgg(column='TransactionID', aggfunc='nunique'),
    AvgOrderValue=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
).reset_index()

# Per-customer count of transaction rows in each product category
# (rows indexed by CustomerID, one column per Category value).
category_pivot = pd.crosstab(index=df['CustomerID'], columns=df['Category'])

# Assemble the feature table. The first join is an inner join, so customers
# with no rows in `df` (no transactions) do not appear in `features`.
profile = customers[['CustomerID', 'Region', 'SignupDays']]
features = pd.merge(profile, agg, on='CustomerID')
features = pd.merge(features, category_pivot, on='CustomerID', how='left')

# One-hot encode the categorical Region column.
features = pd.get_dummies(features, columns=['Region'])

# Keep the IDs aside (row-aligned with X) and leave only model inputs in X.
customer_ids = features['CustomerID']
X = features.drop('CustomerID', axis=1)

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Standardize every feature column so that large-magnitude features (e.g.
# spend totals) do not dominate the similarity computation.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Pairwise cosine similarity between all customers: entry [i, j] compares
# row i and row j of X_scaled, so the matrix is square over the rows of X.
similarity_matrix = cosine_similarity(X_scaled)

In [6]:
# Number of lookalikes to report per target customer.
TOP_K = 3

# Targets are the first 20 customer IDs: C0001 .. C0020.
target_customers = [f"C{str(i).zfill(4)}" for i in range(1, 21)]
target_set = set(target_customers)  # O(1) membership checks inside the loop

# Positional view of the IDs: row i of similarity_matrix belongs to id_array[i].
# Indexing the Series itself with an integer is label-based and is only correct
# while its index happens to be the default 0..n-1 range, so use an array.
id_array = customer_ids.to_numpy()

# Maps each target CustomerID -> list of [lookalike_id, score] pairs,
# best match first, scores rounded to 4 decimals.
top_lookalikes = {}
for i, cust_id in enumerate(id_array):
    if cust_id not in target_set:
        continue

    # Exclude the customer's own row(s) from the candidate pool.
    is_other = id_array != cust_id
    candidate_ids = id_array[is_other]
    candidate_scores = similarity_matrix[i][is_other]

    # Descending by score; a stable sort keeps the original row order among
    # ties, matching the behaviour of sorted(..., reverse=True).
    best = (-candidate_scores).argsort(kind="stable")[:TOP_K]
    top_lookalikes[cust_id] = [
        [str(candidate_ids[j]), round(float(candidate_scores[j]), 4)]
        for j in best
    ]

# A target that never made it into the feature matrix would otherwise be
# silently absent from the output; surface it instead.
missing_targets = [c for c in target_customers if c not in top_lookalikes]
if missing_targets:
    print(f"Warning: no feature row for target customers (skipped): {missing_targets}")

In [7]:
# Tabulate the results: one row per target customer, with that customer's
# list of [lookalike_id, score] pairs stored in a single 'Lookalikes' column.
lookalike_columns = {
    'CustomerID': [cust_id for cust_id in top_lookalikes],
    'Lookalikes': [top_lookalikes[cust_id] for cust_id in top_lookalikes],
}
lookalike_df = pd.DataFrame(lookalike_columns)
lookalike_df.to_csv("Lookalike.csv", index=False)

# Hand the written CSV to the browser as a download from the Colab runtime.
files.download("Lookalike.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>