In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

Loading the datasets

In [2]:
# Read the three raw input tables into DataFrames.
# NOTE(review): the absolute '/content/...' paths look environment-specific
# (e.g. a hosted notebook) — confirm before running elsewhere.
transactions = pd.read_csv('/content/Transactions.csv')
products = pd.read_csv('/content/Products.csv')
customers = pd.read_csv('/content/Customers.csv')

In [10]:
# Keep the per-transaction price under its own name before merging in the
# product table, so the two price columns do not collide (presumably Products
# also carries a 'Price' column — confirm against the CSV schema).
#
# The guard makes this cell safe to re-run: without it, a second execution
# would rename the *product* 'Price' column to a duplicate 'TransactionPrice'
# and merge the product columns in again with '_x'/'_y' suffixes.
# On a fresh kernel the result is identical to an unguarded rename + merge.
if "TransactionPrice" not in transactions.columns:
    transactions = (
        transactions
        .rename(columns={'Price': 'TransactionPrice'})
        # Left join: every transaction row is kept, product columns are attached by ProductID.
        .merge(products, on="ProductID", how="left")
    )

Aggregate customer behavioral features

In [11]:
def _most_frequent(values):
    """Return the most common non-null value in ``values``, or NaN if there is none.

    ``Series.mode()`` ignores nulls, so a group whose values are all null yields
    an empty result; indexing that with ``[0]`` would raise. Ties are resolved
    the same way as ``mode()[0]``: the first entry of the mode result is taken.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else float("nan")


# One row per CustomerID summarising purchase behaviour.
customer_behavior = (
    transactions
    .groupby("CustomerID")
    .agg({
        "TotalValue": "sum",          # Total spending
        "ProductID": "nunique",       # Unique products purchased
        "Category": _most_frequent,   # Most purchased category (NaN if unknown)
    })
    .rename(columns={"TotalValue": "TotalSpending", "ProductID": "UniqueProducts"})
)

In [12]:
# Index the customer table by CustomerID, then attach the aggregated behaviour
# columns. A left join keeps every customer; those without any transactions
# get NaN in the behaviour columns.
customers_by_id = customers.set_index("CustomerID")
customers = customers_by_id.join(customer_behavior, how="left")

Feature Engineering

In [13]:
# Fixed date against which account age is measured.
reference_date = pd.to_datetime("2025-01-01")

customers = (
    customers
    .assign(
        # Replace the region label with pandas' integer category code.
        Region=lambda df: df["Region"].astype("category").cat.codes,
        # Whole days between signup and the reference date.
        SignupDays=lambda df: (reference_date - pd.to_datetime(df["SignupDate"])).dt.days,
    )
    # The raw date and the customer name are not used as features.
    .drop(columns=["SignupDate", "CustomerName"])
)

Normalize features

In [15]:
# Keep only numeric columns; any non-numeric column (e.g. 'Category') is
# excluded here because StandardScaler needs numeric input.
numerical_features = customers.select_dtypes(include=['number'])

# 'Region' holds integer category codes, but regions are nominal: the code
# order carries no meaning. Feeding the raw code to the scaler would make
# cosine similarity treat "code 3" as further from "code 0" than "code 1" is.
# Expand it into one indicator column per region so each region is just
# "same / not same".
model_features = numerical_features
if "Region" in model_features.columns:
    model_features = pd.get_dummies(
        model_features, columns=["Region"], prefix="Region", dtype=float
    )

# Standardise every feature to zero mean / unit variance. Missing values
# (e.g. customers with no transactions) are treated as 0 before scaling.
scaler = StandardScaler()
normalized_features = scaler.fit_transform(model_features.fillna(0))

Computing similarity

In [16]:
# Pairwise cosine similarity between the rows of normalized_features:
# entry [i, j] compares row i with row j. Y=None means "compare X with itself".
similarity_matrix = cosine_similarity(X=normalized_features, Y=None)

Lookalike recommendations

In [17]:
N_TARGET_CUSTOMERS = 20  # build recommendations for the first 20 customers
N_LOOKALIKES = 3         # number of similar customers reported per target

# Maps each target CustomerID to a list of {"CustomerID", "Score"} dicts,
# ordered from most to least similar.
lookalike_mapping = {}
for i, customer_id in enumerate(customers.index[:N_TARGET_CUSTOMERS]):
    row_scores = similarity_matrix[i]

    # Rank all customers by descending similarity, then drop the target itself
    # by position. Simply skipping the first ranked entry is not safe: with
    # duplicate feature rows (ties at 1.0) or an all-zero feature row
    # (self-similarity 0) the target is not guaranteed to be ranked first.
    ranked = row_scores.argsort()[::-1]
    similar_indices = ranked[ranked != i][:N_LOOKALIKES]

    lookalike_mapping[customer_id] = [
        {"CustomerID": customers.index[j], "Score": row_scores[j]}
        for j in similar_indices
    ]

In [25]:
# Build one flat record per target customer:
# CustomerID, Lookalike1, Score1, Lookalike2, Score2, ...
lookalike_rows = []
for target_id, matches in lookalike_mapping.items():
    record = {"CustomerID": target_id}
    for rank, match in enumerate(matches, start=1):
        record.update({
            f"Lookalike{rank}": match["CustomerID"],
            f"Score{rank}": match["Score"],
        })
    lookalike_rows.append(record)

# Assemble the records into a table and write it out without the row index.
lookalike_df = pd.DataFrame(lookalike_rows)
lookalike_df.to_csv("Lookalike.csv", index=False)
