In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Read the three input tables (paths are relative to the working directory)
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Step 2: Attach customer and product attributes to every transaction.
# Both joins are left joins anchored on the transactions table, so each
# transaction row is kept even when its CustomerID / ProductID has no match.
data = pd.merge(
    pd.merge(transactions, customers, on="CustomerID", how="left"),
    products,
    on="ProductID",
    how="left",
)

# Step 3: Feature Engineering
# Build one row per CustomerID found in the merged transaction table:
#   total_spent               - sum of TotalValue
#   total_transactions        - count of TransactionID values
#   avg_spent_per_transaction - mean of TotalValue
#   unique_categories         - number of distinct Category values
# The customer's Region is then joined back in from `customers` and one-hot
# encoded (the first level is dropped).
# NOTE(review): the Region dummy columns are produced here, but the cosine
# similarity step in this script is computed from `numerical_cols` only, so
# Region does not influence the lookalike scores as written - confirm whether
# that is intended.
customer_features = (
    data.groupby("CustomerID")
    .agg(
        total_spent=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
        total_transactions=pd.NamedAgg(column="TransactionID", aggfunc="count"),
        avg_spent_per_transaction=pd.NamedAgg(column="TotalValue", aggfunc="mean"),
        unique_categories=pd.NamedAgg(column="Category", aggfunc="nunique"),
    )
    .reset_index()
    .merge(customers[["CustomerID", "Region"]], on="CustomerID", how="left")
    .pipe(pd.get_dummies, columns=["Region"], drop_first=True)
)

# Standardize the numeric features so that no single column dominates the
# similarity computation purely because of its scale.
numerical_cols = ["total_spent", "total_transactions", "avg_spent_per_transaction", "unique_categories"]
scaler = StandardScaler().fit(customer_features[numerical_cols])
customer_features[numerical_cols] = scaler.transform(customer_features[numerical_cols])

# Step 4: Compute Cosine Similarity
# Pairwise similarity between customer rows, using only the columns listed in
# numerical_cols. Entry [i, j] compares customer row i with customer row j.
similarity_matrix = cosine_similarity(customer_features.loc[:, numerical_cols])

# Label both axes with CustomerID so scores can be looked up by customer id
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_features["CustomerID"],
    columns=customer_features["CustomerID"],
)

# Step 5: Generate Lookalike Recommendations for the First 20 Customers
# lookalike_dict maps a customer id to a pair:
#   (list of up to three lookalike customer ids, list of their similarity scores)
lookalike_dict = {}
for customer_id in customer_features["CustomerID"].head(20):
    # Rank every customer by similarity to the current one, best match first
    ranked_scores = similarity_df.loc[:, customer_id].sort_values(ascending=False)
    # Discard the customer's own entry, then keep the three best remaining matches
    similar_customers = ranked_scores.drop(labels=customer_id).iloc[:3]
    lookalike_dict[customer_id] = (
        similar_customers.index.tolist(),
        list(similar_customers.to_numpy()),
    )

# Step 6: Save Lookalike Results to CSV
# Each output row holds a customer id and a list of (lookalike_id, score)
# pairs, with the score rounded to two decimals.
# Scores are converted to built-in floats before rounding: round() on a NumPy
# scalar returns another NumPy scalar, and with NumPy >= 2 such a value is
# rendered inside the list as "np.float64(0.97)" instead of "0.97" when the
# list is written out as CSV text.
lookalike_output = [
    {
        "cust_id": cust_id,
        "lookalikes": [
            (sim_id, round(float(score), 2))
            for sim_id, score in zip(similar_ids, scores)
        ],
    }
    for cust_id, (similar_ids, scores) in lookalike_dict.items()
]

# Column names are given explicitly so the CSV keeps its header even when
# there are no rows to write.
lookalike_df = pd.DataFrame(lookalike_output, columns=["cust_id", "lookalikes"])

lookalike_csv_path = "Lookalike___.csv"
lookalike_df.to_csv(lookalike_csv_path, index=False)

# Report the path that was actually written (the previous message named a
# different file than the one passed to to_csv).
print(f"Lookalike Model results saved to {lookalike_csv_path}")


Lookalike Model results saved to Lookalike.csv
