<a href="https://colab.research.google.com/github/teja-1403/Zeotap-Data-Science-Assignment/blob/main/SaiTeja_TBV_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load datasets
# All three CSVs are read from a single directory. "/content" is the original
# hard-coded location; change DATA_DIR in one place to run from elsewhere.
DATA_DIR = "/content"
customers = pd.read_csv(f"{DATA_DIR}/Customers.csv")
products = pd.read_csv(f"{DATA_DIR}/Products.csv")
transactions = pd.read_csv(f"{DATA_DIR}/Transactions.csv")

In [3]:
# Step 1: Merge datasets
# Attach the customer columns, then the product columns, to every transaction.
# Both are left joins, so each transaction row is kept even when its
# CustomerID / ProductID has no match in the lookup table.
transactions = (
    transactions
    .merge(customers, how="left", on="CustomerID")
    .merge(products, how="left", on="ProductID")
)

In [4]:
# Step 2: Feature Engineering
# Build the customer-by-product purchase matrix: one row per CustomerID, one
# column per ProductID, each cell holding the summed Quantity for that pair.
# Pairs with no transactions are filled with 0.
customer_product_matrix = pd.pivot_table(
    transactions,
    values="Quantity",
    index="CustomerID",
    columns="ProductID",
    aggfunc="sum",
    fill_value=0,
)

In [5]:
# Add profile information (Region, SignupDate)
# Take an explicit copy so the column assignment below clearly operates on an
# independent frame rather than on a selection derived from `customers`.
profile_data = customers.set_index("CustomerID")[["Region", "SignupDate"]].copy()

# Convert signup date to numerical format.
# Casting the datetime straight to int yields nanoseconds since the epoch
# (values around 1e18). A column that large swamps the 0/1 region flags and the
# purchase quantities, pushing every pairwise cosine similarity towards 1.0.
# Instead use days since the earliest signup, min-max scaled to [0, 1], so the
# feature is comparable in magnitude to the one-hot region columns.
signup_dates = pd.to_datetime(profile_data["SignupDate"])
signup_days = (signup_dates - signup_dates.min()).dt.days
signup_span_days = signup_days.max()
if signup_span_days > 0:
    profile_data["SignupDate"] = signup_days / signup_span_days
else:
    # Every customer shares one signup date (or every date is missing):
    # the feature carries no signal, so use a constant.
    profile_data["SignupDate"] = 0.0

# One-hot encode Region; rows stay aligned with profile_data through its
# CustomerID index.
ohe = OneHotEncoder()
region_encoded = ohe.fit_transform(profile_data[["Region"]]).toarray()
region_df = pd.DataFrame(
    region_encoded,
    index=profile_data.index,
    columns=ohe.get_feature_names_out(["Region"]),
)

In [9]:
# Combine product purchase data and profile data
# The three frames are placed side by side, aligned on their CustomerID index.
feature_blocks = [
    customer_product_matrix,
    region_df,
    profile_data[["SignupDate"]],
]
# Cells left empty by the alignment (e.g. a customer with no row in one of the
# frames) are replaced with 0.
combined_features = pd.concat(feature_blocks, axis="columns").fillna(0)

In [10]:
# Step 3: Calculate Cosine Similarity
# Pairwise cosine similarity between every pair of customer feature rows,
# wrapped in a DataFrame labelled by customer on both axes.
customer_ids = combined_features.index
similarity_matrix = cosine_similarity(combined_features)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [11]:
# Step 4: Get Top 3 Similar Customers for C0001 - C0020
# The targets are the first 20 rows of the customers table; this equals
# C0001 - C0020 only if the file is ordered by CustomerID -- TODO confirm.
target_customer_ids = customers["CustomerID"].head(20)

lookalike_results = {}
for customer_id in target_customer_ids:
    # Scores of all other customers against this one (self-match removed).
    scores_vs_others = similarity_df[customer_id].drop(index=customer_id)
    # Highest scores first, keep the best three.
    similar_customers = scores_vs_others.sort_values(ascending=False).head(3)
    # Stored as a list of (other_customer_id, score) pairs.
    lookalike_results[customer_id] = list(similar_customers.items())

In [14]:
# Step 5: Build a nested view of the lookalike results
# One record per customer: its ID plus a list of
# {"similar_cust_id", "score"} dicts. Nothing is written to disk in this cell.
lookalike_output = []
for cust_id, similar_custs in lookalike_results.items():
    nested_matches = [
        {"similar_cust_id": match_id, "score": match_score}
        for match_id, match_score in similar_custs
    ]
    lookalike_output.append(
        {"cust_id": cust_id, "similar_customers": nested_matches}
    )

In [16]:
# Flatten the per-customer results into one row per (customer, lookalike) pair.
flat_rows = [
    {"cust_id": source_id, "similar_cust_id": match_id, "score": match_score}
    for source_id, matches in lookalike_results.items()
    for match_id, match_score in matches
]
lookalike_df = pd.DataFrame(flat_rows)

# Save to CSV (path is relative to the working directory; the row index is omitted)
lookalike_df.to_csv("SaiTeja_TBV_Lookalike.csv", index=False)

print("Lookalike model results saved!")

Lookalike model results saved!
