# Campaign Engine Prototype

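This notebook walks through each step of the LLM-powered campaign engine: loading transactions, embedding each user's recent events, clustering users, ranking clusters against a campaign goal, assigning campaigns by propensity, selecting assignments under a budget, and generating LLM rationales.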

---

In [None]:
!pip install openai pandas scikit-learn faiss-cpu matplotlib ipywidgets

In [None]:
import os
import pandas as pd
import numpy as np
import faiss
import openai
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from io import StringIO

# Read the OpenAI API key from the environment so it is never stored in the notebook.
# Fail here with a clear message instead of sending a placeholder key and hitting an
# authentication error at the first API call further down.
_openai_api_key = os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment that launches "
        "Jupyter (or set os.environ['OPENAI_API_KEY']) and re-run this cell."
    )
openai.api_key = _openai_api_key

## Step 1: Load & Clean Data

In [None]:
# Update the path if needed
DATA_PATH = "data/transactions.csv"
df = pd.read_csv(DATA_PATH)

# Normalize the customer identifier column to `user_id`.
# The first alias found wins, matching the order "Customer ID" then "customer_id".
for _alias in ("Customer ID", "customer_id"):
    if _alias in df.columns:
        df = df.rename(columns={_alias: "user_id"})
        break

# Fail fast with a readable message if the file does not have the columns that
# this cell and the sampling step rely on.
REQUIRED_COLUMNS = ["user_id", "Merchant", "Category", "Amount (INR)", "Timestamp"]
_missing_cols = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if _missing_cols:
    raise KeyError(f"{DATA_PATH} is missing required columns: {_missing_cols}")

# Build event_text as "<merchant> | <category> | ₹<amount>".
# Missing merchant/category values are replaced with "Unknown": concatenating a
# string with NaN would make the whole event_text NaN, and that NaN would later be
# passed to the embedding request as if it were text.
_merchant = df["Merchant"].fillna("Unknown").astype(str)
_category = df["Category"].fillna("Unknown").astype(str)
df["event_text"] = _merchant + " | " + _category + " | ₹" + df["Amount (INR)"].astype(str)
df.head()

## Step 2: Sample Events & Embed

In [None]:
# Sample up to 5 most recent events per user.
# NOTE(review): "most recent" relies on `Timestamp` sorting chronologically as stored
# (e.g. ISO-8601 strings or parsed datetimes) — confirm against the data file.
df_sorted = df.sort_values("Timestamp")
sampled = df_sorted.groupby("user_id").tail(5).reset_index(drop=True)

# Embed sampled texts in batches: the embeddings endpoint caps how many inputs a
# single request may carry, so one request for the whole sample stops working once
# the data grows.
EMBED_MODEL = "text-embedding-ada-002"
EMBED_BATCH_SIZE = 512
texts = sampled["event_text"].tolist()
_vectors = []
for _start in range(0, len(texts), EMBED_BATCH_SIZE):
    resp = openai.Embedding.create(
        model=EMBED_MODEL,
        input=texts[_start:_start + EMBED_BATCH_SIZE],
    )
    # Order by the response's `index` field so vectors line up with `texts`
    # regardless of the order in which the items are returned.
    _vectors.extend(
        item["embedding"] for item in sorted(resp["data"], key=lambda item: item["index"])
    )
embs = np.array(_vectors, dtype="float32")
assert len(embs) == len(sampled), "embedding count does not match number of sampled events"
sampled["_emb"] = list(embs)

# Aggregate per-user embedding: element-wise mean of that user's event vectors
user_vecs = sampled.groupby("user_id")["_emb"].apply(lambda vs: np.mean(vs.tolist(), axis=0))
users = user_vecs.index.tolist()
matrix = np.vstack(user_vecs.values)
matrix.shape

## Step 3: Global Clustering

In [None]:
# Partition the per-user vectors into a fixed number of clusters.
NUM_CLUSTERS = 5
km = KMeans(n_clusters=NUM_CLUSTERS, random_state=42)
km.fit(matrix)
labels, centroids = km.labels_, km.cluster_centers_

# Number of users per cluster, ordered by cluster index
(
    pd.Series(labels)
    .value_counts()
    .sort_index()
)

## Step 4: Visualize Cluster Distribution

In [None]:
# Count users per cluster with `np.bincount(..., minlength=NUM_CLUSTERS)` so there is
# always exactly one bar height per cluster id. `value_counts()` omits clusters that
# received no users, which would leave fewer heights than x positions and make
# `bar` raise a shape-mismatch error.
cluster_sizes = np.bincount(labels, minlength=NUM_CLUSTERS)

fig, ax = plt.subplots()
ax.bar(range(NUM_CLUSTERS), cluster_sizes)
ax.set_xticks(range(NUM_CLUSTERS))
ax.set_xlabel("Cluster")
ax.set_ylabel("Number of Users")
ax.set_title("Global Cluster Sizes")
plt.show()

## Step 5: Campaign Goal & Cluster Ranking

In [None]:
# Embed the campaign goal and order clusters from most to least similar centroid.
campaign_goal = "Weekend getaway for flight & hotel bookers"
goal_resp = openai.Embedding.create(
    model="text-embedding-ada-002",
    input=[campaign_goal],
)
gv = goal_resp["data"][0]["embedding"]
sims = cosine_similarity([gv], centroids)[0]
ranked = np.flip(np.argsort(sims))

# Top three cluster indices together with their similarity scores
ranked[:3], sims[ranked[:3]]

## Step 6: Define Campaigns & Compute Propensities

In [None]:
# Candidate campaigns, kept as inline CSV so the notebook is self-contained.
campaign_csv = """campaign_id,description,cost
A,10% off flight booking,100
B,₹500 cashback on hotel,80
C,Buy-1-Get-1 ride voucher,60"""

camp_df = pd.read_csv(StringIO(campaign_csv))
# Only the last expression of a cell is rendered, so a bare `camp_df` here would
# show nothing; `display` (available in IPython kernels) renders it explicitly.
display(camp_df)

# Embed campaign descriptions
descs = camp_df["description"].tolist()
resp = openai.Embedding.create(model="text-embedding-ada-002", input=descs)
camp_embs = np.array([d["embedding"] for d in resp["data"]], dtype="float32")

# Compute propensities: cosine similarity of every user vector to every campaign
sims_uc = cosine_similarity(matrix, camp_embs)

# For each user pick the most similar campaign in one vectorised step. The lookups
# are positional (NumPy arrays) rather than label-based `.loc`, so they do not
# depend on camp_df having a default 0..n-1 index.
best_idx = sims_uc.argmax(axis=1)
assign_df = pd.DataFrame({
    "user_id": users,
    "campaign_id": camp_df["campaign_id"].to_numpy()[best_idx],
    "propensity": sims_uc[np.arange(len(users)), best_idx].astype("float64"),
    "cost": camp_df["cost"].to_numpy(dtype="float64")[best_idx],
})
assign_df.head()

## Step 7: Budget-Constrained Selection

In [None]:
# Greedy selection: walk assignments from highest to lowest propensity-per-cost
# and fund each one that still fits within the budget.
total_budget = 100000
assign_df["roi"] = assign_df["propensity"] / assign_df["cost"]
df_sorted = assign_df.sort_values(by="roi", ascending=False)

spent = 0
keep = []
for cost in df_sorted["cost"].tolist():
    fits = spent + cost <= total_budget
    keep.append(fits)
    if fits:
        spent += cost
df_sorted["selected"] = keep

print("Total spent:", spent)
df_sorted.loc[df_sorted["selected"]].head()

## Step 8: LLM Rationales for Cohorts & Users

In [None]:
RATIONALE_MODEL = "gpt-4o-mini"
RATIONALE_SYSTEM_PROMPT = "You are a campaign reasoning assistant."


def ask_rationale(prompt):
    """Send one user prompt to the chat model and return its reply text.

    Args:
        prompt: Text sent as the user message, after the fixed system prompt.

    Returns:
        The content of the first completion choice, with surrounding
        whitespace stripped.
    """
    completion = openai.ChatCompletion.create(
        model=RATIONALE_MODEL,
        messages=[
            {"role": "system", "content": RATIONALE_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
    )
    return completion.choices[0].message.content.strip()


selected = df_sorted[df_sorted["selected"]]

# Collect each selected user's event texts once, in original row order, instead of
# re-scanning the whole transactions frame for every user inside the loop.
events_by_user = (
    df[df["user_id"].isin(selected["user_id"])]
    .groupby("user_id")["event_text"]
    .agg(list)
    .to_dict()
)

# Explain only the two clusters ranked closest to the campaign goal.
for cluster_idx in ranked[:2]:
    seg_users = [uid for uid, label in zip(users, labels) if label == cluster_idx]
    seg_selected = selected[selected["user_id"].isin(seg_users)]
    if seg_selected.empty:
        # No budget-selected users fall in this cluster; nothing to explain.
        continue
    print(f"### Cohort {cluster_idx}")

    # Cohort rationale
    cohort_prompt = (
        f"Campaign Goal: {campaign_goal}\n"
        f"Cohort Users: {seg_selected['user_id'].tolist()}\n"
        "Write a one-sentence rationale."
    )
    cohort_rationale = ask_rationale(cohort_prompt)
    print("Cohort rationale:", cohort_rationale)

    # Per-user rationales (one chat request per selected user in the cohort)
    for usr, camp in zip(seg_selected["user_id"], seg_selected["campaign_id"]):
        user_prompt = (
            f"User ID: {usr}\n"
            f"Event texts: {events_by_user.get(usr, [])}\n"
            f"Assigned Campaign: {camp}\n"
            "Write a one-line rationale.\n"
        )
        user_rationale = ask_rationale(user_prompt)
        print(usr, camp, user_rationale)
