# In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Label every row of the engineered-features table with a channel-level
# "risk" flag obtained by clustering per-channel feature averages.
INPUT_PATH = "data/processed/engineered_features.csv"
OUTPUT_PATH = "data/processed/labeled_risk.csv"
FEATURE_COLUMNS = ["has_image", "hour", "text_length"]
N_CLUSTERS = 3

df = pd.read_csv(INPUT_PATH)

# One row per channel holding the mean of each feature column.
agg = (
    df.groupby("channel")
    .agg({column: "mean" for column in FEATURE_COLUMNS})
    .reset_index()
)

# Standardise the features so no single column dominates the Euclidean
# distances KMeans relies on.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(agg[FEATURE_COLUMNS])

# KMeans clustering. n_init is pinned explicitly: scikit-learn changed its
# default (10 -> 'auto'), so leaving it unset makes the cluster assignments,
# and therefore the risk labels, depend on the installed version even with a
# fixed random_state.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
agg["cluster"] = kmeans.fit_predict(X_scaled)

# Heuristic: the cluster whose channels have the lowest average `has_image`
# value is treated as the risky one (risk = 1); every other cluster gets 0.
# NOTE(review): the original comment described this as "lowest activity" --
# confirm that `has_image` is the intended proxy.
cluster_image_share = agg.groupby("cluster")["has_image"].mean()
risky_cluster = cluster_image_share.idxmin()
agg["risk"] = (agg["cluster"] == risky_cluster).astype(int)

# Merge back: broadcast the channel-level flag onto every original row.
# NOTE(review): this is pandas' default inner join, so rows whose channel has
# no entry in `agg` (e.g. a missing channel value) are presumably dropped
# from the output -- confirm that is intended.
df = df.merge(agg[["channel", "risk"]], on="channel")
df.to_csv(OUTPUT_PATH, index=False)
df.head()
