Import

In [3]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("../data/raw/data.csv")

df["TransactionStartTime"] = pd.to_datetime(df["TransactionStartTime"])

snapshot_date = df["TransactionStartTime"].max() + pd.Timedelta(days=1)

RFM and Clustering

In [4]:
rfm = df.groupby("CustomerId").agg({
    "TransactionStartTime": lambda x: (snapshot_date - x.max()).days,
    "TransactionId": "count",
    "Amount": "sum"
}).rename(columns={"TransactionStartTime": "Recency", "TransactionId": "Frequency", "Amount": "Monetary"}).reset_index()

scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[["Recency", "Frequency", "Monetary"]])

kmeans = KMeans(n_clusters=3, random_state=42)
rfm["Cluster"] = kmeans.fit_predict(rfm_scaled)

high_risk_cluster = rfm.groupby("Cluster")[["Recency", "Frequency", "Monetary"]].mean().sort_values("Frequency").index[0]
rfm["is_high_risk"] = (rfm["Cluster"] == high_risk_cluster).astype(int)

rfm[["CustomerId", "is_high_risk"]].value_counts()

CustomerId       is_high_risk
CustomerId_1     1               1
CustomerId_4310  1               1
CustomerId_4294  0               1
CustomerId_4297  0               1
CustomerId_4298  0               1
                                ..
CustomerId_2760  1               1
CustomerId_2761  0               1
CustomerId_2762  1               1
CustomerId_2764  0               1
CustomerId_998   0               1
Name: count, Length: 3742, dtype: int64

Merge with main data

In [5]:
df = df.merge(rfm[["CustomerId", "is_high_risk"]], on="CustomerId", how="left")
df.to_csv("../data/processed/labeled_data.csv", index=False)