In [None]:
import sys
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

from pathlib import Path

# Resolve the project root so the cell can be re-run safely.
# The original always took the parent of the current directory; because the
# cell also chdir()s into that parent, every re-run climbed one level higher
# and appended one more entry to sys.path.
# NOTE(review): assumes the `src` package imported below sits directly under
# the project root and that the notebook's own folder has no `src` directory.
_start_dir = Path().resolve()
project_root = _start_dir if (_start_dir / "src").is_dir() else _start_dir.parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
# All relative paths used later in the notebook are resolved from the root.
os.chdir(project_root)
current_dir = os.getcwd()

from src.config import (
    RAW_FILENAME,
    TEXT_COL,
    NUMERIC_COLS,
    CATEGORIC_COLS,
    TARGET,
    TEST_SIZE,
    RANDOM_STATE,
)

In [None]:
# Load the raw transactions CSV from <working dir>/data/raw/<RAW_FILENAME>.
raw_dir = os.path.join(current_dir, "data", "raw")
data_path = os.path.join(raw_dir, RAW_FILENAME)
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(5)

In [None]:
# --- Derived columns used by the plots below --------------------------------

# Split CATEGORY on the first ":" into two whitespace-stripped levels.
# reindex() guarantees two columns even when no value contains ":" (in that
# case split() returns a single column and the two-column assignment would
# raise); the second level is then all-NaN.
category_levels = (
    df["CATEGORY"]
    .str.split(":", n=1, expand=True)
    .apply(lambda col: col.str.strip())
    .reindex(columns=[0, 1])
)
df[["CATEGORY_LVL_1", "CATEGORY_LVL_2"]] = category_levels

# Unparseable dates become NaT instead of raising.
df["DATE_EMITTED"] = pd.to_datetime(df["DATE_EMITTED"], errors="coerce")

# True when the description is missing or contains only whitespace.
# astype(str) keeps the .str accessor usable even if the column holds no
# strings at all (e.g. it is entirely NaN); missing values are caught by isna().
df["DESCRIPTION_EMPTY"] = df["DESCRIPTION"].isna() | (
    df["DESCRIPTION"].astype(str).str.strip() == ""
)

# log(1 + |amount|): the sign is dropped before taking the log.
df["LOG_AMOUNT"] = np.log1p(df["AMOUNT"].abs())

# Weekday labels in pandas' dayofweek order (0 = Monday). The labels are in
# English despite the _FR suffix; the name is kept because other cells use it.
DAYS_FR = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

df["DAY_OF_WEEK"] = df["DATE_EMITTED"].dt.dayofweek
df["DAY_OF_WEEK_LABEL"] = df["DAY_OF_WEEK"].map(dict(enumerate(DAYS_FR)))
df["DAY_OF_MONTH"] = df["DATE_EMITTED"].dt.day
# Right-closed bins: days 1-10, 11-20 and 21-31.
df["MONTH_THIRD"] = pd.cut(
    df["DAY_OF_MONTH"],
    bins=[0, 10, 20, 31],
    labels=["Beginning (1-10)", "Middle (11-20)", "End (21-31)"],
)

In [None]:
# Print the column dtypes and non-null counts of the frame.
df.info()

In [None]:
# Output folder for every figure produced by this notebook; created on demand.
fig_path = os.path.join(current_dir, "figures", "EDA")
Path(fig_path).mkdir(parents=True, exist_ok=True)


def save(fig, name):
    """Write `fig` into `fig_path` under file name `name` (tight bbox, 150 dpi)."""
    destination = os.path.join(fig_path, name)
    fig.savefig(destination, dpi=150, bbox_inches="tight")

In [None]:
# Number of transactions per category, drawn as horizontal bars and annotated
# with each category's share of all rows.
cat_counts = df["CATEGORY"].value_counts()
n_rows = len(df)
label_gap = n_rows * 0.002  # horizontal space between a bar's end and its label

fig, ax = plt.subplots(figsize=(10, 12))
bars = sns.barplot(x=cat_counts.values, y=cat_counts.index, palette="Blues_r", ax=ax)

# Bars are drawn in the same order as cat_counts, so they can be paired up.
for patch, n_in_category in zip(ax.patches, cat_counts.values):
    share = n_in_category / n_rows * 100
    label_x = patch.get_width() + label_gap
    label_y = patch.get_y() + patch.get_height() / 2
    ax.text(label_x, label_y, f"{share:.1f}%", va="center", fontsize=9)


def _with_thousands_separator(value, _position):
    """Format a tick value as an integer with thousands separators."""
    return f"{int(value):,}"


ax.set_title("Categories distribution", fontsize=13, fontweight="bold")
ax.set_xlabel("Number of transactions")
ax.xaxis.set_major_formatter(mticker.FuncFormatter(_with_thousands_separator))
save(fig, "category_distribution.png")

In [None]:
# Percentage of transactions with a missing/blank description, per category,
# sorted from the highest rate to the lowest.
empty_rate = (
    df.groupby("CATEGORY")["DESCRIPTION_EMPTY"].mean().sort_values(ascending=False)
    * 100
)

fig, ax = plt.subplots(figsize=(10, 12))
sns.barplot(x=empty_rate.values, y=empty_rate.index, ax=ax)
ax.set_title("% empty description per category", fontsize=13, fontweight="bold")
ax.set_xlabel("% empty descriptions")
# No legend here: the chart has no labelled artist, so calling ax.legend()
# only produced a "no artists with labels found" warning.
save(fig, "empty_description_rate.png")

In [None]:
# Box plot of the transaction amount per category, categories ordered by
# decreasing median amount.
fig, ax = plt.subplots(figsize=(12, 12))
order = df.groupby("CATEGORY")["AMOUNT"].median().sort_values(ascending=False).index

sns.boxplot(
    data=df,
    x="AMOUNT",
    y="CATEGORY",
    order=order,
    palette="coolwarm",
    # Small, translucent outlier markers so they do not drown the boxes.
    flierprops=dict(marker=".", markersize=2, alpha=0.4),
    ax=ax,
)
# Title typos fixed ("pet cateogry" -> "per category").
ax.set_title(
    "Distribution of transaction amount per category", fontsize=13, fontweight="bold"
)
ax.set_xlabel("Amount (€)")

# Zoom on the central 90 % of amounts, with a 50 % margin on each side.
# The margin is added away from the data on both ends: multiplying the
# quantile by 1.5 (as before) only widens the view when p05 < 0 < p95 and
# clips data when a quantile lies on the other side of zero. For p05 < 0 < p95
# the limits are unchanged.
p95 = df["AMOUNT"].quantile(0.95)
p05 = df["AMOUNT"].quantile(0.05)
ax.set_xlim(p05 - 0.5 * abs(p05), p95 + 0.5 * abs(p95))
save(fig, "amount_by_category_boxplot.png")

In [None]:
# Contingency table category x payment type, then each row converted to
# percentages so that a category's cells sum to 100.
pivot = df.groupby(["CATEGORY", "TYPE_OF_PAYMENT"]).size().unstack(fill_value=0)
pivot_pct = pivot.div(pivot.sum(axis=1), axis=0) * 100

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    pivot_pct,
    annot=True,
    fmt=".0f",
    cmap="YlOrRd",
    linewidths=0.5,
    cbar_kws={"label": "% of transactions"},
    ax=ax,
)
# Title typo fixed ("Paiment" -> "Payment").
ax.set_title("Payment type × Category (% row)", fontsize=13, fontweight="bold")
ax.set_xlabel("Payment type")
save(fig, "payment_type_heatmap.png")

In [None]:
# Debit / credit split per category, as row percentages.
# NOTE(review): the rename treats SIDE == 0 as debit and any other value as
# credit — confirm SIDE is a 0/1 flag.
side_pct_norm = (
    df.groupby(["CATEGORY", "SIDE"])
    .size()
    .unstack(fill_value=0)
    .rename(columns=lambda side: "Débit" if side == 0 else "Crédit")
    .pipe(lambda counts: counts.div(counts.sum(axis=1), axis=0) * 100)
    # Sort on the credit share when that column exists, else on the debit
    # share. The previous test looked for the label "Crédit" among the raw
    # SIDE values, so it never matched and the sort always used "Débit"
    # (raising KeyError when no debit row existed).
    .pipe(
        lambda pct: pct.sort_values("Crédit" if "Crédit" in pct.columns else "Débit")
    )
)

fig, ax = plt.subplots(figsize=(10, 12))
side_pct_norm.plot(
    kind="barh",
    stacked=True,
    ax=ax,
    width=0.7,
)
# Reference line at an even debit/credit split.
ax.axvline(50, color="black", linestyle="--", linewidth=0.8)
ax.set_title("Debit / Credit ratio per category", fontsize=13, fontweight="bold")
ax.set_xlabel("% of transactions")
ax.set_ylabel("")
ax.legend(loc="lower right")
save(fig, "debit_credit_ratio.png")

In [None]:
# For each category, the percentage of its transactions that belong to its
# ten most frequent merchants.
coverage = [
    {
        "CATEGORY": category_name,
        "TOP10_COVERAGE_PCT": (
            rows["MERCHANT_NAME"].value_counts().head(10).sum() / len(rows) * 100
        ),
    }
    for category_name, rows in df.groupby("CATEGORY")
]
coverage_df = pd.DataFrame(coverage).sort_values("TOP10_COVERAGE_PCT", ascending=True)

fig, ax = plt.subplots(figsize=(10, 12))
sns.barplot(data=coverage_df, x="TOP10_COVERAGE_PCT", y="CATEGORY", ax=ax)
# Visual marker for categories where ten merchants cover half the volume.
ax.axvline(50, label="50% threshold", color="red", linestyle="--", linewidth=1.2)
title_style = dict(fontsize=13, fontweight="bold")
ax.set_title("% of transactions covered by the top 10 merchants", **title_style)
ax.set_xlabel("Coverage by the top 10 (%)")
ax.set_ylabel("")
ax.legend()
save(fig, "merchant_coverage_by_category.png")

In [None]:
# Assemble the numeric view of the data: raw/log amount, side, empty flag,
# integer-coded payment type, weekday and month number.
emitted = df["DATE_EMITTED"].dt
num_features = df[["AMOUNT", "LOG_AMOUNT", "SIDE", "DESCRIPTION_EMPTY"]].assign(
    PAYMENT_ENCODED=df["TYPE_OF_PAYMENT"].astype("category").cat.codes,
    DAY_OF_WEEK=emitted.dayofweek,
    MONTH_NUM=emitted.month,
)

corr = num_features.corr()

fig, ax = plt.subplots(figsize=(8, 6))
# Hide the upper triangle (diagonal included): the matrix is symmetric.
mask = np.triu(np.ones(corr.shape, dtype=bool))
heatmap_options = dict(
    annot=True,
    fmt=".2f",
    cmap="RdBu_r",
    center=0,
    vmin=-1,
    vmax=1,
    linewidths=0.5,
)
sns.heatmap(corr, mask=mask, ax=ax, **heatmap_options)
ax.set_title("Correlation of numerical features", fontsize=13, fontweight="bold")
save(fig, "numerical_correlation.png")

In [None]:
# Monthly transaction counts for the nb_cat most frequent categories.
nb_cat = 20

# "YYYY-MM" label per row. Rows whose date is NaT get the literal string
# "NaT" in this column; the column itself is kept unchanged.
df["MONTH"] = df["DATE_EMITTED"].dt.to_period("M").astype(str)
# Count only rows with a valid date, otherwise "NaT" shows up as an extra,
# meaningless month at the end of the x axis.
dated_rows = df[df["DATE_EMITTED"].notna()]
monthly = dated_rows.groupby(["MONTH", "CATEGORY"]).size().unstack(fill_value=0)

top_x = cat_counts.head(nb_cat).index.tolist()
# reindex() instead of monthly[top_x]: a top category with no dated row gets
# a column of zeros rather than raising KeyError.
monthly_top_x = monthly.reindex(columns=top_x, fill_value=0)

fig, ax = plt.subplots(figsize=(14, 5))
monthly_top_x.plot(ax=ax, marker="o", markersize=3, linewidth=1.5)
ax.set_title(
    f"Monthly volumes — Top {nb_cat} categories", fontsize=13, fontweight="bold"
)
ax.set_xlabel("Month")
ax.set_ylabel("Nb transactions")
ax.tick_params(axis="x", rotation=45)
# Legend placed outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.01, 1), loc="upper left", fontsize=9)
save(fig, "monthly_transaction_volumes.png")

In [None]:
# Keep only the rows belonging to the TOP_N most frequent categories.
TOP_N = 20
category_frequency = df["CATEGORY"].value_counts()
top_cats = category_frequency.index[:TOP_N].tolist()
df_top = df.loc[df["CATEGORY"].isin(top_cats)]

In [None]:
# Weekday profile of each top category: counts per (category, weekday), with
# all seven weekday columns present and in DAYS_FR order.
dow_counts = df_top.groupby(["CATEGORY", "DAY_OF_WEEK_LABEL"]).size()
pivot_dow = dow_counts.unstack(fill_value=0).reindex(columns=DAYS_FR, fill_value=0)
# Row-normalise so that each category's cells sum to 100.
row_totals = pivot_dow.sum(axis=1)
pivot_dow_pct = pivot_dow.div(row_totals, axis=0) * 100

fig, ax = plt.subplots(figsize=(11, 15))
heatmap_options = dict(
    annot=True,
    fmt=".0f",
    cmap="YlOrRd",
    linewidths=0.5,
    cbar_kws={"label": "% transactions"},
)
sns.heatmap(pivot_dow_pct, ax=ax, **heatmap_options)
title = f"Weekly transaction distribution per category — Top {TOP_N}"
ax.set_title(title, fontsize=13, fontweight="bold")
ax.set_xlabel("")
save(fig, "weekly_transaction_distribution.png")

In [None]:
# Distribution of each top category's transactions over the three thirds of
# the month, as row percentages.
# MONTH_THIRD is categorical (built with pd.cut), so `observed` is passed
# explicitly: False keeps all three thirds as columns even if one of them has
# no transaction, and avoids depending on pandas' changing default (which
# triggers a FutureWarning on recent versions).
pivot_dom = (
    df_top.groupby(["CATEGORY", "MONTH_THIRD"], observed=False)
    .size()
    .unstack(fill_value=0)
)
pivot_dom_pct = pivot_dom.div(pivot_dom.sum(axis=1), axis=0) * 100

fig, ax = plt.subplots(figsize=(9, 15))
sns.heatmap(
    pivot_dom_pct,
    annot=True,
    fmt=".0f",
    cmap="YlOrRd",
    linewidths=0.5,
    cbar_kws={"label": "% category transactions"},
    ax=ax,
)
ax.set_title(
    f"Monthly transaction distribution per category — Top {TOP_N}\n",
    fontsize=13,
    fontweight="bold",
)
ax.set_xlabel("")
ax.set_ylabel("")
save(fig, "monthly_transaction_distribution.png")