# Simple Reinforcement Learning Pipeline

This notebook shows a clean, minimal RL pipeline for dynamic car pricing using the local `CarPricingEnv`.
It uses **tabular Q-learning** (no deep learning required) so each step is easy to understand and modify.


## 1) Imports and reproducibility


In [None]:
from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, List

import numpy as np
import matplotlib.pyplot as plt

from env import CarPricingEnv, build_constraint_fn, default_terminal_reward

# Seeded generator shared by the environment (passed as `rng=`) and by the
# agent's epsilon-greedy exploration.
rng = np.random.default_rng(seed=7)


## 2) Define a small environment setup


In [None]:
# Feature dict for the single synthetic listing used throughout the notebook;
# "p0" is the entry that `p0_fn` reads back.
x: Dict = {"p0": 50_000.0}

def p0_fn(features: Dict) -> float:
    """Return the ``"p0"`` entry of ``features`` coerced to ``float``.

    Raises ``KeyError`` if the entry is missing.
    """
    base_value = features["p0"]
    return float(base_value)

def hazard_fn(features: Dict, t: int, delta: float) -> float:
    """Return a sale probability in (0, 1) from a logistic model of ``delta`` and ``t``.

    The score falls as ``delta`` rises (cheaper listings sell faster) and grows
    with ``t`` (time pressure increases conversion). ``features`` is accepted
    for interface compatibility but is not used by this synthetic model.
    """
    score = -2.2 - 1.4 * delta + 0.12 * t
    decay = np.exp(-score)
    return float(1 / (1 + decay))

# Scalar settings handed to the environment.
t_max = 25
holding_cost = 60.0

# Constraint built for the delta band [-0.25, 0.25].
constraint_fn = build_constraint_fn(delta_max=0.25, delta_min=-0.25)

# The environment receives the synthetic model functions defined above and the
# notebook-level `rng`, so its randomness comes from the same seeded generator.
env = CarPricingEnv(
    p0_fn=p0_fn,
    hazard_fn=hazard_fn,
    constraint_fn=constraint_fn,
    terminal_reward_fn=default_terminal_reward,
    holding_cost=holding_cost,
    t_max=t_max,
    rng=rng,
)


## 3) Build a tiny tabular Q-learning agent


In [None]:
# 21 evenly spaced delta levels on [-0.25, 0.25], rounded to 3 decimals.
delta_grid = np.linspace(-0.25, 0.25, num=21).round(3)
# Values passed to `env.step`; one Q-table column per entry.
action_set = np.asarray([-0.03, -0.01, 0.0, 0.01, 0.03], dtype=float)

# Maps (t, delta level) to a Q-table row for t = 1..t_max. Rows are laid out
# t-major: all delta levels for t=1 first, then t=2, and so on.
state_index = {
    (t, float(level)): (t - 1) * len(delta_grid) + pos
    for t in range(1, t_max + 1)
    for pos, level in enumerate(delta_grid)
}
# Q-values start at zero: one row per state, one column per action.
q_table = np.zeros((len(state_index), len(action_set)), dtype=float)

@dataclass
class TrainConfig:
    """Hyperparameters for the tabular Q-learning run."""

    # Number of training episodes.
    episodes: int = 4_000
    # Step size applied to the TD error in the Q-table update.
    alpha: float = 0.08
    # Discount applied to the bootstrapped next-state value.
    gamma: float = 0.98
    # Exploration rate at the first episode.
    eps_start: float = 1.0
    # Exploration rate at the last episode.
    eps_end: float = 0.05


cfg = TrainConfig()

def discretize_delta(delta: float) -> float:
    """Snap ``delta`` to the closest value in ``delta_grid``.

    On an exact tie the lower-indexed grid point wins, because ``argmin``
    returns the first minimum.
    """
    gaps = np.abs(delta_grid - delta)
    nearest = int(gaps.argmin())
    return float(delta_grid[nearest])

def epsilon(episode: int, total: int) -> float:
    """Return the exploration rate for ``episode`` under a linear schedule.

    The rate moves linearly from ``cfg.eps_start`` at episode 0 to
    ``cfg.eps_end`` at episode ``total - 1``.

    Args:
        episode: Zero-based index of the current episode.
        total: Number of episodes the schedule spans.

    Returns:
        The interpolated rate. Episodes outside ``[0, total - 1]`` are clamped
        to the nearest end of the schedule instead of being extrapolated.
    """
    # max(..., 1) avoids a zero division when total <= 1.
    progress = episode / max(total - 1, 1)
    # Without the clamp, episode >= total would push the rate below eps_end
    # (eventually negative), e.g. when training is continued past `total`.
    progress = min(max(progress, 0.0), 1.0)
    return cfg.eps_start + progress * (cfg.eps_end - cfg.eps_start)

def pick_action(s_idx: int, eps: float) -> int:
    """Choose a column index into ``action_set`` epsilon-greedily.

    With probability ``eps`` a uniformly random index is returned; otherwise
    the index of the largest Q-value in row ``s_idx`` (first one on ties).
    """
    explore = rng.random() < eps
    if not explore:
        return int(q_table[s_idx].argmax())
    return int(rng.integers(len(action_set)))


## 4) Train


In [None]:
# Sum of rewards collected in each training episode, in episode order.
episode_returns: List[float] = []

for ep in range(cfg.episodes):
    # The exploration rate depends only on the episode index, so compute it
    # once per episode rather than on every environment step.
    eps = epsilon(ep, cfg.episodes)

    state = env.reset(x=x, delta=0.0)
    done = False
    total_reward = 0.0

    while not done:
        # Look up the Q-table row for the current (t, discretized delta).
        d_disc = discretize_delta(state.delta)
        s_idx = state_index[(state.t, d_disc)]

        a_idx = pick_action(s_idx, eps)
        action = float(action_set[a_idx])

        step = env.step(action)
        next_state = step.state
        reward = step.reward
        done = step.done

        if done:
            # Terminal transition: nothing to bootstrap from.
            td_target = reward
        else:
            nd_disc = discretize_delta(next_state.delta)
            ns_idx = state_index[(next_state.t, nd_disc)]
            td_target = reward + cfg.gamma * np.max(q_table[ns_idx])

        # Q-learning update: move the estimate a fraction alpha toward the target.
        q_table[s_idx, a_idx] += cfg.alpha * (td_target - q_table[s_idx, a_idx])
        total_reward += reward
        state = next_state

    episode_returns.append(total_reward)

print(f"Training complete. Mean reward (last 200 episodes): {np.mean(episode_returns[-200:]):.2f}")


## 5) Quick diagnostics


In [None]:
# Smooth the per-episode returns with a moving average of up to 100 episodes.
# The window is capped at the number of recorded episodes: with fewer samples
# than the window, np.convolve(..., mode="valid") swaps its inputs and the
# result is no longer a moving average of the returns.
window = max(1, min(100, len(episode_returns)))
moving_avg = np.convolve(episode_returns, np.ones(window) / window, mode="valid")

plt.figure(figsize=(8, 4))
plt.plot(moving_avg)
plt.title("Training reward (moving average)")
plt.xlabel("Episode")
plt.ylabel("Average return")
plt.grid(alpha=0.3)
plt.show()


## 6) Run one greedy rollout (learned policy)


In [None]:
# Start a fresh episode and always take the argmax action from the learned
# Q-table (no exploration), recording one row per step.
state = env.reset(x=x, delta=0.0)
total_reward = 0.0
trace = []
done = False

while not done:
    d_disc = discretize_delta(state.delta)
    s_idx = state_index[(state.t, d_disc)]
    a_idx = int(q_table[s_idx].argmax())
    action = float(action_set[a_idx])
    result = env.step(action)

    # `state` has not been reassigned yet, so t/delta are read from the state
    # object the action was chosen for.
    trace.append(
        dict(
            t=state.t,
            delta=state.delta,
            action=action,
            reward=result.reward,
            sold=result.event,
        )
    )

    total_reward += result.reward
    state, done = result.state, result.done

# Notebook display: first ten steps of the rollout and the episode's total reward.
(trace[:10], total_reward)


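You now have a full RL loop in one notebook: environment setup, training, and evaluation.
To adapt this to production data, replace `hazard_fn` and `p0_fn` with model-backed functions; the training loop itself can stay unchanged.
Note that the Q-table is indexed only by `(t, discretized delta)`, so a policy trained this way is specific to the single listing `x` it was trained on.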
