In [1]:
import pandas as pd
import numpy as np

In [2]:
# Notebook-level placeholder, initialised as an empty list; presumably rebound
# to the loaded source data in a later cell before use — TODO confirm.
df_all = []

In [None]:
# Original R4W (rolling 4-week) table
def get_volume_r4w(df, start_wk, end_wk):
    """Build the rolling-4-week (R4W) volume table for every account and window.

    One window is created for each Friday between ``start_wk`` and ``end_wk``
    (inclusive); a window covers the half-open interval ``(end - 4 weeks, end]``.
    Every account in ``df`` gets one output row per window; windows in which
    the account has no rows get all metrics set to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``account_id``, ``account_name``, ``week``,
        ``a_stock``, ``b_stock`` and ``c_stock``. ``week`` is compared directly
        against ``Timestamp`` window bounds, so it has to be datetime-like.
    start_wk, end_wk : str or Timestamp
        Bounds passed to ``pd.date_range(..., freq='W-FRI')`` to generate the
        window end dates.

    Returns
    -------
    pandas.DataFrame
        Sorted by ``Account ID`` and ``rolling_order`` (0-based window index),
        with the columns ``rolling_4w_a_stock``, ``rolling_4w_b_stock``,
        ``c_stock``, ``a_freq``, ``b_freq``, ``rolling_4w_a_ms``,
        ``rolling_4w_b_ms``, the window description columns
        (``rolling_start``, ``rolling_end``, ``rolling_window_range_start``,
        ``rolling_window_range_end``) and ``account_name``.

    Notes
    -----
    ``a_freq`` / ``b_freq`` divide the number of rows with a positive value by
    the number of distinct weeks present in the window, so they can exceed 1
    when an account has several rows for the same week.

    A short diagnostic line is printed before returning: number of accounts in
    the input, number of accounts in the output, and the smallest number of
    windows any account received.
    """
    window_span = pd.Timedelta(weeks=4)
    key_cols = ['Account ID', 'rolling_order']
    metrics_cols = [
        'rolling_4w_a_stock', 'rolling_4w_b_stock', 'c_stock',
        'a_freq', 'b_freq',
        'rolling_4w_a_ms', 'rolling_4w_b_ms',
    ]
    window_cols = [
        'rolling_order', 'rolling_start', 'rolling_end',
        'rolling_window_range_start', 'rolling_window_range_end',
    ]

    # One row per window; explicit columns keep the frame usable even when the
    # date range contains no Friday.
    rolling_window_df = pd.DataFrame(
        [
            {
                'rolling_order': i,
                'rolling_start': end - window_span,
                'rolling_end': end,
                'rolling_window_range_start': (
                    (end - window_span).strftime('%Y-%m-%d') + ' to ' + end.strftime('%Y-%m-%d')
                ),
                'rolling_window_range_end': (
                    end.strftime('%Y-%m-%d') + ' to ' + (end + window_span).strftime('%Y-%m-%d')
                ),
            }
            for i, end in enumerate(pd.date_range(start=start_wk, end=end_wk, freq='W-FRI'))
        ],
        columns=window_cols,
    )

    # Step 1: build every account x rolling_order combination.
    all_accounts = df['account_id'].unique()
    all_orders = rolling_window_df['rolling_order'].unique()
    account_order_grid = pd.MultiIndex.from_product(
        [all_accounts, all_orders], names=key_cols
    ).to_frame(index=False)

    # Step 2: compute metrics only where data exists.
    volume_records = []
    for acct, df_acct in df.groupby('account_id', sort=False):
        for window in rolling_window_df.itertuples(index=False):
            in_window = (
                (df_acct['week'] > window.rolling_start)
                & (df_acct['week'] <= window.rolling_end)
            )
            df_window = df_acct[in_window]
            if df_window.empty:
                continue
            a_stock = df_window['a_stock'].sum()
            b_stock = df_window['b_stock'].sum()
            c_stock = df_window['c_stock'].sum()
            total = a_stock + b_stock + c_stock
            n_weeks = df_window['week'].nunique()
            volume_records.append({
                'Account ID': acct,
                'rolling_order': window.rolling_order,
                'rolling_4w_a_stock': a_stock,
                'rolling_4w_b_stock': b_stock,
                'c_stock': c_stock,
                'a_freq': (df_window['a_stock'] > 0).sum() / n_weeks,
                'b_freq': (df_window['b_stock'] > 0).sum() / n_weeks,
                # NaN here is turned into 0 by the fillna below.
                'rolling_4w_a_ms': a_stock / total if total > 0 else float('nan'),
                'rolling_4w_b_ms': b_stock / total if total > 0 else float('nan'),
            })
    # Explicit columns so the merge keys exist even with no records at all.
    volume_partial = pd.DataFrame(volume_records, columns=key_cols + metrics_cols)

    # Step 3: expand to the full grid, attach window descriptions, fill gaps.
    volume_r4w = (
        account_order_grid
        .merge(volume_partial, on=key_cols, how='left')
        .merge(rolling_window_df, on='rolling_order', how='left')
    )
    volume_r4w[metrics_cols] = volume_r4w[metrics_cols].astype(float).fillna(0)
    volume_r4w = volume_r4w.sort_values(key_cols)
    volume_r4w = volume_r4w.merge(
        df[['account_id', 'account_name']].drop_duplicates(),
        left_on='Account ID', right_on='account_id', how='left',
    ).drop(columns=['account_id'])

    # Diagnostic: input accounts, output accounts, fewest windows per account.
    windows_per_account = volume_r4w.groupby('Account ID')['rolling_order'].nunique()
    print(
        df['account_id'].nunique(),
        volume_r4w['Account ID'].nunique(),
        windows_per_account.min() if len(windows_per_account) else 0,
    )
    return volume_r4w
    
    

In [None]:
# Scaled-up version:
import pandas as pd
import numpy as np

# Recommended: enable pandas copy-on-write where the installed version offers
# the option, to reduce unnecessary copies.
try:
    pd.options.mode.copy_on_write = True
except (AttributeError, KeyError):
    # pandas' OptionError derives from both AttributeError and KeyError and is
    # raised when the option is not registered; the setting is only an
    # optimisation, so it is skipped on such versions.
    pass


def build_r4w_metrics(
    df: pd.DataFrame,
    start_wk: str | pd.Timestamp,
    end_wk: str | pd.Timestamp,
    *,
    week_col: str = "week",
    id_col: str = "account_id",
    name_col: str = "account_name",
    a_col: str = "a_stock",
    b_col: str = "b_stock",
    c_col: str = "c_stock",
    freq_denominator: str = "fixed4"  # "fixed4"（默认，分母恒等于4）或 "observed"（分母=窗口内实际观测周数）
) -> pd.DataFrame:
    """
    生成以“周五”为窗口终点的 R4W（滚动4周）指标（纯 pandas 向量化、高性能）
    Build R4W (rolling 4 weeks) metrics with Friday as window end (pure pandas, vectorized & fast)

    输入 Input
    -----
    df : 包含原始周度数据的 DataFrame；需包含：
         DataFrame with weekly rows; must contain:
         - id_col（门店/账户ID；e.g., "account_id"）
         - name_col（门店名；e.g., "account_name"）
         - week_col（周日期；可为YYYYMMDD/字符串/日期；会对齐到该周“周五”）
         - a_col / b_col（A/B品类销量）
         - c_col（可选；若不存在将按0处理）
    start_wk, end_wk : 统计区间（窗口终点的起/止周五）；如 "2025-01-03", "2025-10-03"
                       Friday range for window ends, e.g. "2025-01-03" to "2025-10-03"
    freq_denominator : "fixed4" 使用固定分母4（推荐，稳定）；"observed" 使用窗口内实际观测周数作为分母
                       Use "fixed4" (recommended) or "observed" for frequency denominator.

    输出 Output
    -----
    返回含以下列的 DataFrame（只保留窗口终点位于 [start_wk, end_wk] 的周五）：
    Returns a DataFrame with Friday window ends within [start_wk, end_wk], including:
      - {id_col}, {name_col}, {week_col}（窗口终点 Friday / window end Friday）
      - rolling_order（窗口序号，按时间1..N / sequential order 1..N）
      - rolling_start, rolling_end（窗口起止；(end-4周, end] / window bounds）
      - rolling_window_range_start / rolling_window_range_end（可读范围 / human-readable ranges）
      - rolling_4w_a_stock / rolling_4w_b_stock / rolling_4w_c_stock（近4周各品类总量 / 4-week sums）
      - total_stock（近4周总量 / 4-week total）
      - a_freq / b_freq / c_freq（近4周“>0的周”占比 / share of positive weeks in last 4）
      - rolling_4w_a_ms / rolling_4w_b_ms（近4周份额 / market share）
    """

    # ---------- 0) 选择所需列 & 统一周日期到“周五” ----------
    # Keep necessary columns only; unify week to the Friday of each week
    base_cols = [id_col, name_col, week_col, a_col, b_col]
    if c_col in df.columns:
        base_cols.append(c_col)
    df = df[base_cols].copy()

    # 将各种 week 表示（YYYYMMDD/字符串/日期）统一为 Timestamp，并对齐到该周周五
    # Normalize 'week' to Timestamp and align to Friday of the week (W-FRI)
    if not np.issubdtype(df[week_col].dtype, np.datetime64):
        s = df[week_col].astype(str)
        is_ymd = s.str.len().eq(8) & s.str.match(r"^\d{8}$")
        df.loc[is_ymd, week_col] = pd.to_datetime(s[is_ymd], format="%Y%m%d", errors="coerce")
        df.loc[~is_ymd, week_col] = pd.to_datetime(s[~is_ymd], errors="coerce")
    df[week_col] = df[week_col].dt.to_period("W-FRI").dt.to_timestamp("W-FRI")

    # ---------- 1) 缺失列与类型压缩 ----------
    # Fill missing c_col with 0 if absent; compress numeric dtypes
    if c_col not in df.columns:
        df[c_col] = 0

    for col in (a_col, b_col, c_col):
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype("Int32")

    # ---------- 2) 生成窗口终点（周五）序列 ----------
    # Build list of Friday window ends within [start_wk, end_wk]
    start_wk = pd.to_datetime(start_wk)
    end_wk = pd.to_datetime(end_wk)
    friday_ends = pd.date_range(start=start_wk, end=end_wk, freq="W-FRI")

    # 由于 rolling(4) 需要往前3周的数据，扩展一个最早日期 earliest_needed
    # For rolling(4), we need 3 more prior weeks
    earliest_needed = friday_ends.min() - pd.Timedelta(weeks=3)

    # ---------- 3) 聚合为“账号×周五”的周表，并对每个账号补齐完整周历 ----------
    # Aggregate to weekly (Friday) per account, then reindex each account to a full Friday calendar
    weekly = (
        df.groupby([id_col, name_col, week_col], as_index=False)[[a_col, b_col, c_col]].sum()
    )

    def _reindex_one(g: pd.DataFrame) -> pd.DataFrame:
        idx = pd.date_range(start=earliest_needed, end=end_wk, freq="W-FRI")
        g = g.set_index(week_col).reindex(idx, fill_value=0)
        g.index.name = week_col
        g = g.reset_index()
        g[id_col] = g[id_col].iloc[0]
        g[name_col] = g[name_col].iloc[0]
        return g[[id_col, name_col, week_col, a_col, b_col, c_col]]

    weekly_full = (
        weekly.groupby([id_col, name_col], group_keys=False)
              .apply(_reindex_one)
              .reset_index(drop=True)
              .sort_values([id_col, week_col])
    )

    # ---------- 4) 计算 R4W 的 sum / freq / ms（全向量化） ----------
    # Compute rolling-4-week sums, positive frequencies, and market shares (vectorized)
    gb = weekly_full.groupby(id_col, group_keys=False)

    # 近4周 sum（A/B/C）
    a_4 = gb[a_col].rolling(4, min_periods=1).sum().reset_index(drop=True)
    b_4 = gb[b_col].rolling(4, min_periods=1).sum().reset_index(drop=True)
    c_4 = gb[c_col].rolling(4, min_periods=1).sum().reset_index(drop=True)

    weekly_full["rolling_4w_a_stock"] = a_4.astype("float")
    weekly_full["rolling_4w_b_stock"] = b_4.astype("float")
    weekly_full["rolling_4w_c_stock"] = c_4.astype("float")
    weekly_full["total_stock"] = weekly_full["rolling_4w_a_stock"] + weekly_full["rolling_4w_b_stock"] + weekly_full["rolling_4w_c_stock"]

    # 近4周“有货周”计数（>0）/ count of positive weeks in the last 4
    a_pos = gb[a_col].apply(lambda s: (s > 0).rolling(4, min_periods=1).sum()).reset_index(drop=True)
    b_pos = gb[b_col].apply(lambda s: (s > 0).rolling(4, min_periods=1).sum()).reset_index(drop=True)
    c_pos = gb[c_col].apply(lambda s: (s > 0).rolling(4, min_periods=1).sum()).reset_index(drop=True)

    # 频率分母：fixed4 = 4；observed = 窗口内实际观测周数（首个窗口可能<4）
    # Frequency denominator: fixed4=4; observed=actual observed weeks in the window (first windows may be <4)
    if freq_denominator == "observed":
        denom = (gb[week_col].cumcount() + 1).clip(upper=4)
    else:
        denom = pd.Series(4, index=weekly_full.index)

    weekly_full["a_freq"] = (a_pos / denom).astype("float")
    weekly_full["b_freq"] = (b_pos / denom).astype("float")
    weekly_full["c_freq"] = (c_pos / denom).astype("float")

    # 近4周份额（market share）；total=0 时置为 0，避免 NaN/inf
    # 4-week market shares; if total==0, set to 0.0
    with np.errstate(divide="ignore", invalid="ignore"):
        weekly_full["rolling_4w_a_ms"] = np.where(
            weekly_full["total_stock"] > 0,
            weekly_full["rolling_4w_a_stock"] / weekly_full["total_stock"],
            0.0,
        )
        weekly_full["rolling_4w_b_ms"] = np.where(
            weekly_full["total_stock"] > 0,
            weekly_full["rolling_4w_b_stock"] / weekly_full["total_stock"],
            0.0,
        )

    # ---------- 5) 仅保留窗口终点位于 [start_wk, end_wk] 的周五，并生成 rolling_order ----------
    # Keep only Friday ends within [start_wk, end_wk] and add rolling_order
    out = weekly_full[weekly_full[week_col].isin(friday_ends)].copy()
    out = out.sort_values([id_col, week_col])

    ro_map = {d: i + 1 for i, d in enumerate(sorted(friday_ends))}
    out["rolling_order"] = out[week_col].map(ro_map).astype("Int32")

    # 窗口边界与范围（(end-4周, end]）
    # Window bounds and human-readable ranges
    out["rolling_end"] = out[week_col]
    out["rolling_start"] = out["rolling_end"] - pd.Timedelta(weeks=4)
    out["rolling_window_range_start"] = (
        out["rolling_start"].dt.strftime("%Y-%m-%d") + " to " + out["rolling_end"].dt.strftime("%Y-%m-%d")
    )
    out["rolling_window_range_end"] = (
        out["rolling_end"].dt.strftime("%Y-%m-%d") + " to " + (out["rolling_end"] + pd.Timedelta(weeks=4)).dt.strftime("%Y-%m-%d")
    )

    # 输出列顺序 / Final column order
    keep_cols = [
        id_col, name_col, week_col, "rolling_order",
        "rolling_start", "rolling_end",
        "rolling_window_range_start", "rolling_window_range_end",
        "rolling_4w_a_stock", "rolling_4w_b_stock", "rolling_4w_c_stock",
        "total_stock", "a_freq", "b_freq", "c_freq",
        "rolling_4w_a_ms", "rolling_4w_b_ms",
    ]
    out = out.loc[:, keep_cols]

    return out
