In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_excel('../data/raw/generalized_raw_input_for_dashboard.xlsx')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   entrp_ptnt_id  19 non-null     int64         
 1   store_nbr      18 non-null     float64       
 2   sale_dt        19 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 588.0 bytes


In [4]:
df.head(18)

Unnamed: 0,entrp_ptnt_id,store_nbr,sale_dt
0,1001,110.0,2025-12-07 09:00:00
1,1002,120.0,2025-05-26 10:00:00
2,1002,120.0,2025-11-22 11:00:00
3,1003,130.0,2023-06-26 09:00:00
4,1003,130.0,2025-11-12 15:00:00
5,1004,140.0,2025-05-26 08:30:00
6,1005,150.0,2021-11-03 12:00:00
7,1006,160.0,2025-12-02 13:00:00
8,1007,170.0,2025-12-10 09:00:00
9,1007,171.0,2025-12-10 18:00:00


In [5]:
# Sort data by patient ID (entrp_ptnt_id)
df_sorted = df.sort_values(by=['entrp_ptnt_id', 'sale_dt'], ascending=[True, True])
print(f"Data sorted by 'entrp_ptnt_id'")
print(f"Original data shape: {df.shape}")
print(f"Sorted data shape: {df_sorted.shape}")

print(f"\nFirst 10 rows of sorted data:")
print(df_sorted.head(10))

print(f"\nLast 10 rows of sorted data:")
print(df_sorted.tail(10))

# Update the original dataframe with sorted data
op_df = df_sorted
print(f"\nOriginal dataframe 'df' has been updated with sorted data")

Data sorted by 'entrp_ptnt_id'
Original data shape: (19, 3)
Sorted data shape: (19, 3)

First 10 rows of sorted data:
   entrp_ptnt_id  store_nbr             sale_dt
0           1001      110.0 2025-12-07 09:00:00
1           1002      120.0 2025-05-26 10:00:00
2           1002      120.0 2025-11-22 11:00:00
3           1003      130.0 2023-06-26 09:00:00
4           1003      130.0 2025-11-12 15:00:00
5           1004      140.0 2025-05-26 08:30:00
6           1005      150.0 2021-11-03 12:00:00
7           1006      160.0 2025-12-02 13:00:00
8           1007      170.0 2025-12-10 09:00:00
9           1007      171.0 2025-12-10 18:00:00

Last 10 rows of sorted data:
    entrp_ptnt_id  store_nbr             sale_dt
9            1007      171.0 2025-12-10 18:00:00
10           1008      180.0 2024-04-21 10:00:00
11           1008      180.0 2025-02-15 11:00:00
12           1009        NaN 2025-10-23 09:00:00
13           1009      190.0 2025-12-11 16:00:00
14           1010      200.0 2

In [6]:
df_sorted = df.sort_values(by=['entrp_ptnt_id', 'sale_dt'], ascending=[True, True])
op_df = df_sorted.copy()

In [7]:
"""Implementing the dummy row addition logic"""
# get first sale per patient (use min to ensure earliest first sale)
first_sales = op_df.groupby('entrp_ptnt_id', as_index=False)['sale_dt'].min()
first_sales['sale_dt'] = pd.to_datetime(first_sales['sale_dt'])  # normalize to datetime

# build new_rows: sale_dt = 3 years prior to the first sale
new_rows = first_sales.rename(columns={'sale_dt': 'orig_first_sale'})
new_rows['sale_dt'] = new_rows['orig_first_sale'] - pd.DateOffset(years=3)
new_rows = new_rows.drop(columns=['orig_first_sale'])

# Reindex to have the same columns as op_df (this will create missing cols with NaN)
new_rows = new_rows.reindex(columns=op_df.columns)

# Ensure sale_dt dtype matches op_df['sale_dt'] dtype
if pd.api.types.is_datetime64_any_dtype(op_df['sale_dt']):
    new_rows['sale_dt'] = pd.to_datetime(new_rows['sale_dt'])
else:
    # if op_df.sale_dt is plain date objects, keep date
    new_rows['sale_dt'] = pd.to_datetime(new_rows['sale_dt']).dt.date

# set required columns to defaults:
# - status = 'Unknown'
# - prev_store_nbr and recent_status should be NULL (pd.NA) per spec (prev_store_nbr = NULL before first purchase)
new_rows['status'] = 'Unknown'
new_rows['prev_store_nbr'] = pd.NA
new_rows['recent_status'] = pd.NA

# Concatenate and re-sort so the new rows come before the first actual sale
op_df = pd.concat([op_df, new_rows], ignore_index=True, sort=False)
op_df = op_df.sort_values(['entrp_ptnt_id', 'sale_dt']).reset_index(drop=True)

op_df['store_nbr'] = pd.to_numeric(op_df['store_nbr'], errors='coerce').astype('Int64')
#op_df.head(20)



In [8]:
"""DAYS SINCE PREVIOUS PURCHASE LOGIC"""
#retrieve data of specific patient
#print(op_df[op_df['entrp_ptnt_id'] == 1001])

op_df['sale_dt'] = pd.to_datetime(op_df['sale_dt'])


#new column for first purchase
#op_df['first_purchase'] = op_df.groupby('entrp_ptnt_id')['sale_dt'].transform('min')

#new column with days since previous purchase
op_df['prev_purchase_date'] = op_df.groupby('entrp_ptnt_id')['sale_dt'].shift(1)
op_df['days_since_prev_purchase'] = (op_df['sale_dt'] - op_df['prev_purchase_date']).dt.days.astype('Int64')

print()
print(op_df.head())


   entrp_ptnt_id  store_nbr             sale_dt   status prev_store_nbr  \
0           1001       <NA> 2022-12-07 09:00:00  Unknown            NaN   
1           1001        110 2025-12-07 09:00:00      NaN            NaN   
2           1002       <NA> 2022-05-26 10:00:00  Unknown            NaN   
3           1002        120 2025-05-26 10:00:00      NaN            NaN   
4           1002        120 2025-11-22 11:00:00      NaN            NaN   

  recent_status  prev_purchase_date  days_since_prev_purchase  
0           NaN                 NaT                      <NA>  
1           NaN 2022-12-07 09:00:00                      1096  
2           NaN                 NaT                      <NA>  
3           NaN 2022-05-26 10:00:00                      1096  
4           NaN 2025-05-26 10:00:00                       180  


In [None]:
from datetime import timedelta

def add_status_and_recent_status(op_df, as_of_date=None):
    df = op_df.copy()
    df["sale_dt"] = pd.to_datetime(df["sale_dt"], errors="coerce")

    # normalize as_of_date
    if as_of_date is None:
        as_of_date = pd.Timestamp.today().normalize()
    else:
        as_of_date = pd.to_datetime(as_of_date).normalize()

    # 1) Identify dummy rows (keep them untouched)
    dummy_mask = (df.get("status") == "Unknown") & (df.get("recent_status").isna())
    df_dummy = df[dummy_mask].copy()
    df_real = df[~dummy_mask].copy()

    df_real = df_real.sort_values(["entrp_ptnt_id", "sale_dt"])

    out_rows = []

    # boundaries: offsets (in days) and corresponding status
    boundaries = [
        (111, "Inactive"),
        (366, "Lapsed"),
        (1096, "Lost"),
    ]

    for pid, g in df_real.groupby("entrp_ptnt_id"):
        g = g.sort_values("sale_dt")

        if g["sale_dt"].isna().all():
            continue

        last_purchase_date = None  # normalized date
        last_store = None
        first = True

        for _, row in g.iterrows():
            sale_dt = row["sale_dt"]
            if pd.isna(sale_dt):
                continue  # skip invalid rows

            sale_date = sale_dt.normalize()
            store = row.get("store_nbr", last_store)

            if first:
                # First real purchase
                base = row.to_dict()
                base["status"] = "Active"
                base["recent_status"] = "Recently New"
                out_rows.append(base)
                first = False
            else:
                # gap from previous purchase to this purchase
                gap_days = (sale_date - last_purchase_date).days

                # --- synthetic rows between last_purchase_date and this purchase ---
                for offset, status_val in boundaries:
                    boundary_date = last_purchase_date + timedelta(days=offset)
                    # Only create synthetic row if boundary happened before next purchase
                    # AND boundary has already occurred (<= as_of_date)
                    if (boundary_date < sale_date) and (boundary_date <= as_of_date):
                        synth = {col: pd.NA for col in df_real.columns}
                        synth["entrp_ptnt_id"] = pid
                        synth["sale_dt"] = boundary_date
                        synth["status"] = status_val
                        synth["recent_status"] = "Inactive Continuing"
                        synth["store_nbr"] = last_store
                        out_rows.append(synth)

                # --- logic for the purchase row itself ---
                if gap_days <= 365:
                    recent = "Active Continuing"
                elif gap_days <= 1095:
                    recent = "Recently Reactivated"
                else:
                    recent = "Recently New"

                base = row.to_dict()
                base["status"] = "Active"
                base["recent_status"] = recent
                out_rows.append(base)

            last_purchase_date = sale_date
            last_store = store

        # --- AFTER last purchase: synthetic rows up to as_of_date ---
        if last_purchase_date is not None and last_purchase_date < as_of_date:
            # 1) boundaries (Inactive/Lapsed/Lost) strictly BEFORE as_of_date
            for offset, status_val in boundaries:
                boundary_date = last_purchase_date + timedelta(days=offset)
                if (boundary_date < as_of_date):
                    synth = {col: pd.NA for col in df_real.columns}
                    synth["entrp_ptnt_id"] = pid
                    synth["sale_dt"] = boundary_date
                    synth["status"] = status_val
                    synth["recent_status"] = "Inactive Continuing"
                    synth["store_nbr"] = last_store
                    out_rows.append(synth)

            # 2) final synthetic row exactly at as_of_date (represents today's snapshot)
            days_since_last = (as_of_date - last_purchase_date).days

            # map days_since_last to status_today (same thresholds as boundaries)
            if days_since_last <= 110:
                status_today = "Active"
            elif days_since_last <= 365:
                status_today = "Inactive"
            elif days_since_last <= 1095:
                status_today = "Lapsed"
            else:
                status_today = "Lost"

            if days_since_last <= 365:
                recent_today = "Active Continuing"
            elif days_since_last <= 1095:
                recent_today = "Recently Reactivated"
            else:
                recent_today = "Recently New"

            synth_today = {col: pd.NA for col in df_real.columns}
            synth_today["entrp_ptnt_id"] = pid
            synth_today["sale_dt"] = as_of_date
            synth_today["status"] = status_today
            synth_today["recent_status"] = recent_today
            synth_today["store_nbr"] = last_store
            out_rows.append(synth_today)

    df_status = pd.DataFrame(out_rows)

    # 2) Combine dummy rows + status-calculated rows
    combined = pd.concat([df_dummy, df_status], ignore_index=True, sort=False)

    # 3) Sort again by patient + sale_dt
    combined = combined.sort_values(["entrp_ptnt_id", "sale_dt"]).reset_index(drop=True)

    return combined


In [10]:
out = add_status_and_recent_status(op_df)
#retrieve data of specific patient
#print(out[out['entrp_ptnt_id'] == 1004])
out.head(15)



  combined = pd.concat([df_dummy, df_status], ignore_index=True, sort=False)


Unnamed: 0,entrp_ptnt_id,store_nbr,sale_dt,status,prev_store_nbr,recent_status,prev_purchase_date,days_since_prev_purchase
0,1001,,2022-12-07 09:00:00,Unknown,,,,
1,1001,110.0,2025-12-07 09:00:00,Active,,Recently New,2022-12-07 09:00:00,1096.0
2,1001,110.0,2025-12-12 00:00:00,Active,,Active Continuing,,
3,1002,,2022-05-26 10:00:00,Unknown,,,,
4,1002,120.0,2025-05-26 10:00:00,Active,,Recently New,2022-05-26 10:00:00,1096.0
5,1002,120.0,2025-09-14 00:00:00,Inactive,,Inactive Continuing,,
6,1002,120.0,2025-11-22 11:00:00,Active,,Active Continuing,2025-05-26 10:00:00,180.0
7,1002,120.0,2025-12-12 00:00:00,Active,,Active Continuing,,
8,1003,,2020-06-26 09:00:00,Unknown,,,,
9,1003,130.0,2023-06-26 09:00:00,Active,,Recently New,2020-06-26 09:00:00,1095.0


In [11]:
def transition_date():
    # compute previous recent_status within each patient
    out['prev_recent_status'] = out.groupby('entrp_ptnt_id')['recent_status'].shift(1)

    # If recent_status is Unknown
    unknwn = out['recent_status'] == 'Unknown'
    out.loc[unknwn, 'transition_dt'] = "Unknown"

    #other rows
    change = (out['recent_status'] != out['prev_recent_status']) & (~unknwn)
    out.loc[change, 'transition_dt'] = out.loc[change, 'sale_dt'].dt.date

    # If no change then NULL
    no_change_mask = (out['recent_status'] == out['prev_recent_status']) & (~unknwn)
    out.loc[no_change_mask, 'transition_dt'] = "NULL"

    # drop helper column
    out.drop(columns=['prev_recent_status'], inplace=True)

    return out


out = transition_date()
out.head(15)
    

Unnamed: 0,entrp_ptnt_id,store_nbr,sale_dt,status,prev_store_nbr,recent_status,prev_purchase_date,days_since_prev_purchase,transition_dt
0,1001,,2022-12-07 09:00:00,Unknown,,,,,2022-12-07
1,1001,110.0,2025-12-07 09:00:00,Active,,Recently New,2022-12-07 09:00:00,1096.0,2025-12-07
2,1001,110.0,2025-12-12 00:00:00,Active,,Active Continuing,,,2025-12-12
3,1002,,2022-05-26 10:00:00,Unknown,,,,,2022-05-26
4,1002,120.0,2025-05-26 10:00:00,Active,,Recently New,2022-05-26 10:00:00,1096.0,2025-05-26
5,1002,120.0,2025-09-14 00:00:00,Inactive,,Inactive Continuing,,,2025-09-14
6,1002,120.0,2025-11-22 11:00:00,Active,,Active Continuing,2025-05-26 10:00:00,180.0,2025-11-22
7,1002,120.0,2025-12-12 00:00:00,Active,,Active Continuing,,,
8,1003,,2020-06-26 09:00:00,Unknown,,,,,2020-06-26
9,1003,130.0,2023-06-26 09:00:00,Active,,Recently New,2020-06-26 09:00:00,1095.0,2023-06-26


In [12]:
def effective_date(out):

    out['prev_recent_status'] = out.groupby('entrp_ptnt_id')['recent_status'].shift(1)
    out['prev_store_nbr'] = out.groupby('entrp_ptnt_id')['store_nbr'].shift(1)

    rs_curr = out['recent_status'].fillna('__NA__')
    rs_prev = out['prev_recent_status'].fillna('__NA__')
    

    sn_curr = out['store_nbr'].astype('string').fillna('__NA__')
    sn_prev = out['prev_store_nbr'].astype('string').fillna('__NA__')

    unknown_mask = out['recent_status'] == 'Unknown'

    change_mask = (
        (rs_curr != rs_prev) |
        (sn_curr != sn_prev) |
        out['prev_recent_status'].isna()    # ensures first row per patient is marked 
    )

    sale_dt_dt = pd.to_datetime(out['sale_dt'])
    eff_dt_dt = sale_dt_dt.where(change_mask & ~unknown_mask)

    # forward-filliing the eff_dt within each patient
    eff_dt_dt_filled = eff_dt_dt.groupby(out['entrp_ptnt_id']).ffill()
    out['eff_dt'] = pd.NaT

    out.loc[unknown_mask, 'eff_dt'] = 'Unknown'

    mask_valid = ~unknown_mask & eff_dt_dt_filled.notna()
    out.loc[mask_valid, 'eff_dt'] = eff_dt_dt_filled[mask_valid].dt.date.values

    # cleaning up
    out.drop(columns=['prev_recent_status', 'prev_store_nbr'], inplace=True)

    return out


out = effective_date(out)
out.head(15)


  out.loc[unknown_mask, 'eff_dt'] = 'Unknown'


Unnamed: 0,entrp_ptnt_id,store_nbr,sale_dt,status,recent_status,prev_purchase_date,days_since_prev_purchase,transition_dt,eff_dt
0,1001,,2022-12-07 09:00:00,Unknown,,,,2022-12-07,2022-12-07
1,1001,110.0,2025-12-07 09:00:00,Active,Recently New,2022-12-07 09:00:00,1096.0,2025-12-07,2025-12-07
2,1001,110.0,2025-12-12 00:00:00,Active,Active Continuing,,,2025-12-12,2025-12-12
3,1002,,2022-05-26 10:00:00,Unknown,,,,2022-05-26,2022-05-26
4,1002,120.0,2025-05-26 10:00:00,Active,Recently New,2022-05-26 10:00:00,1096.0,2025-05-26,2025-05-26
5,1002,120.0,2025-09-14 00:00:00,Inactive,Inactive Continuing,,,2025-09-14,2025-09-14
6,1002,120.0,2025-11-22 11:00:00,Active,Active Continuing,2025-05-26 10:00:00,180.0,2025-11-22,2025-11-22
7,1002,120.0,2025-12-12 00:00:00,Active,Active Continuing,,,,2025-11-22
8,1003,,2020-06-26 09:00:00,Unknown,,,,2020-06-26,2020-06-26
9,1003,130.0,2023-06-26 09:00:00,Active,Recently New,2020-06-26 09:00:00,1095.0,2023-06-26,2023-06-26


In [13]:
def previous_store_nbr(out):
    # Ensure eff_dt and sale_dt are proper datetime formats
    out['sale_dt'] = pd.to_datetime(out['sale_dt'], errors='coerce')

    out['eff_dt_dt'] = pd.to_datetime(out['eff_dt'], errors='coerce')



    def get_prev_store(out):
        group = out.copy()

        # identify rows where eff_dt exists (as datetime)
        for i in group.index:
            current_eff = group.loc[i, 'eff_dt_dt']

            if pd.isna(current_eff):  
                group.loc[i, 'prev_store_nbr'] = "NULL"
                continue

            # previous purchases strictly before eff_dt
            prev_rows = group[group['sale_dt'] < current_eff]

            if prev_rows.empty:
                group.loc[i, 'prev_store_nbr'] = "NULL"
                continue

            last_store = prev_rows.iloc[-1]['store_nbr']

            if pd.isna(last_store) or last_store == "Unknown":
                group.loc[i, 'prev_store_nbr'] = "NULL"
            else:
                group.loc[i, 'prev_store_nbr'] = last_store

        return group

    out = out.groupby('entrp_ptnt_id', group_keys=False).apply(get_prev_store)

    out.drop(columns=['eff_dt_dt'], inplace=True)

    return out

out = previous_store_nbr(out)
out.head(30)


  out = out.groupby('entrp_ptnt_id', group_keys=False).apply(get_prev_store)


Unnamed: 0,entrp_ptnt_id,store_nbr,sale_dt,status,recent_status,prev_purchase_date,days_since_prev_purchase,transition_dt,eff_dt,prev_store_nbr
0,1001,,2022-12-07 09:00:00,Unknown,,,,2022-12-07,2022-12-07,
1,1001,110.0,2025-12-07 09:00:00,Active,Recently New,2022-12-07 09:00:00,1096.0,2025-12-07,2025-12-07,
2,1001,110.0,2025-12-12 00:00:00,Active,Active Continuing,,,2025-12-12,2025-12-12,110.0
3,1002,,2022-05-26 10:00:00,Unknown,,,,2022-05-26,2022-05-26,
4,1002,120.0,2025-05-26 10:00:00,Active,Recently New,2022-05-26 10:00:00,1096.0,2025-05-26,2025-05-26,
5,1002,120.0,2025-09-14 00:00:00,Inactive,Inactive Continuing,,,2025-09-14,2025-09-14,120.0
6,1002,120.0,2025-11-22 11:00:00,Active,Active Continuing,2025-05-26 10:00:00,180.0,2025-11-22,2025-11-22,120.0
7,1002,120.0,2025-12-12 00:00:00,Active,Active Continuing,,,,2025-11-22,120.0
8,1003,,2020-06-26 09:00:00,Unknown,,,,2020-06-26,2020-06-26,
9,1003,130.0,2023-06-26 09:00:00,Active,Recently New,2020-06-26 09:00:00,1095.0,2023-06-26,2023-06-26,


In [14]:
# Keep only the last purchase per day per patient
out['sale_date'] = out['sale_dt'].dt.date
idx_last_per_day = out.groupby(['entrp_ptnt_id', 'sale_date'], sort=False)['sale_dt'].idxmax()
df_last_of_day = out.loc[idx_last_per_day].sort_values(['entrp_ptnt_id', 'sale_dt']).reset_index(drop=True)

# compare current and previous row store number, recent_status, and status
df_last_of_day['prev_store_nbr_shift'] = df_last_of_day.groupby('entrp_ptnt_id')['store_nbr'].shift(1)
df_last_of_day['prev_recent_status_shift'] = df_last_of_day.groupby('entrp_ptnt_id')['recent_status'].shift(1)
df_last_of_day['prev_status_shift'] = df_last_of_day.groupby('entrp_ptnt_id')['status'].shift(1)

# keep row if first for patient OR recent_status changed OR status changed OR store_nbr changed
first_row_mask = df_last_of_day.groupby('entrp_ptnt_id').cumcount() == 0

recent_status_changed_mask = (df_last_of_day['recent_status'] != df_last_of_day['prev_recent_status_shift'])
status_changed_mask = (df_last_of_day['status'] != df_last_of_day['prev_status_shift'])

# compare store numbers as objects to handle NaNs/Int64 consistently
store_changed_mask = (df_last_of_day['store_nbr'].astype(object) != df_last_of_day['prev_store_nbr_shift'].astype(object))

keep_mask = first_row_mask | recent_status_changed_mask | status_changed_mask | store_changed_mask

df_kept = df_last_of_day[keep_mask].copy()

# normalize eff_dt to date if present
if 'eff_dt' in df_kept.columns:
    df_kept['eff_dt'] = pd.to_datetime(df_kept['eff_dt'], errors='coerce').dt.date

# clean prev_store_nbr if present
if 'prev_store_nbr' in df_kept.columns:
    df_kept['prev_store_nbr'] = df_kept['prev_store_nbr'].replace("Unknown", pd.NA)
    df_kept['prev_store_nbr'] = pd.to_numeric(df_kept['prev_store_nbr'], errors='coerce').astype('Int64')

# detect and remove dummy initial rows (first row + status == Unknown + recent_status is missing)
df_kept_copy = df_kept.copy()
is_first_row = df_kept_copy.groupby('entrp_ptnt_id').cumcount() == 0
is_dummy = (
    is_first_row &
    (df_kept_copy['status'] == 'Unknown') &
    (df_kept_copy['recent_status'].isna())
)

df_kept_filtered = df_kept_copy[~is_dummy].reset_index(drop=True)

print(f"Rows before removing dummy rows: {len(df_kept_copy)}")
print(f"Dummy rows removed: {is_dummy.sum()}")
print(f"Rows after removing dummy rows: {len(df_kept_filtered)}")

output = df_kept_filtered[['entrp_ptnt_id', 'eff_dt', 'status', 'recent_status', 'transition_dt', 'prev_store_nbr']].copy()
output.head(50)


Rows before removing dummy rows: 49
Dummy rows removed: 12
Rows after removing dummy rows: 37


Unnamed: 0,entrp_ptnt_id,eff_dt,status,recent_status,transition_dt,prev_store_nbr
0,1001,2025-12-07,Active,Recently New,2025-12-07,
1,1001,2025-12-12,Active,Active Continuing,2025-12-12,110.0
2,1002,2025-05-26,Active,Recently New,2025-05-26,
3,1002,2025-09-14,Inactive,Inactive Continuing,2025-09-14,120.0
4,1002,2025-11-22,Active,Active Continuing,2025-11-22,120.0
5,1003,2023-06-26,Active,Recently New,2023-06-26,
6,1003,2023-10-15,Inactive,Inactive Continuing,2023-10-15,130.0
7,1003,2023-10-15,Lapsed,Inactive Continuing,,130.0
8,1003,2025-11-12,Active,Recently Reactivated,2025-11-12,130.0
9,1003,2025-12-12,Active,Active Continuing,2025-12-12,130.0


In [15]:
# Save output dataframe to Excel
output_path = '../data/processed/dim_patient_status3.xlsx'
output.to_excel(output_path, index=False)
print(f"Output saved to: {output_path}")
print(f"Total rows saved: {len(output)}")
print(f"Columns saved: {list(output.columns)}")

Output saved to: ../data/processed/dim_patient_status3.xlsx
Total rows saved: 37
Columns saved: ['entrp_ptnt_id', 'eff_dt', 'status', 'recent_status', 'transition_dt', 'prev_store_nbr']


In [16]:
# Save output dataframe to Excel
output_path = '../data/processed/processed_table3.xlsx'
out.to_excel(output_path, index=False)
print(f"Output saved to: {output_path}")
print(f"Total rows saved: {len(out)}")
print(f"Columns saved: {list(out.columns)}")

Output saved to: ../data/processed/processed_table3.xlsx
Total rows saved: 55
Columns saved: ['entrp_ptnt_id', 'store_nbr', 'sale_dt', 'status', 'recent_status', 'prev_purchase_date', 'days_since_prev_purchase', 'transition_dt', 'eff_dt', 'prev_store_nbr', 'sale_date']
