In [4]:
import pandas as pd
from pandas.tseries.offsets import DateOffset
import numpy as np

In [5]:
df = pd.read_excel('../data/raw/sale_table.xlsx')

PermissionError: [Errno 13] Permission denied: '../data/raw/sale_table.xlsx'

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   entrp_ptnt_id  37 non-null     int64 
 1   store_nbr      37 non-null     int64 
 2   sale_dt        37 non-null     object
dtypes: int64(2), object(1)
memory usage: 1020.0+ bytes


In [None]:
# Sort data by patient ID (entrp_ptnt_id)
df_sorted = df.sort_values(by=['entrp_ptnt_id', 'sale_dt'], ascending=[True, True])
print(f"Data sorted by 'entrp_ptnt_id'")
print(f"Original data shape: {df.shape}")
print(f"Sorted data shape: {df_sorted.shape}")

print(f"\nFirst 10 rows of sorted data:")
print(df_sorted.head(10))

print(f"\nLast 10 rows of sorted data:")
print(df_sorted.tail(10))

# Update the original dataframe with sorted data
op_df = df_sorted
#print(f"\nOriginal dataframe 'df' has been updated with sorted data")

Data sorted by 'entrp_ptnt_id'
Original data shape: (37, 3)
Sorted data shape: (37, 3)

First 10 rows of sorted data:
    entrp_ptnt_id  store_nbr     sale_dt
16           1001        101  2020-01-01
20           1001        101  2020-04-20
23           1001        102  2020-04-21
34           1001        200  2020-04-21
26           1001        101  2020-12-31
2            1001        103  2021-01-01
18           1001        101  2022-12-31
6            1001        104  2023-01-01
10           1002        201  2018-06-01
14           1002        202  2019-07-06

Last 10 rows of sorted data:
    entrp_ptnt_id  store_nbr     sale_dt
21           1005        502  2023-08-29
36           1005        502  2023-09-28
3            1006        601  2023-12-01
29           1007        701  2019-05-05
17           1007        701  2019-08-23
13           1007        702  2020-05-04
27           1007        703  2022-05-04
4            1008        802  2021-07-01
25           1008        801  20

In [None]:
# get first sale per patient (use min to ensure earliest first sale)
first_sales = op_df.groupby('entrp_ptnt_id', as_index=False)['sale_dt'].min()
first_sales['sale_dt'] = pd.to_datetime(first_sales['sale_dt'])  # normalize to datetime

# build new_rows: sale_dt = 3 years prior to the first sale
new_rows = first_sales.rename(columns={'sale_dt': 'orig_first_sale'})
new_rows['sale_dt'] = new_rows['orig_first_sale'] - pd.DateOffset(years=3)
new_rows = new_rows.drop(columns=['orig_first_sale'])

# Reindex to have the same columns as op_df (this will create missing cols with NaN)
new_rows = new_rows.reindex(columns=op_df.columns)

# Ensure sale_dt dtype matches op_df['sale_dt'] dtype
if pd.api.types.is_datetime64_any_dtype(op_df['sale_dt']):
    new_rows['sale_dt'] = pd.to_datetime(new_rows['sale_dt'])
else:
    # if op_df.sale_dt is plain date objects, keep date
    new_rows['sale_dt'] = pd.to_datetime(new_rows['sale_dt']).dt.date

# set required columns to defaults:
# - status = 'Unknown'
# - prev_store_nbr and recent_status should be NULL (pd.NA) per spec (prev_store_nbr = NULL before first purchase)
new_rows['status'] = 'Unknown'
new_rows['prev_store_nbr'] = pd.NA
new_rows['recent_status'] = pd.NA

# Concatenate and re-sort so the new rows come before the first actual sale
op_df = pd.concat([op_df, new_rows], ignore_index=True, sort=False)
op_df = op_df.sort_values(['entrp_ptnt_id', 'sale_dt']).reset_index(drop=True)

op_df['store_nbr'] = pd.to_numeric(op_df['store_nbr'], errors='coerce').astype('Int64')
op_df.head()



Unnamed: 0,entrp_ptnt_id,store_nbr,sale_dt,status,prev_store_nbr,recent_status
0,1001,,2017-01-01,Unknown,,
1,1001,101.0,2020-01-01,,,
2,1001,101.0,2020-04-20,,,
3,1001,102.0,2020-04-21,,,
4,1001,200.0,2020-04-21,,,


In [None]:
#retrieve data of specific patient
#print(op_df[op_df['entrp_ptnt_id'] == 1001])

op_df['sale_dt'] = pd.to_datetime(op_df['sale_dt'])


#new column for first purchase
#op_df['first_purchase'] = op_df.groupby('entrp_ptnt_id')['sale_dt'].transform('min')

#new column with days since previous purchase
op_df['prev_purchase_date'] = op_df.groupby('entrp_ptnt_id')['sale_dt'].shift(1)
op_df['days_since_prev_purchase'] = (op_df['sale_dt'] - op_df['prev_purchase_date']).dt.days.astype('Int64')

print()
print(op_df.head())


   entrp_ptnt_id  store_nbr    sale_dt   status prev_store_nbr recent_status  \
0           1001       <NA> 2017-01-01  Unknown            NaN           NaN   
1           1001        101 2020-01-01      NaN            NaN           NaN   
2           1001        101 2020-04-20      NaN            NaN           NaN   
3           1001        102 2020-04-21      NaN            NaN           NaN   
4           1001        200 2020-04-21      NaN            NaN           NaN   

  prev_purchase_date  days_since_prev_purchase  
0                NaT                      <NA>  
1         2017-01-01                      1095  
2         2020-01-01                       110  
3         2020-04-20                         1  
4         2020-04-21                         0  


In [None]:
def assign_status(days):
    if pd.isna(days):
        return "Unknown"               # dummy row
    
    # first purchase
    if days >= 1095:
        return "Active"

    # other purchases
    if days <= 110:
        return "Active"
    elif days <= 365:
        return "Inactive"
    elif days < 1095:
        return "Lapsed"
    else:
        return "Lost"


op_df['status'] = op_df['days_since_prev_purchase'].apply(assign_status)

op_df.head(9)


Unnamed: 0,entrp_ptnt_id,store_nbr,sale_dt,status,prev_store_nbr,recent_status,prev_purchase_date,days_since_prev_purchase
0,1001,,2017-01-01,Unknown,,,NaT,
1,1001,101.0,2020-01-01,Active,,,2017-01-01,1095.0
2,1001,101.0,2020-04-20,Active,,,2020-01-01,110.0
3,1001,102.0,2020-04-21,Active,,,2020-04-20,1.0
4,1001,200.0,2020-04-21,Active,,,2020-04-21,0.0
5,1001,101.0,2020-12-31,Inactive,,,2020-04-21,254.0
6,1001,103.0,2021-01-01,Active,,,2020-12-31,1.0
7,1001,101.0,2022-12-31,Lapsed,,,2021-01-01,729.0
8,1001,104.0,2023-01-01,Active,,,2022-12-31,1.0


In [None]:
#recent status column creation and logic implementation
def assign_recent_status(days):

    if pd.isna(days):
        return "Unknown"
    # ensure numeric (could be float)
    try:
        d = int(days)
    except Exception:
        return "Unknown"
    # now apply rules
    if d >= 1095:                 #greater than 1095 days
        return "Recently New"
    if 366 <= d < 1095:
        return "Recently Reactivated"
    if d <= 110:                 # covers 0..110
        return "Active Continuing"
    if 111 <= d <= 365:
        return "Inactive Continuing"
    # fallback (shouldn't be reached)
    return "Unknown"

op_df['recent_status'] = op_df['days_since_prev_purchase'].apply(assign_recent_status)

#op_df.head(47)

In [None]:
def transition_date():
    # compute previous recent_status within each patient
    op_df['prev_recent_status'] = op_df.groupby('entrp_ptnt_id')['recent_status'].shift(1)

    # If recent_status is Unknown
    unknwn = op_df['recent_status'] == 'Unknown'
    op_df.loc[unknwn, 'transition_dt'] = "Unknown"

    #other rows
    change = (op_df['recent_status'] != op_df['prev_recent_status']) & (~unknwn)
    op_df.loc[change, 'transition_dt'] = op_df.loc[change, 'sale_dt'].dt.date

    # If no change then NULL
    no_change_mask = (op_df['recent_status'] == op_df['prev_recent_status']) & (~unknwn)
    op_df.loc[no_change_mask, 'transition_dt'] = "NULL"

    # drop helper column
    op_df.drop(columns=['prev_recent_status'], inplace=True)

    return op_df


op_df = transition_date()
op_df.head(9)
    

Unnamed: 0,entrp_ptnt_id,store_nbr,sale_dt,status,prev_store_nbr,recent_status,prev_purchase_date,days_since_prev_purchase,transition_dt
0,1001,,2017-01-01,Unknown,,Unknown,NaT,,Unknown
1,1001,101.0,2020-01-01,Active,,Recently New,2017-01-01,1095.0,2020-01-01
2,1001,101.0,2020-04-20,Active,,Active Continuing,2020-01-01,110.0,2020-04-20
3,1001,102.0,2020-04-21,Active,,Active Continuing,2020-04-20,1.0,
4,1001,200.0,2020-04-21,Active,,Active Continuing,2020-04-21,0.0,
5,1001,101.0,2020-12-31,Inactive,,Inactive Continuing,2020-04-21,254.0,2020-12-31
6,1001,103.0,2021-01-01,Active,,Active Continuing,2020-12-31,1.0,2021-01-01
7,1001,101.0,2022-12-31,Lapsed,,Recently Reactivated,2021-01-01,729.0,2022-12-31
8,1001,104.0,2023-01-01,Active,,Active Continuing,2022-12-31,1.0,2023-01-01


In [None]:
def effective_date(op_df):

    op_df['prev_recent_status'] = op_df.groupby('entrp_ptnt_id')['recent_status'].shift(1)
    op_df['prev_store_nbr'] = op_df.groupby('entrp_ptnt_id')['store_nbr'].shift(1)

    rs_curr = op_df['recent_status'].fillna('__NA__')
    rs_prev = op_df['prev_recent_status'].fillna('__NA__')
    

    sn_curr = op_df['store_nbr'].astype('string').fillna('__NA__')
    sn_prev = op_df['prev_store_nbr'].astype('string').fillna('__NA__')

    unknown_mask = op_df['recent_status'] == 'Unknown'

    change_mask = (
        (rs_curr != rs_prev) |
        (sn_curr != sn_prev) |
        op_df['prev_recent_status'].isna()    # ensures first row per patient is marked 
    )

    sale_dt_dt = pd.to_datetime(op_df['sale_dt'])
    eff_dt_dt = sale_dt_dt.where(change_mask & ~unknown_mask)

    # forward-filliing the eff_dt within each patient
    eff_dt_dt_filled = eff_dt_dt.groupby(op_df['entrp_ptnt_id']).ffill()
    op_df['eff_dt'] = pd.NaT

    op_df.loc[unknown_mask, 'eff_dt'] = 'Unknown'

    mask_valid = ~unknown_mask & eff_dt_dt_filled.notna()
    op_df.loc[mask_valid, 'eff_dt'] = eff_dt_dt_filled[mask_valid].dt.date.values

    # cleaning up
    op_df.drop(columns=['prev_recent_status', 'prev_store_nbr'], inplace=True)

    return op_df


op_df = effective_date(op_df)
#op_df.head(9)


   entrp_ptnt_id  store_nbr    sale_dt   status      recent_status  \
0           1001       <NA> 2017-01-01  Unknown            Unknown   
1           1001        101 2020-01-01   Active       Recently New   
2           1001        101 2020-04-20   Active  Active Continuing   
3           1001        102 2020-04-21   Active  Active Continuing   
4           1001        200 2020-04-21   Active  Active Continuing   

  prev_purchase_date  days_since_prev_purchase transition_dt      eff_dt  \
0                NaT                      <NA>       Unknown     Unknown   
1         2017-01-01                      1095    2020-01-01  2020-01-01   
2         2020-01-01                       110    2020-04-20  2020-04-20   
3         2020-04-20                         1          NULL  2020-04-21   
4         2020-04-21                         0          NULL  2020-04-21   

    sale_date prev_recent_status  prev_store_nbr  
0  2017-01-01                NaN            <NA>  
1  2020-01-01       

  op_df.loc[unknown_mask, 'eff_dt'] = 'Unknown'


In [None]:
def previous_store_nbr(op_df):
    # Ensure eff_dt and sale_dt are proper datetime formats
    op_df['sale_dt'] = pd.to_datetime(op_df['sale_dt'], errors='coerce')

    op_df['eff_dt_dt'] = pd.to_datetime(op_df['eff_dt'], errors='coerce')



    def get_prev_store(op_df):
        group = op_df.copy()

        # identify rows where eff_dt exists (as datetime)
        for i in group.index:
            current_eff = group.loc[i, 'eff_dt_dt']

            if pd.isna(current_eff):  
                group.loc[i, 'prev_store_nbr'] = "NULL"
                continue

            # previous purchases strictly before eff_dt
            prev_rows = group[group['sale_dt'] < current_eff]

            if prev_rows.empty:
                group.loc[i, 'prev_store_nbr'] = "NULL"
                continue

            last_store = prev_rows.iloc[-1]['store_nbr']

            if pd.isna(last_store) or last_store == "Unknown":
                group.loc[i, 'prev_store_nbr'] = "NULL"
            else:
                group.loc[i, 'prev_store_nbr'] = last_store

        return group

    op_df = op_df.groupby('entrp_ptnt_id', group_keys=False).apply(get_prev_store)

    op_df.drop(columns=['eff_dt_dt'], inplace=True)

    return op_df

op_df = previous_store_nbr(op_df)
#op_df.head(9)


  op_df['eff_dt_dt'] = pd.to_datetime(op_df['eff_dt'], errors='coerce')
  op_df = op_df.groupby('entrp_ptnt_id', group_keys=False).apply(get_prev_store)


Unnamed: 0,entrp_ptnt_id,store_nbr,sale_dt,status,recent_status,prev_purchase_date,days_since_prev_purchase,transition_dt,eff_dt,prev_store_nbr
0,1001,,2017-01-01,Unknown,Unknown,NaT,,Unknown,Unknown,
1,1001,101.0,2020-01-01,Active,Recently New,2017-01-01,1095.0,2020-01-01,2020-01-01,
2,1001,101.0,2020-04-20,Active,Active Continuing,2020-01-01,110.0,2020-04-20,2020-04-20,101.0
3,1001,102.0,2020-04-21,Active,Active Continuing,2020-04-20,1.0,,2020-04-21,101.0
4,1001,200.0,2020-04-21,Active,Active Continuing,2020-04-21,0.0,,2020-04-21,101.0
5,1001,101.0,2020-12-31,Inactive,Inactive Continuing,2020-04-21,254.0,2020-12-31,2020-12-31,200.0
6,1001,103.0,2021-01-01,Active,Active Continuing,2020-12-31,1.0,2021-01-01,2021-01-01,101.0
7,1001,101.0,2022-12-31,Lapsed,Recently Reactivated,2021-01-01,729.0,2022-12-31,2022-12-31,103.0
8,1001,104.0,2023-01-01,Active,Active Continuing,2022-12-31,1.0,2023-01-01,2023-01-01,101.0


In [None]:
op_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45 entries, 0 to 44
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   entrp_ptnt_id             45 non-null     int64         
 1   store_nbr                 37 non-null     Int64         
 2   sale_dt                   45 non-null     datetime64[ns]
 3   status                    45 non-null     object        
 4   recent_status             45 non-null     object        
 5   prev_purchase_date        37 non-null     datetime64[ns]
 6   days_since_prev_purchase  37 non-null     Int64         
 7   transition_dt             45 non-null     object        
 8   eff_dt                    45 non-null     object        
 9   prev_store_nbr            45 non-null     object        
dtypes: Int64(2), datetime64[ns](2), int64(1), object(5)
memory usage: 4.0+ KB


In [None]:
#retrieve data of specific patient
print(op_df[op_df['entrp_ptnt_id'] == 1004])


NameError: name 'op_df' is not defined

In [None]:

#op_df['sale_dt'] = pd.to_datetime(op_df['sale_dt'], errors='coerce')
#op_df = op_df.sort_values(['entrp_ptnt_id', 'sale_dt']).reset_index(drop=True)

# Keep only the last purchase per day per patient
op_df['sale_date'] = op_df['sale_dt'].dt.date
idx_last_per_day = op_df.groupby(['entrp_ptnt_id', 'sale_date'], sort=False)['sale_dt'].idxmax()
df_last_of_day = op_df.loc[idx_last_per_day].sort_values(['entrp_ptnt_id', 'sale_dt']).reset_index(drop=True)

# compare current and previous row store number
df_last_of_day['prev_store_nbr_shift'] = df_last_of_day.groupby('entrp_ptnt_id')['store_nbr'].shift(1)
df_last_of_day['prev_recent_status_shift'] = df_last_of_day.groupby('entrp_ptnt_id')['recent_status'].shift(1)

# keep row if first for patient OR recent_status changed OR store_nbr changed
first_row_mask = df_last_of_day.groupby('entrp_ptnt_id').cumcount() == 0
status_changed_mask = (df_last_of_day['recent_status'] != df_last_of_day['prev_recent_status_shift'])
store_changed_mask = (df_last_of_day['store_nbr'].astype(object) != df_last_of_day['prev_store_nbr_shift'].astype(object))


keep_mask = first_row_mask | status_changed_mask | store_changed_mask

df_kept = df_last_of_day[keep_mask].copy()


if 'eff_dt' in df_kept.columns:
    df_kept['eff_dt'] = pd.to_datetime(df_kept['eff_dt'], errors='coerce').dt.date


if 'prev_store_nbr' in df_kept.columns:
    df_kept['prev_store_nbr'] = df_kept['prev_store_nbr'].replace("Unknown", pd.NA)
    df_kept['prev_store_nbr'] = pd.to_numeric(df_kept['prev_store_nbr'], errors='coerce').astype('Int64')


df_kept_copy = df_kept.copy()

is_first_row = df_kept_copy.groupby('entrp_ptnt_id').cumcount() == 0
is_dummy = is_first_row & (df_kept_copy['status'] == 'Unknown') & (df_kept_copy['recent_status'] == 'Unknown')

# Filtered out dummy rows
df_kept_filtered = df_kept_copy[~is_dummy].reset_index(drop=True)

print(f"Rows before removing dummy rows: {len(df_kept_copy)}")
print(f"Dummy rows removed: {is_dummy.sum()}")
print(f"Rows after removing dummy rows: {len(df_kept_filtered)}")

output = df_kept_filtered[['entrp_ptnt_id', 'eff_dt', 'status', 'recent_status', 'transition_dt', 'prev_store_nbr']].copy()
output.head(50)


Rows before removing dummy rows: 35
Dummy rows removed: 8
Rows after removing dummy rows: 27


  df_kept['eff_dt'] = pd.to_datetime(df_kept['eff_dt'], errors='coerce').dt.date


Unnamed: 0,entrp_ptnt_id,eff_dt,status,recent_status,transition_dt,prev_store_nbr
0,1001,2020-01-01,Active,Recently New,2020-01-01,
1,1001,2020-04-20,Active,Active Continuing,2020-04-20,101.0
2,1001,2020-04-21,Active,Active Continuing,,101.0
3,1001,2020-12-31,Inactive,Inactive Continuing,2020-12-31,200.0
4,1001,2021-01-01,Active,Active Continuing,2021-01-01,101.0
5,1001,2022-12-31,Lapsed,Recently Reactivated,2022-12-31,103.0
6,1001,2023-01-01,Active,Active Continuing,2023-01-01,101.0
7,1002,2018-06-01,Active,Recently New,2018-06-01,
8,1002,2019-07-06,Lapsed,Recently Reactivated,2019-07-06,201.0
9,1002,2019-08-25,Active,Active Continuing,2019-08-25,202.0


In [None]:
output.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   entrp_ptnt_id   27 non-null     int64 
 1   eff_dt          27 non-null     object
 2   status          27 non-null     object
 3   recent_status   27 non-null     object
 4   transition_dt   27 non-null     object
 5   prev_store_nbr  19 non-null     Int64 
dtypes: Int64(1), int64(1), object(4)
memory usage: 1.4+ KB


In [None]:
# Save output dataframe to Excel
output_path = '../data/processed/dim_patient_status.xlsx'
output.to_excel(output_path, index=False)
print(f"Output saved to: {output_path}")
print(f"Total rows saved: {len(output)}")
print(f"Columns saved: {list(output.columns)}")

Output saved to: ../data/processed/dim_patient_status.xlsx
Total rows saved: 27
Columns saved: ['entrp_ptnt_id', 'eff_dt', 'status', 'recent_status', 'transition_dt', 'prev_store_nbr']


In [None]:
# Save output dataframe to Excel
output_path = '../data/processed/processed_table.xlsx'
op_df.to_excel(output_path, index=False)
print(f"Output saved to: {output_path}")
print(f"Total rows saved: {len(output)}")
print(f"Columns saved: {list(output.columns)}")

Output saved to: ../data/processed/processed_table.xlsx
Total rows saved: 27
Columns saved: ['entrp_ptnt_id', 'eff_dt', 'status', 'recent_status', 'transition_dt', 'prev_store_nbr']
