# 07: test new data preprocessing (v2.0.0)

In [1]:
# set working directory so we can load data correctly (all functions at this point are not put into modules yet)
import os
os.chdir('..')
os.getcwd()

'/Users/964505/CT/ct_research'

Test History Encoder

In [2]:
from ct.data.consolidating import HistoryEncoder

In [3]:
import pandas as pd

raw_data = pd.DataFrame({
    "patient_id": [1, 1, 1, 2],
    "domain_ids": [
        [1, 2],     # session 1 (patient 1)
        [1],                # session 2 (patient 1)
        [2],             # session 3 (patient 1, next week)
        [1]                 # session 1 (patient 2)
    ],
    "domain_scores": [
        [0.8, 0.6],
        [0.9],
        [0.7],
        [0.4]
    ],
    # minutes since epoch (roughly one week gap between 2nd and 3rd)
    "start_time_min": [
        "2018-04-13 17:37:55",
        "2018-04-13 18:37:55",     # same week as first
        "2018-04-21 17:37:55",     # next week
        "2018-04-13 17:37:55"
    ]
})

In [4]:
encoder = HistoryEncoder(raw_data)
weekly_df = encoder.transform()

In [5]:
print("Shape:", weekly_df.shape)
print("Index:", weekly_df.index.names)
print("Columns:", list(weekly_df.columns))

Shape: (3, 7)
Index: ['patient_id', 'week_number']
Columns: ['domain_1_freq', 'domain_2_freq', 'week_start_ts', 'domain_1_avg', 'domain_2_avg', 'domain_1_inv', 'domain_2_inv']


In [6]:
print(weekly_df.reset_index())

   patient_id  week_number  domain_1_freq  domain_2_freq  \
0           1            0              2              1   
1           1            1              0              1   
2           2            0              1              0   

              week_start_ts  domain_1_avg  domain_2_avg  domain_1_inv  \
0 2018-04-13 17:37:55+00:00          0.85           0.6          0.15   
1 2018-04-20 17:37:55+00:00          0.85           0.7          0.15   
2 2018-04-13 17:37:55+00:00          0.40           0.0          0.60   

   domain_2_inv  
0           0.4  
1           0.3  
2           0.0  


In [7]:
weekly_df.loc[(1, 0)]

domain_1_freq                            2
domain_2_freq                            1
week_start_ts    2018-04-13 17:37:55+00:00
domain_1_avg                          0.85
domain_2_avg                           0.6
domain_1_inv                          0.15
domain_2_inv                           0.4
Name: (1, 0), dtype: object

In [8]:
raw = pd.DataFrame({
    "patient_id": [101, 101, 202],
    "domain_ids": [
        [1, 2],      # patient 101, session in week 0
        [1],         # patient 101, session after a 3-week offset (=> week 3)
        [2],         # patient 202, session in week 0
    ],
    "domain_scores": [
        [0.8, 0.6],
        [0.9],
        [0.4],
    ],
    "start_time_min": [
        "2018-04-13 17:37:55",  # baseline => week 0 for patient 101
        "2018-05-04 17:37:55",  # +21 days => floor(21/7)=3 => week 3
        "2018-04-20 10:00:00",  # baseline => week 0 for patient 202
    ]
})

weekly = HistoryEncoder(raw).transform()

print("Weekly shape:", weekly.shape)
print("Index names:", weekly.index.names)
print("Index values:", list(weekly.index))
print("\nWeekly snippet:\n", weekly.reset_index().sort_values(["patient_id","week_number"]).head(10))

Weekly shape: (5, 7)
Index names: ['patient_id', 'week_number']
Index values: [(101, 0), (101, 1), (101, 2), (101, 3), (202, 0)]

Weekly snippet:
    patient_id  week_number  domain_1_freq  domain_2_freq  \
0         101            0              1              1   
1         101            1              0              0   
2         101            2              0              0   
3         101            3              1              0   
4         202            0              0              1   

              week_start_ts  domain_1_avg  domain_2_avg  domain_1_inv  \
0 2018-04-13 17:37:55+00:00           0.8           0.6           0.2   
1 2018-04-20 17:37:55+00:00           0.8           0.6           0.2   
2 2018-04-27 17:37:55+00:00           0.8           0.6           0.2   
3 2018-05-04 17:37:55+00:00           0.9           0.6           0.1   
4 2018-04-20 10:00:00+00:00           0.0           0.4           0.0   

   domain_2_inv  
0           0.4  
1           0.4  

Filtering

In [9]:
from ct.data.filtering import filter_users_by_usage

In [10]:
min_sessions_per_week = 0
min_weeks = 0
require_consecutive = True

In [11]:
raw_df = pd.DataFrame({
    "patient_id": [
        # --- User A (meets criteria) ---
        1, 1,        # week 0
        1, 1,        # week 1
        1, 1,        # week 2
        

        # --- User B (fails criteria) ---
        2,           # week 0
        2,           # week 2 (gap at week 1)
    ],
    "domain_ids": [
        [1], [2],    # user 1, week 0 → 2 sessions
        [1], [2],    # user 1, week 1 → 2 sessions
        [1], [2],    # user 1, week 2 → 2 sessions

        [1],         # user 2, week 0 → 1 session
        [1],         # user 2, week 2 → 1 session
    ],
    "domain_scores": [
        [0.8], [0.7],
        [0.9], [0.6],
        [0.85], [0.75],

        [0.5],
        [0.6],
    ],
    "start_time_min": [
        # User 1 — three consecutive weeks
        "2018-01-01 10:00:00",
        "2018-01-01 11:00:00",
        "2018-01-08 10:00:00",
        "2018-01-08 11:00:00",
        "2018-01-15 10:00:00",
        "2018-01-15 11:00:00",

        # User 2 — week 0, then week 2 (gap)
        "2018-01-01 09:00:00",
        "2018-01-15 09:00:00",
    ],
})

In [12]:
weekly_df = HistoryEncoder(raw_df).transform()
print(weekly_df.reset_index())

   patient_id  week_number  domain_1_freq  domain_2_freq  \
0           1            0              1              1   
1           1            1              1              1   
2           1            2              1              1   
3           2            0              1              0   
4           2            1              0              0   
5           2            2              1              0   

              week_start_ts  domain_1_avg  domain_2_avg  domain_1_inv  \
0 2018-01-01 10:00:00+00:00          0.80          0.70          0.20   
1 2018-01-08 10:00:00+00:00          0.90          0.60          0.10   
2 2018-01-15 10:00:00+00:00          0.85          0.75          0.15   
3 2018-01-01 09:00:00+00:00          0.50          0.00          0.50   
4 2018-01-08 09:00:00+00:00          0.50          0.00          0.50   
5 2018-01-15 09:00:00+00:00          0.60          0.00          0.40   

   domain_2_inv  
0          0.30  
1          0.40  
2          0.

In [13]:
filtered = filter_users_by_usage(
    weekly_df,
    min_sessions_per_week=min_sessions_per_week,
    min_weeks=min_weeks,
    require_consecutive=require_consecutive,
)

print(filtered.reset_index())

   patient_id  week_number  domain_1_freq  domain_2_freq  \
0           1            0              1              1   
1           1            1              1              1   
2           1            2              1              1   
3           2            0              1              0   
4           2            1              0              0   
5           2            2              1              0   

              week_start_ts  domain_1_avg  domain_2_avg  domain_1_inv  \
0 2018-01-01 10:00:00+00:00          0.80          0.70          0.20   
1 2018-01-08 10:00:00+00:00          0.90          0.60          0.10   
2 2018-01-15 10:00:00+00:00          0.85          0.75          0.15   
3 2018-01-01 09:00:00+00:00          0.50          0.00          0.50   
4 2018-01-08 09:00:00+00:00          0.50          0.00          0.50   
5 2018-01-15 09:00:00+00:00          0.60          0.00          0.40   

   domain_2_inv  
0          0.30  
1          0.40  
2          0.

test gru encoder

In [14]:
from ct.predictor.GRU_MLP import TemporalEncoderGRU
from ct.data.encoding import encode_weekly_df_for_mlp

In [15]:
d_in = len([c for c in filtered.columns if pd.api.types.is_numeric_dtype(filtered[c])])
encoder = TemporalEncoderGRU(d_in=d_in, d_hidden=128)

out = encode_weekly_df_for_mlp(encoder, filtered, device="cpu")
z = out["embeddings"]           # (B, 128)
patient_ids = out["patient_ids"]

print(z.shape, len(patient_ids))

torch.Size([2, 128]) 2


# Unit Tests
### history encoder

In [16]:
# Happy case
raw_data = pd.DataFrame({
    "patient_id": [1, 1, 1, 2, 2],
    "domain_ids": [
        [1, 2],     # session 1 (patient 1)
        [1],                # session 2 (patient 1)
        [2],             # session 3 (patient 1, next week)
        [1],                # session 1 (patient 2)
        [1]
    ],
    "domain_scores": [
        [0.8, 0.6],
        [0.9],
        [0.7],
        [0.4],
        [0.5]
    ],
    # minutes since epoch (roughly one week gap between 2nd and 3rd)
    "start_time_min": [
        "2018-04-13 17:37:55",
        "2018-04-13 18:37:55",     # same week as first
        "2018-04-21 17:37:55",     # next week
        "2018-04-13 17:37:55",
        "2018-04-27 17:37:55" # week after next week for patient 2
    ]
})

encoder = HistoryEncoder(raw_data)
weekly_df = encoder.transform()

In [17]:
assert weekly_df.shape == (5, 7), "weekly_df has incorrect shape"
assert weekly_df.loc[(1, 0), "domain_1_avg"].round(2) == 0.85, "domain_1_avg is incorrect"
assert weekly_df.loc[(1, 0), "domain_1_inv"].round(2) == 0.15, "domain_1_inv is incorrect"
assert weekly_df.loc[(1, 0), "domain_1_freq"] == 2, "domain_1_freq is incorrect"

In [18]:
weekly_df

Unnamed: 0_level_0,Unnamed: 1_level_0,domain_1_freq,domain_2_freq,week_start_ts,domain_1_avg,domain_2_avg,domain_1_inv,domain_2_inv
patient_id,week_number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,2,1,2018-04-13 17:37:55+00:00,0.85,0.6,0.15,0.4
1,1,0,1,2018-04-20 17:37:55+00:00,0.85,0.7,0.15,0.3
2,0,1,0,2018-04-13 17:37:55+00:00,0.4,0.0,0.6,0.0
2,1,0,0,2018-04-20 17:37:55+00:00,0.4,0.0,0.6,0.0
2,2,1,0,2018-04-27 17:37:55+00:00,0.5,0.0,0.5,0.0


In [None]:
assert weekly_df.loc[(2, 2), "domain_1_avg"].round(2) == 0.5, "domain_1_avg for patient 2 week 3 is incorrect"
assert weekly_df.loc[(2, 1), "domain_2_freq"] == weekly_df.loc[(2, 1), "domain_2_inv"]\
        and weekly_df.loc[(2, 1), "domain_2_freq"] == weekly_df.loc[(2, 1), "domain_2_avg"]\
        and weekly_df.loc[(2, 1), "domain_2_freq"] == 0.0, "domain_2 things for patient 2 week 2 is incorrect"

KeyError: 'domain_2-freq'