### Preprocessed Data Sanity Check
## Objective
- Check to see that preprocessed data is as expected as compared to v1 results (20250602)

## Set up

### Set up working directory

In [1]:
import os
os.chdir('..')
os.getcwd()

'/Users/964505/CT/ct_research/v2'

### Import Packages

In [2]:
import src.data.data_io as data_io
import src.data.preprocessing as preprocessing
import src.utils.config_loading as config_loading

## Load data and config

In [3]:
df = data_io.read_preprocessed_session_file('data/raw/predictor_data_20250529.csv')

In [4]:
config = config_loading.load_yaml_config("config/preprocess/20250602.yaml")
# extract information from the configuration file
input_path = config['source']['directory'] + config['source']['filename']
output_path = config['target']['directory'] + config['target']['filename']

usage_frequency_threshold = config['filter_params']['usage_frequency']
usage_days_threshold = config['filter_params']['usage_days']

eps_days = config['filter_params']['eps_days']
min_samples = config['filter_params']['min_samples']

## Run through preprocess pipeline

### drop duplicate ids

In [5]:
raw_data_dropped = preprocessing.drop_duplicates(df, based_on=['id'])

In [6]:
raw_data_dropped.nunique()

id               7008614
patient_id         68780
start_time       6693130
task_type_id         173
task_level            12
domain_ids            27
domain_scores     191137
dtype: int64

In [7]:
raw_data_dropped

Unnamed: 0,id,patient_id,start_time,task_type_id,task_level,domain_ids,domain_scores
0,11018577,2171,2018-04-13 17:37:55,4,1,610,"0.2950,0.8140"
5,11042076,2171,2018-04-15 23:48:00,4,1,610,"0.3050,0.8000"
10,11057448,2171,2018-04-16 21:55:03,4,1,610,"0.3160,0.7930"
15,11069816,2171,2018-04-17 17:54:54,4,1,610,"0.3160,0.8070"
20,11742188,2171,2018-06-05 17:35:50,4,1,610,"0.2740,0.8360"
...,...,...,...,...,...,...,...
80506826,41286101,584095,2023-05-22 17:29:53,255,3,12,0.6790
80506830,41465313,584095,2023-05-25 19:56:38,255,3,12,0.6670
80506834,41530297,584095,2023-05-30 03:44:54,255,3,12,0.6670
80506838,41585933,584095,2023-05-31 00:33:20,255,3,12,0.6550


### filter datetime outlier

In [8]:
raw_data_filtered = raw_data_dropped.groupby("patient_id")[raw_data_dropped.columns].apply(preprocessing.filter_datetime_outliers, eps_days, min_samples).reset_index(drop=True)

In [9]:
raw_data_filtered.nunique()

id               6608495
patient_id         29459
start_time       6327908
task_type_id         173
task_level            12
domain_ids            27
domain_scores     186958
dtype: int64

In [10]:
raw_data_filtered

Unnamed: 0,id,patient_id,start_time,task_type_id,task_level,domain_ids,domain_scores
0,11018577,2171,2018-04-13 17:37:55,4,1,610,"0.2950,0.8140"
1,11042076,2171,2018-04-15 23:48:00,4,1,610,"0.3050,0.8000"
2,11057448,2171,2018-04-16 21:55:03,4,1,610,"0.3160,0.7930"
3,11069816,2171,2018-04-17 17:54:54,4,1,610,"0.3160,0.8070"
4,11742188,2171,2018-06-05 17:35:50,4,1,610,"0.2740,0.8360"
...,...,...,...,...,...,...,...
6608490,41286101,584095,2023-05-22 17:29:53,255,3,12,0.6790
6608491,41465313,584095,2023-05-25 19:56:38,255,3,12,0.6670
6608492,41530297,584095,2023-05-30 03:44:54,255,3,12,0.6670
6608493,41585933,584095,2023-05-31 00:33:20,255,3,12,0.6550


### filter usage frequency and length

In [11]:
usage_df = preprocessing.find_usage_frequency(raw_data_filtered)

In [17]:
usage_df

Unnamed: 0,patient_id,unique_days,usage_time,usage_freq
0,2171,483,1442,0.334951
1,2281,47,1944,0.024177
2,2517,187,699,0.267525
3,4032,30,534,0.056180
4,4073,5,10,0.500000
...,...,...,...,...
29454,560491,14,41,0.341463
29455,565981,50,57,0.877193
29456,568669,134,268,0.500000
29457,575513,14,46,0.304348


In [12]:
usage_df_filtered = usage_df[(usage_df['usage_freq'] > usage_frequency_threshold) & (usage_df['usage_time'] > usage_days_threshold)]

In [13]:
patient_ids = usage_df_filtered['patient_id'].tolist()

In [None]:
filtered_data = raw_data_dropped[raw_data_dropped['patient_id'].isin(patient_ids)]

In [None]:
filtered_data.nunique()

id               4961763
patient_id          2456
start_time       4806462
task_type_id         169
task_level            12
domain_ids            27
domain_scores     156532
dtype: int64

## Check data stats

In [None]:
df.nunique()

id               7008614
patient_id         68780
start_time       6693130
task_type_id         173
task_level            12
domain_ids            27
domain_scores     191137
dtype: int64