## Data Overview

Compare summary statistics across four cohorts: all patients in the VenEx data, the patients who received some ven/aza treatment, the subset of patients used in modeling, and the subset used for the train/test split.

In [1]:
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from new_patient_model import extract_data_from_tables_new

## Load data

In [3]:
# Load the per-patient table and key its rows by study subject id.
# drop=False keeps Study_subject_Id available as an ordinary column too,
# which the count cells further down rely on.
patient_data = (
    pd.read_csv('../patient_data_venex/ven_responses_052023.txt', sep='\t')
    .set_index('Study_subject_Id', drop=False)
)

In [4]:
# Parse the workbook once and pull the three sheets out of the result, instead
# of re-opening and re-reading the same Excel file for every sheet.
# With a list-valued sheet_name, read_excel returns a dict keyed by sheet name.
blood_count_sheets = pd.read_excel(
    '../patient_data_venex/Ven_blood_counts_16042023.xlsx',
    sheet_name=['Blood_counts', 'Bone_marrow_blasts', 'Cycle_days'],
)
blood_counts = blood_count_sheets['Blood_counts']
bm_blasts = blood_count_sheets['Bone_marrow_blasts']
cycle_days = blood_count_sheets['Cycle_days']

In [5]:
# The first CSV column becomes the row index; its unique values are used
# further down as the ids of the patients included in modeling.
rmse_data_combined = pd.read_csv('rmse_data_combined_table_2025_05_01.csv', index_col=0)

In [6]:
# Load the three patient/cycle tables in one pass; each uses its first CSV
# column as the row index.
(patient_cycle_train_table,
 patient_cycle_test_table,
 patient_cycle_additional_table) = [
    pd.read_csv(csv_name, index_col=0)
    for csv_name in (
        'patient_cycle_train_table.csv',
        'patient_cycle_test_table.csv',
        'patient_cycle_additional_table.csv',
    )
]

In [7]:
# List every column available in the patient table.
patient_data.columns

Index(['Study_subject_Id', 'incl_dg_for_tables', 'no_ven_therapy', 'prev_allo',
       'prev_hypomet_all', 'Gender', 'age_group_scr', 'ecog_scr', 'hb_scr',
       'leuk_scr', 'neut_all_scr', 'trom_scr', 'ld_scr', 'b_blasts_scr',
       'blasts_bm_scr', 'fab', 'del_5', 'del_5q', 'del_7', 'del_7q', 'del_17p',
       'complex', 'monosomal', 'HR_karyotype', 'karyotype_normal', 'NPM1',
       'FLT3_tkd', 'FLT3_itd', 'FLT3_rat', 'CEBPA', 'DNMT3A', 'IDH1', 'IDH2',
       'TET2', 'RUNX1', 'SRSF2', 'TP53', 'ASXL1', 'BCOR', 'CDKN2A', 'CREBBP',
       'CUX1', 'EP300', 'ETV6', 'EZH2', 'GATA2', 'KDM6A', 'NF1', 'RAD21',
       'SETD2', 'STAG2', 'XRSR2', 'BRAF', 'CALR', 'CBL', 'CSF3R', 'GATA1',
       'JAK2', 'KIT', 'KRAS', 'NRAS', 'MPL', 'PDGFRA', 'PHF6', 'PTPN11',
       'SETBP1', 'SF3B1', 'SMC1A', 'SMC3', 'U2AF1', 'WT1', 'c1_response',
       'c2_response', 'c3_response', 'best_response', 'mrd_neg', 'mrd_method',
       'relapse_occurred', 'eot_reason', 'eot_response', 'status',
       'OS_time_ve

In [8]:
# Number of patients per value of the no_ven_therapy flag.
# Selecting the id column before counting avoids aggregating every other column.
patient_data.groupby('no_ven_therapy')['Study_subject_Id'].count()

no_ven_therapy
No     92
Yes    12
Name: Study_subject_Id, dtype: int64

## Calculate the number of observations for each patient

In [9]:
# For every patient, record how many rows each extracted table contains.
# NOTE(review): with use_neut=True the second returned table presumably holds
# neutrophil measurements (hence the 'neut' key) - confirm in new_patient_model.
all_patient_observation_counts = {}
for patient_id in patient_data.index:
    cycle_info, leuk_table, blast_table = extract_data_from_tables_new(
        blood_counts, bm_blasts, cycle_days, patient_id, use_neut=True)
    all_patient_observation_counts[patient_id] = {
        'neut': len(leuk_table),
        'blast': len(blast_table),
        'cycle': len(cycle_info),
    }

In [10]:
# Turn the {patient_id: {'neut': n, 'blast': n, 'cycle': n}} mapping into a
# table with one row per patient and one column per count.
# The name is rebound from dict to DataFrame, so only convert while it is still
# a dict: re-running this cell must not transpose the finished table (that
# would put patients in the columns and break every later .loc[...] lookup).
if isinstance(all_patient_observation_counts, dict):
    all_patient_observation_counts = pd.DataFrame.from_dict(
        all_patient_observation_counts, orient='index')

## All patient data

Data for all patients

In [11]:
# Categorical summary (count / unique / top / freq) over every patient in the table.
patient_data[['Gender', 'age_group_scr', 'status']].describe()

Unnamed: 0,Gender,age_group_scr,status
count,104,104,104
unique,2,2,2
top,Male,75 years or younger,Exitus
freq,61,71,72


In [12]:
# Numeric summary of the OS and PFS time columns (in days, per the column names).
patient_data[['OS_time_ven2_days', 'PFS_time_days']].describe()

Unnamed: 0,OS_time_ven2_days,PFS_time_days
count,104.0,104.0
mean,388.048077,259.278846
std,278.663084,273.556776
min,6.0,1.0
25%,133.0,1.0
50%,371.5,180.0
75%,577.5,406.0
max,1120.0,1120.0


In [13]:
# Patients per best-response category. groupby drops rows whose best_response
# is missing (dropna=True by default), so these counts can sum to fewer
# patients than the cohort contains.
patient_data.groupby('best_response')['Study_subject_Id'].count()

best_response
CR             43
CRh             2
CRi            14
MLFS           13
PR/SD/PD/RD    18
Name: Study_subject_Id, dtype: int64

In [14]:
# Patients per diagnosis group (incl_dg_for_tables).
patient_data.groupby('incl_dg_for_tables')['Study_subject_Id'].count()

incl_dg_for_tables
De novo / ND AML    48
R/R AML             39
sAML                17
Name: Study_subject_Id, dtype: int64

In [15]:
# Distribution of the per-patient neut / blast / cycle row counts over all patients.
all_patient_observation_counts.describe()

Unnamed: 0,neut,blast,cycle
count,104.0,104.0,104.0
mean,17.605769,5.451923,8.653846
std,15.616408,3.398686,9.434258
min,1.0,1.0,0.0
25%,6.0,3.0,2.0
50%,12.5,5.0,5.0
75%,23.0,8.0,13.25
max,59.0,14.0,37.0


## All ven/aza data

Data for all patients with some ven/aza treatment.

In [16]:
# Ids of patients whose no_ven_therapy flag is 'No', i.e. the flag does not
# mark them as having had no ven therapy.
received_ven = patient_data['no_ven_therapy'] == 'No'
ven_patients = patient_data.loc[received_ven].index

In [17]:
# Cohort size: number of patients selected by the no_ven_therapy == 'No' filter.
print(len(ven_patients))

92


In [18]:
# Restrict the patient table to the ven-treated cohort (rows selected by id label).
ven_patient_data = patient_data.loc[ven_patients]

In [19]:
# Same categorical summary as for all patients, restricted to the ven-treated cohort.
ven_patient_data[['Gender', 'age_group_scr', 'status']].describe()

Unnamed: 0,Gender,age_group_scr,status
count,92,92,92
unique,2,2,2
top,Male,75 years or younger,Exitus
freq,53,61,61


In [20]:
# Numeric summary of the OS and PFS time columns for the ven-treated cohort.
ven_patient_data[['OS_time_ven2_days', 'PFS_time_days']].describe()

Unnamed: 0,OS_time_ven2_days,PFS_time_days
count,92.0,92.0
mean,415.565217,292.967391
std,274.854739,273.417985
min,6.0,1.0
25%,181.75,85.0
50%,395.0,232.5
75%,603.75,432.75
max,1120.0,1120.0


In [21]:
# Ven-treated patients per best-response category. Rows with a missing
# best_response are dropped by groupby (dropna=True by default), so the
# counts can sum to fewer patients than the cohort contains.
ven_patient_data.groupby('best_response')['Study_subject_Id'].count()

best_response
CR             43
CRh             2
CRi            14
MLFS           13
PR/SD/PD/RD    18
Name: Study_subject_Id, dtype: int64

In [22]:
# Ven-treated patients per diagnosis group (incl_dg_for_tables).
ven_patient_data.groupby('incl_dg_for_tables')['Study_subject_Id'].count()

incl_dg_for_tables
De novo / ND AML    48
R/R AML             32
sAML                12
Name: Study_subject_Id, dtype: int64

In [23]:
# Distribution of the per-patient neut / blast / cycle row counts in the ven-treated cohort.
all_patient_observation_counts.loc[ven_patients].describe()

Unnamed: 0,neut,blast,cycle
count,92.0,92.0,92.0
mean,19.695652,6.021739,9.782609
std,15.416374,3.196764,9.464597
min,3.0,1.0,1.0
25%,8.0,4.0,3.0
50%,14.5,5.0,5.5
75%,29.75,8.0,15.25
max,59.0,14.0,37.0


## Modeling data

This includes all patients with enough data for ODE modeling (5 points)

In [24]:
# The unique row labels of the RMSE table are taken as the ids of the modeled patients.
modeling_patients = rmse_data_combined.index.unique()

In [25]:
# Cohort size: number of distinct patient ids in the RMSE table.
print(len(modeling_patients))

71


In [26]:
# Restrict the patient table to the modeling cohort (rows selected by id label).
modeling_patient_data = patient_data.loc[modeling_patients]

In [27]:
# Same categorical summary as for all patients, restricted to the modeling cohort.
modeling_patient_data[['Gender', 'age_group_scr', 'status']].describe()

Unnamed: 0,Gender,age_group_scr,status
count,71,71,71
unique,2,2,2
top,Male,75 years or younger,Exitus
freq,40,47,41


In [28]:
# Numeric summary of the OS and PFS time columns for the modeling cohort.
modeling_patient_data[['OS_time_ven2_days', 'PFS_time_days']].describe()

Unnamed: 0,OS_time_ven2_days,PFS_time_days
count,71.0,71.0
mean,493.309859,370.239437
std,245.786992,264.784142
min,105.0,1.0
25%,290.5,158.0
50%,466.0,329.0
75%,653.5,539.0
max,1120.0,1120.0


In [29]:
# Modeling-cohort patients per best-response category
# (rows with a missing best_response would be left out by groupby).
modeling_patient_data.groupby('best_response')['Study_subject_Id'].count()

best_response
CR             41
CRh             1
CRi            14
MLFS            9
PR/SD/PD/RD     6
Name: Study_subject_Id, dtype: int64

In [30]:
# Modeling-cohort patients per diagnosis group (incl_dg_for_tables).
modeling_patient_data.groupby('incl_dg_for_tables')['Study_subject_Id'].count()

incl_dg_for_tables
De novo / ND AML    40
R/R AML             22
sAML                 9
Name: Study_subject_Id, dtype: int64

In [31]:
# Distribution of the per-patient neut / blast / cycle row counts in the modeling cohort.
all_patient_observation_counts.loc[modeling_patients].describe()

Unnamed: 0,neut,blast,cycle
count,71.0,71.0,71.0
mean,23.746479,6.957746,12.183099
std,15.286519,2.915165,9.527869
min,3.0,2.0,2.0
25%,11.0,5.0,4.5
50%,19.0,6.0,9.0
75%,36.0,8.0,19.0
max,59.0,14.0,37.0


## Train/test data

This includes all patients with enough data for the train/test split.

In [32]:
# Patient ids are taken from the train table's index only; the test table is
# not consulted here.
# NOTE(review): confirm that every patient in the test table also appears in the train table.
train_test_patients = patient_cycle_train_table.index.unique()

In [33]:
# Cohort size: number of distinct patient ids in the train table.
print(len(train_test_patients))

33


In [34]:
# Restrict the patient table to the train/test cohort (rows selected by id label).
train_test_patient_data = patient_data.loc[train_test_patients]

In [35]:
# Same categorical summary as for all patients, restricted to the train/test cohort.
train_test_patient_data[['Gender', 'age_group_scr', 'status']].describe()

Unnamed: 0,Gender,age_group_scr,status
count,33,33,33
unique,2,2,2
top,Male,75 years or younger,Alive
freq,20,22,20


In [36]:
# Numeric summary of the OS and PFS time columns for the train/test cohort.
train_test_patient_data[['OS_time_ven2_days', 'PFS_time_days']].describe()

Unnamed: 0,OS_time_ven2_days,PFS_time_days
count,33.0,33.0
mean,614.636364,534.30303
std,223.354793,246.73651
min,249.0,215.0
25%,442.0,344.0
50%,585.0,533.0
75%,702.0,673.0
max,1120.0,1120.0


In [37]:
# Train/test-cohort patients per best-response category
# (rows with a missing best_response would be left out by groupby).
train_test_patient_data.groupby('best_response')['Study_subject_Id'].count()

best_response
CR      24
CRi      8
MLFS     1
Name: Study_subject_Id, dtype: int64

In [38]:
# Train/test-cohort patients per diagnosis group (incl_dg_for_tables).
train_test_patient_data.groupby('incl_dg_for_tables')['Study_subject_Id'].count()

incl_dg_for_tables
De novo / ND AML    25
R/R AML              6
sAML                 2
Name: Study_subject_Id, dtype: int64

In [39]:
# Distribution of the per-patient neut / blast / cycle row counts in the train/test cohort.
all_patient_observation_counts.loc[train_test_patients].describe()

Unnamed: 0,neut,blast,cycle
count,33.0,33.0,33.0
mean,33.090909,8.424242,18.181818
std,14.725326,2.872611,9.254913
min,11.0,4.0,7.0
25%,19.0,6.0,10.0
50%,33.0,8.0,16.0
75%,45.0,10.0,25.0
max,59.0,14.0,37.0
