In [352]:
import pandas as pd

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

from ydata_synthetic.synthesizers.regular import RegularSynthesizer
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters

Read the original data and have it preprocessed

In [353]:
data_path = "./data/df_clean.pkl"
data = pd.read_pickle(data_path)

In [354]:
fecha_inicio_far = data["fecha_inicio_far"]
fecha_diag = data["fecha_diag"]

data.drop(columns=["fecha_inicio_far", "fecha_diag"], inplace=True)

In [355]:
cat_cols = list(data.select_dtypes(include="category").columns)
num_cols = list(data.select_dtypes(include='number').columns)

sort_columns = ['ppio_activo',
'sexo',
'familia_eii',
'loc_ec',
'comp_ec',
'exten_cu',
'tipo_eii',
'preianal',
'ada_previo',
'ifx_previo',
'uste_previo',
'vedo_previo',
'tofa_previo',
'certol_previo',
'golim_previo',
'cx_previa_eii',
'tabaco',
'meis_espondiloartropatías',
'meis_uveitis',
'meis_eritema_nodoso',
'meis_pioderma_gangrenoso',
'meis_colangitis_esclerosanteprimaria',
'meis_estomatitis_aftosa',
'meis_SdSweet',
'meis_psoriasis',
'meis_hidrosiadenitis',
'meis_vasculitis',
'meis_vitiligo',
'meis_osteoporosis',
'meis_fenom_tromboembolico',
'edad_inicio_far',
'edad_diag',
'cort_12m_previo',
'diabetes',
'asma',
'vih',
'migrana',
'calprotectina',
'lab_velocidad_sed_globular',
'lab_sodio',
'lab_aspartato_transaminasa',
'lab_glucosa',
'lab_proteinas_totales',
'lab_urea',
'lab_potasio',
'lab_alanina_transaminasa',
'lab_acido_folico',
'lab_vitamina_b12',
'lab_ferritina',
'lab_creatinina',
'lab_proteina_c_reactiva',
'lab_trigliceridos',
'lab_colesterol',
'lab_volumen_plaquetario_medio',
'lab_plaquetas_recuento',
'lab_basofilos_porcentaje',
'lab_eosinofilos_porcentaje',
'lab_linfocitos_porcentaje',
'lab_neutrofilos_porcentaje',
'lab_basofilos_recuento',
'lab_eosinofilos_recuento',
'lab_monocitos_recuento',
'lab_linfocitos_recuento',
'lab_neutrofilos_recuento',
'lab_leucocitos_recuento',
'lab_dispersion_hematies_volumen',
'lab_hemoglobina_corpuscular_media',
'lab_volumen_corpuscular_medio',
'lab_hematocrito',
'lab_hematies_recuento',
'lab_bilirrubina_total',
'lab_fosfatasa_alcalina',
'lab_alfa_1_glicoproteina_acida',
'res_clin_26', # Target
'res_clin_52', # Target
'rem_clin_52', # Target
'res_clin_104', # Target
'rem_clin_104',] # Target

processed_data = data[sort_columns].copy()

In [356]:
train_data = processed_data.copy()

Se imputan los NA's

In [357]:
train_numeric = train_data[num_cols]

# Crear un imputador iterativo con un modelo de regresión lineal
imputer = IterativeImputer(estimator=LinearRegression(), max_iter=10, random_state=0)

# Imputar valores faltantes
imputed_data = imputer.fit_transform(train_numeric)

df_num_imputed = pd.DataFrame(imputed_data, columns=train_numeric.columns)



In [358]:
df_num_imputed.index = train_data.index

In [359]:
train_data[num_cols] = df_num_imputed[num_cols].round(3)
train_data

Unnamed: 0_level_0,ppio_activo,sexo,familia_eii,loc_ec,comp_ec,exten_cu,tipo_eii,preianal,ada_previo,ifx_previo,...,lab_hematocrito,lab_hematies_recuento,lab_bilirrubina_total,lab_fosfatasa_alcalina,lab_alfa_1_glicoproteina_acida,res_clin_26,res_clin_52,rem_clin_52,res_clin_104,rem_clin_104
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HVM1,USTEKINUMAB,H,0,2,1,0,EC,1,1,1,...,40.331,4.664,0.515,73.155,70.065,1,1,1,1,1
HVM2,VEDOLIZUMAB,M,0,0,0,3,CU,0,1,1,...,0.300,3.350,0.570,68.000,-271.879,1,1,0,0,0
HVM3,USTEKINUMAB,H,0,1,3,0,EC,1,0,1,...,40.377,4.658,0.505,73.475,71.545,0,1,1,1,1
HVM4,VEDOLIZUMAB,H,2,2,2,0,EC,1,1,1,...,39.478,4.641,0.507,69.827,67.415,1,1,0,,
HVM5,USTEKINUMAB,H,2,2,2,0,EC,1,1,1,...,39.547,4.632,0.492,70.308,69.635,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HVM224,USTEKINUMAB,H,0,0,0,2,CU,0,0,1,...,38.000,4.040,0.280,100.000,166.300,1,1,1,0,0
HVM225,USTEKINUMAB,H,0,0,0,3,CU,0,1,1,...,39.800,5.270,0.380,89.000,313.800,0,0,0,0,0
HVM226,USTEKINUMAB,H,0,3,3,0,EC,1,1,1,...,45.000,5.150,0.320,61.000,97.500,1,1,0,0,0
HVM227,USTEKINUMAB,H,0,1,1,0,EC,1,1,1,...,44.300,4.800,0.710,80.000,98.000,1,1,1,,


In [360]:
num_rows_with_na = train_data.isna().any(axis=1).sum()
print(f'Número de filas con al menos un valor nulo: {num_rows_with_na}')

# Remove rows with any NA
train_data_clean = train_data.dropna()
print(train_data_clean.shape)

index_nas = train_data[train_data.isna().any(axis=1)].index
fecha_diag = fecha_diag.drop(index_nas)
fecha_inicio_far = fecha_inicio_far.drop(index_nas)

train_data_clean = train_data_clean.reset_index().drop(columns="ID")
fecha_diag = fecha_diag.reset_index().drop(columns="ID")
fecha_inicio_far = fecha_inicio_far.reset_index().drop(columns="ID")

Número de filas con al menos un valor nulo: 46
(182, 78)


Conver dates to timestamp integer

In [361]:
train_data_clean["fecha_diag_timestamp"] = fecha_diag.astype(int) // 10**9
train_data_clean["fecha_inicio_far_timestamp"] = fecha_diag.astype(int) // 10**9

In [362]:
train_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182 entries, 0 to 181
Data columns (total 80 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   ppio_activo                           182 non-null    category
 1   sexo                                  182 non-null    category
 2   familia_eii                           182 non-null    category
 3   loc_ec                                182 non-null    category
 4   comp_ec                               182 non-null    category
 5   exten_cu                              182 non-null    category
 6   tipo_eii                              182 non-null    category
 7   preianal                              182 non-null    category
 8   ada_previo                            182 non-null    category
 9   ifx_previo                            182 non-null    category
 10  uste_previo                           182 non-null    category
 11  vedo_p

Generate synthetic data

In [363]:
print(train_data_clean[cat_cols].dtypes)


ppio_activo                             category
sexo                                    category
familia_eii                             category
loc_ec                                  category
comp_ec                                 category
exten_cu                                category
tipo_eii                                category
preianal                                category
ada_previo                              category
ifx_previo                              category
uste_previo                             category
vedo_previo                             category
tofa_previo                             category
certol_previo                           category
golim_previo                            category
cx_previa_eii                           category
tabaco                                  category
meis_espondiloartropatías               category
meis_uveitis                            category
meis_eritema_nodoso                     category
meis_pioderma_gangre

In [364]:
cat_cols = list(train_data_clean.select_dtypes(include="category").columns)
num_cols = list(train_data_clean.select_dtypes(include='number').columns)

train_data_clean[cat_cols] = train_data_clean[cat_cols].astype(str)


synth = RegularSynthesizer(modelname='fast')
synth.fit(data=train_data_clean, num_cols=num_cols, cat_cols=cat_cols)

Hyperparameter search: 100%|██████████| 8/8 [00:04<00:00,  1.68it/s]


Generate 500 rows

In [365]:
synth_data = synth.sample(1000)
synth_data

Unnamed: 0,ppio_activo,sexo,familia_eii,loc_ec,comp_ec,exten_cu,tipo_eii,preianal,ada_previo,ifx_previo,...,lab_bilirrubina_total,lab_fosfatasa_alcalina,lab_alfa_1_glicoproteina_acida,res_clin_26,res_clin_52,rem_clin_52,res_clin_104,rem_clin_104,fecha_diag_timestamp,fecha_inicio_far_timestamp
0,VEDOLIZUMAB,M,0,0,0,3,CU,0,0,0,...,0.213100,92.988628,200.044479,0,0,0,0,1,1639200521,1638760985
1,VEDOLIZUMAB,M,0,0,0,3,CU,0,0,1,...,0.411969,66.385975,62.947759,0,1,0,1,0,938467017,935997132
2,USTEKINUMAB,M,0,0,0,2,CU,0,1,1,...,0.743418,74.631522,79.462422,1,1,1,1,1,716299462,715087827
3,VEDOLIZUMAB,H,0,0,0,2,CU,0,1,0,...,0.010493,108.100160,4.718868,1,1,1,1,1,1345650586,1346474958
4,VEDOLIZUMAB,M,0,0,0,3,CU,0,1,0,...,0.199082,111.490397,117.877358,1,1,0,1,1,1556366959,1558013081
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,USTEKINUMAB,H,0,1,2,0,EC,1,1,1,...,0.097636,84.406909,118.203019,1,1,0,0,0,1459116296,1459767296
996,USTEKINUMAB,M,0,1,2,0,EC,0,0,1,...,0.385416,108.313646,159.650650,1,1,1,1,1,1147481808,1146268328
997,VEDOLIZUMAB,M,0,1,1,0,EC,0,0,1,...,0.386420,90.888795,116.663942,1,1,1,0,0,1154422508,1154167093
998,USTEKINUMAB,M,0,1,2,0,EC,0,0,0,...,0.542376,74.548122,113.437789,1,0,0,1,1,1466192254,1466798441


Set timestamps as dates again

In [366]:
synth_data['fecha_inicio_far'] = pd.to_datetime(synth_data['fecha_inicio_far_timestamp'], unit='s')
synth_data.drop(columns=["fecha_inicio_far_timestamp"], inplace=True)
synth_data['fecha_diag'] = pd.to_datetime(synth_data['fecha_diag_timestamp'], unit='s')
synth_data.drop(columns=["fecha_diag_timestamp"], inplace=True)

In [367]:
# Assuming 'fech_diag' and 'fecha_inicio_far_timestamp' are the date columns
mask = synth_data['fecha_diag'] >= synth_data['fecha_inicio_far']

# Count the number of rows that do not satisfy the condition
num_rows_not_satisfying_condition = (~mask).sum()

# Display the count of rows not satisfying the condition
print(f"Number of rows not satisfying the condition: {num_rows_not_satisfying_condition}")

# Drop rows that do not satisfy the condition
synth_data = synth_data[mask]

Number of rows not satisfying the condition: 494


In [368]:
list_ids =[f"HUVM_{i}" for i in range(1, len(synth_data) + 1)]
synth_data= synth_data.copy()
synth_data.loc[:, 'ID'] = list_ids

synth_data = synth_data[sort_columns].copy()

synth_data.head()

Unnamed: 0,ppio_activo,sexo,familia_eii,loc_ec,comp_ec,exten_cu,tipo_eii,preianal,ada_previo,ifx_previo,...,lab_hematocrito,lab_hematies_recuento,lab_bilirrubina_total,lab_fosfatasa_alcalina,lab_alfa_1_glicoproteina_acida,res_clin_26,res_clin_52,rem_clin_52,res_clin_104,rem_clin_104
0,VEDOLIZUMAB,M,0,0,0,3,CU,0,0,0,...,45.973943,4.542077,0.2131,92.988628,200.044479,0,0,0,0,1
1,VEDOLIZUMAB,M,0,0,0,3,CU,0,0,1,...,43.618085,5.000956,0.411969,66.385975,62.947759,0,1,0,1,0
2,USTEKINUMAB,M,0,0,0,2,CU,0,1,1,...,44.327137,5.118912,0.743418,74.631522,79.462422,1,1,1,1,1
5,VEDOLIZUMAB,H,0,0,0,2,CU,1,1,1,...,38.643741,4.180835,0.374859,109.105071,159.625742,1,1,1,1,1
7,VEDOLIZUMAB,H,0,0,0,2,CU,0,1,0,...,55.185318,5.355321,0.786882,62.261477,193.597199,0,0,1,0,0


save as csv

In [370]:
synth_data.to_csv('data/synthetic_data.csv', index=False)