In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from synthcity.utils.serialization import save_to_file, load_from_file

In [2]:
base_path = Path("/home/bcebere/Documents/datasets/ward")

train_static = pd.read_csv(base_path / "ward_static_train_data.csv.gz")
train_temporal = pd.read_csv(base_path / "ward_temporal_train_data_eav.csv.gz")

test_static = pd.read_csv(base_path / "ward_static_test_data.csv.gz")
test_temporal = pd.read_csv(base_path / "ward_temporal_test_data_eav.csv.gz")

In [3]:
import numpy as np
import pandas as pd

def process_temporal(in_static_data, in_temporal_data):
    all_temporal_data = []
    horizons = []
    
    for uid in in_static_data["id"].unique():
        local_temporal_idx = in_temporal_data["id"] == uid
        local_temporal = in_temporal_data[local_temporal_idx]
        columns = sorted(local_temporal["variable"].unique())
        times = sorted(local_temporal["time"].unique())

        temporal_data = pd.DataFrame([], columns = columns)


        for horizon in times:
            local_temporal_horizon_idx = local_temporal["time"] == horizon
            local_temporal_horizon = local_temporal[local_temporal_horizon_idx]

            horizon_data = pd.DataFrame(-1 * np.ones((1, len(columns))), columns = columns)

            proc = local_temporal_horizon[["variable", "value"]]
            proc.index = local_temporal_horizon["variable"]
            proc = proc.drop(columns = ["variable"])
            proc = proc.T.reset_index(drop = True)
            print(len(proc.columns), len(columns))
            horizon_data[proc.columns] = proc

            temporal_data = pd.concat([temporal_data, horizon_data], ignore_index = True)
        temporal_data.index = times

        for col in columns:
            if col not in temporal_data:
                temporal_data[col] = -1
        horizons.append(temporal_data["time"])
        all_temporal_data.append(temporal_data[columns])
        
    assert len(all_temporal_data) == len(in_static_data)

    return all_temporal_data, horizons

def eav_to_wide(df):
    """Transform EAV format to WIDE format.
    
    Args:
        - df: EAV format dataframe
        
    Returns:
        - df_wide: WIDE format dataframe.    
    """
    # Original data needs the following four column name in order.
    col_names = list(df.columns)
    assert col_names[0] == "id"
    assert col_names[1] == "time"
    assert col_names[2] == "variable"
    assert col_names[3] == "value"

    # Convert EAV format to WIDE format
    df_wide = pd.pivot_table(df, index=["id", "time"], columns="variable", values="value").reset_index(level=[0, 1])
    return df_wide


train_temporal_wide = eav_to_wide(train_temporal)
test_temporal_wide = eav_to_wide(test_temporal)

In [4]:
train_temporal_wide[train_temporal_wide["id"] == 1]

variable,id,time,Best Motor Response,Best Verbal Response,CHLORIDE,CREATINEINE,DBP,Eye Opening,GLUCLOSE,Glasgow Coma Scale Score,...,POTASSIUM,Pulse,Respiratory Rate,SBP,SODIUM,SpO2,TOTAL CO2,Temperature,UREA NITROGEN,WHITE BLOOD CELL COUNT
0,1,0.0,5.0,5.0,99.0,0.6,107.00,3.0,133.0,13.0,...,3.8,78.0,12.00,174.0,136.0,100.00,21.0,98.6,8.0,19.75
1,1,2.0,5.0,5.0,,,96.50,3.0,,13.0,...,,85.5,17.00,156.5,,100.00,,98.1,,
2,1,3.0,5.0,5.0,,,104.00,3.0,,13.0,...,,79.0,18.00,169.0,,100.00,,98.1,,
3,1,4.0,5.0,5.0,,,115.00,3.0,,13.0,...,,73.0,18.00,177.0,,100.00,,98.1,,
4,1,6.0,5.0,5.0,,,102.00,3.0,,13.0,...,,76.0,18.00,162.0,,100.00,,98.1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,1,239.0,6.0,5.0,,,65.00,4.0,,15.0,...,,89.0,16.00,110.0,,99.00,,99.5,,
76,1,240.0,,,97.0,0.5,,,103.0,,...,3.7,,,,135.0,,24.5,,6.0,14.99
77,1,241.0,6.0,5.0,,,80.00,4.0,,15.0,...,,89.0,17.00,127.0,,99.00,,97.5,,
78,1,247.0,6.0,5.0,,,68.25,4.0,,15.0,...,,88.5,16.75,113.5,,97.25,,97.7,,


In [5]:
def prepare_temporal(temporal_wide):
    temporal = []
    horizons = []
    for k, v in temporal_wide.groupby("id"):
        h = v["time"].values.tolist()
        local_data = v.drop(columns = ["id", "time"])
        local_data.index = h
        horizons.append(h)
        temporal.append(local_data.fillna(0))
    return temporal, horizons

train_temporal, train_horizons = prepare_temporal(train_temporal_wide)
test_temporal, test_horizons = prepare_temporal(test_temporal_wide)

train_outcome = train_static["icu_admission"]
test_outcome = test_static["icu_admission"]
train_static = train_static.drop(columns = ["id", "icu_admission"]).fillna(0)
test_static = test_static.drop(columns = ["id", "icu_admission"]).fillna(0)

In [6]:
assert len(train_temporal) == len(train_static)
assert len(test_temporal) == len(test_static)

In [7]:
from synthcity.plugins.core.dataloader import (
     TimeSeriesDataLoader,
)

dataloader_train = TimeSeriesDataLoader(
    temporal_data=train_temporal,
    temporal_horizons=train_horizons,
    static_data=train_static,
    outcome = train_outcome.to_frame(),
)


dataloader_test = TimeSeriesDataLoader(
    temporal_data=test_temporal,
    temporal_horizons=test_horizons,
    static_data=test_static,
    outcome = test_outcome.to_frame(),
)



In [8]:
from synthcity.plugins.core.models.ts_model import TimeSeriesModel

model = TimeSeriesModel(
        task_type = "classification",  # regression, classification
         n_static_units_in = train_static.shape[0],
         n_temporal_units_in = train_temporal[0].shape[-1],
         n_temporal_window = max([len(t) for t in train_temporal]),
         output_shape = [2],
         mode = "RNN",
)

train_static_eval, train_temporal_eval, train_horizons_eval, train_outcome_eval = dataloader_train.unpack(as_numpy = True)

In [9]:
model.fit(train_static_eval, train_temporal_eval, train_horizons_eval, train_outcome_eval)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (12x151 and 5002x100)

In [11]:
for t in train_temporal_eval:
    print(t.shape)

(80, 38)
(22, 38)
(38, 38)
(48, 38)
(180, 38)
(23, 38)
(34, 38)
(22, 38)
(38, 38)
(29, 38)
(41, 38)
(218, 38)
(56, 38)
(37, 38)
(88, 38)
(155, 38)
(58, 38)
(138, 38)
(7, 38)
(23, 38)
(17, 38)
(17, 38)
(34, 38)
(17, 38)
(16, 38)
(150, 38)
(38, 38)
(20, 38)
(9, 38)
(32, 38)
(117, 38)
(38, 38)
(28, 38)
(50, 38)
(16, 38)
(83, 38)
(5, 38)
(5, 38)
(61, 38)
(18, 38)
(138, 38)
(45, 38)
(85, 38)
(24, 38)
(25, 38)
(138, 38)
(20, 38)
(91, 38)
(55, 38)
(45, 38)
(64, 38)
(40, 38)
(78, 38)
(29, 38)
(18, 38)
(11, 38)
(40, 38)
(10, 38)
(86, 38)
(39, 38)
(65, 38)
(25, 38)
(40, 38)
(61, 38)
(27, 38)
(28, 38)
(166, 38)
(70, 38)
(13, 38)
(89, 38)
(110, 38)
(73, 38)
(54, 38)
(37, 38)
(35, 38)
(33, 38)
(34, 38)
(107, 38)
(152, 38)
(15, 38)
(82, 38)
(16, 38)
(313, 38)
(28, 38)
(16, 38)
(113, 38)
(178, 38)
(245, 38)
(139, 38)
(23, 38)
(31, 38)
(27, 38)
(32, 38)
(61, 38)
(156, 38)
(14, 38)
(93, 38)
(148, 38)
(21, 38)
(16, 38)
(66, 38)
(54, 38)
(90, 38)
(90, 38)
(33, 38)
(70, 38)
(140, 38)
(15, 38)
(28, 38)
(8,

In [10]:
for idx, val in enumerate(train_temporal_eval):
    assert train_temporal_eval[idx].shape[0] == len(train_horizons_eval[idx]), len(train_horizons_eval[idx])

In [None]:
train_temporal[0]