[![Test In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vanderschaarlab/temporai/blob/main/tutorials/data/tutorial05_other_data_formats.ipynb)

# Data Tutorial 05: Other Data Formats

This tutorial shows additional data formats supported by TemporAI.

> ⚠️ This feature is experimental and may not yet work as expected.

## Data formats

You can view the supported data formats by running:

In [None]:
from tempor import plugin_loader
from tempor.data import samples_experimental  # Load experimental.

import rich.pretty

# Query the plugin loader with the "dataformat" key and pretty-print whatever it returns.
# NOTE(review): `samples_experimental` is imported above only for its import-time effect
# (see the "Load experimental" comment) — presumably it registers the experimental formats; confirm.
dataformat_plugins = plugin_loader.list("dataformat")
rich.pretty.pprint(dataformat_plugins)

## `Dask` data format

`Dask` is a Python library for parallel computing. We provide an interface to
[`Dask` dataframes](https://docs.dask.org/en/stable/dataframe.html), which supports parallel computation.

The example below shows how to load data samples from `Dask` dataframes.

In [None]:
# Static samples example.

import pandas as pd
import numpy as np
import dask.dataframe as dd

from tempor.data import samples_experimental

# Seed NumPy's global RNG so the randomly generated features are reproducible.
np.random.seed(12345)
categories = ["A", "B", "C"]
size = 10

# Build the static features (one row per sample) and index the frame by sample ID.
# NOTE: the dict entries are evaluated top to bottom, so the order of the columns below
# fixes the order of the random draws (and therefore the generated values).
df_s = pd.DataFrame(
    {
        "sample_idx": [f"sample_{i + 1}" for i in range(size)],
        "cat_feat_1": pd.Categorical(np.random.choice(categories, size=size)),
        "cat_feat_2": pd.Categorical(np.random.choice(categories, size=size)),
        "num_feat_1": np.random.uniform(0, 10, size=size),
        "num_feat_2": np.random.uniform(20, 30, size=size),
    }
).set_index("sample_idx", drop=True)

# Convert the pandas dataframe to a dask dataframe, requesting two partitions:
ddf_s = dd.from_pandas(df_s, npartitions=2)  # type: ignore

# Build the static samples object from the dask dataframe (shown as the cell output):
samples_experimental.StaticSamplesDask(ddf_s)  # type: ignore

2023-12-07 20:29:54 | INFO     | tempor.data.samples_experimental:_validate:69 | Validation not yet implemented for Dask data format. Data format consistency is not guaranteed.


Unnamed: 0_level_0,cat_feat_1,cat_feat_2,num_feat_1,num_feat_2
sample_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
sample_1,C,B,0.267897,29.308157
sample_10,C,B,8.062348,27.949706
sample_2,B,B,2.915024,23.640296
sample_3,B,C,3.98744,26.909479
sample_4,B,B,8.072887,21.293146
sample_5,A,C,6.270943,28.326864
sample_6,B,B,9.079249,23.183537
sample_7,C,C,5.563973,27.372023
sample_8,C,A,8.399193,25.967696
sample_9,B,C,0.50488,23.637068


In [None]:
# Time series samples example.

# Long-format time series: one row per (sample, time step) pair, with the
# ("sample_idx", "time_idx") pair used as a two-level index.
df_t = pd.DataFrame(
    {
        "sample_idx": ["a", "a", "a", "a", "b", "b", "c"],
        "time_idx": [1, 2, 3, 4, 2, 4, 9],
        "feat_1": [11, 12, 13, 14, 21, 22, 31],
        "feat_2": [1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 3.1],
    }
).set_index(keys=["sample_idx", "time_idx"], drop=True)

# Convert the multi-index pandas dataframe to a dask dataframe using the library helper:
ddf_t = samples_experimental.multiindex_df_to_compatible_ddf(df_t, npartitions=2)

# Initialize the time series samples object:
samples_experimental.TimeSeriesSamplesDask(ddf_t)  # type: ignore

2023-12-07 20:30:30 | INFO     | tempor.data.samples_experimental:_validate:223 | Validation not yet implemented for Dask data format. Data format consistency is not guaranteed.


Unnamed: 0,Unnamed: 1,feat_1,feat_2
a,1,11.0,1.1
a,2,12.0,1.2
a,3,13.0,1.3
a,4,14.0,1.4
b,2,21.0,2.1
b,4,22.0,2.2
c,9,31.0,3.1


In [None]:
# Event samples example.

# Each event feature cell holds a 2-tuple: a time-like value (integer or timestamp)
# followed by a boolean flag.
df_e = pd.DataFrame(
    {
        "sample_idx": [f"sample_{i}" for i in (1, 2, 3)],
        "feat_1": [(5, True), (6, False), (3, True)],
        "feat_2": [(1, False), (8, False), (8, True)],
        "feat_3": [
            (pd.to_datetime(date), flag)
            for date, flag in (
                ("2000-01-02", False),
                ("2000-01-03", True),
                ("2000-01-01", True),
            )
        ],
    },
).set_index("sample_idx", drop=True)

# Convert the pandas dataframe to a dask dataframe, requesting two partitions:
ddf_e = dd.from_pandas(df_e, npartitions=2)  # type: ignore

# Build the event samples object from the dask dataframe (shown as the cell output):
samples_experimental.EventSamplesDask(ddf_e)  # type: ignore

2023-12-07 20:31:53 | INFO     | tempor.data.samples_experimental:_validate:434 | Validation not yet implemented for Dask data format. Data format consistency is not guaranteed.


Unnamed: 0_level_0,feat_1,feat_2,feat_3
sample_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sample_1,"(5, True)","(1, False)","(2000-01-02 00:00:00, False)"
sample_2,"(6, False)","(8, False)","(2000-01-03 00:00:00, True)"
sample_3,"(3, True)","(8, True)","(2000-01-01 00:00:00, True)"
