# User Guide Tutorial 02: Preprocessing › Imputation

This tutorial shows how to use TemporAI `preprocessing.imputation` plugins.

*Skip the below cell if you are not on Google Colab / already have TemporAI installed:*

In [None]:
%pip install temporai

# Or from the repo, for the latest version:
# %pip install git+https://github.com/vanderschaarlab/temporai.git

## All `preprocessing.imputation` plugins

To see all the relevant plugins:

In [1]:
from tempor import plugin_loader

plugin_loader.list()["preprocessing"]["imputation"]

{'static': ['static_tabular_imputer'],
 'temporal': ['ffill', 'ts_tabular_imputer', 'bfill']}

## Using a static data imputation plugin

In [2]:
from tempor.data.datasources import SineDataSource
from tempor import plugin_loader

dataset = SineDataSource(with_missing=True, random_state=42).load()
print(dataset)

model = plugin_loader.get("preprocessing.imputation.static.static_tabular_imputer", static_imputer="mean")
print(model)

2023-10-10 13:02:04 | INFO     | hyperimpute.logger:log_and_print:65 | Iteration imputation: select_model_by_column: True, select_model_by_iteration: True


OneOffPredictionDataset(
    time_series=TimeSeriesSamples([100, *, 5]),
    static=StaticSamples([100, 4]),
    predictive=OneOffPredictionTaskData(targets=StaticSamples([100, 1]))
)
StaticTabularImputer(
    name='static_tabular_imputer',
    category='preprocessing.imputation.static',
    plugin_type='method',
    params={
        'imputer': 'ice',
        'random_state': 0,
        'imputer_params': {'random_state': 0}
    }
)


In [3]:
# Note missingness in static data.

print("Missing value count:", dataset.static.dataframe().isnull().sum().sum())  # type: ignore

dataset.static

Missing value count: 40


Unnamed: 0_level_0,0,1,2,3
sample_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.374540,0.950714,0.731994,0.598658
1,0.156019,0.155995,0.058084,0.866176
2,0.601115,0.708073,0.020584,0.969910
3,0.832443,,0.181825,0.183405
4,0.304242,0.524756,0.431945,0.291229
...,...,...,...,...
95,,0.696737,0.628943,
96,0.735071,0.803481,0.282035,
97,0.750615,0.806835,0.990505,0.412618
98,0.372018,0.776413,0.340804,0.930757


In [4]:
# Note no more missingness in static data.

dataset = model.fit_transform(dataset)  # Or call fit() then transform().

print("Missing value count:", dataset.static.dataframe().isnull().sum().sum())  # type: ignore

dataset.static

2023-10-10 13:02:04 | INFO     | hyperimpute.logger:log_and_print:65 |   > HyperImpute using inner optimization
2023-10-10 13:02:04 | INFO     | hyperimpute.logger:log_and_print:65 |   > Imputation iter 0
2023-10-10 13:02:04 | INFO     | hyperimpute.logger:log_and_print:65 |      >>> Column 0 <-- score -0.23109777030617995 <-- Model linear_regression
2023-10-10 13:02:04 | INFO     | hyperimpute.logger:log_and_print:65 |      >>> Column 1 <-- score -0.22005514968324613 <-- Model linear_regression
2023-10-10 13:02:04 | INFO     | hyperimpute.logger:log_and_print:65 |   > Imputation iter 1
2023-10-10 13:02:04 | INFO     | hyperimpute.logger:log_and_print:65 |      >>> Column 0 <-- score -0.23109777030617995 <-- Model linear_regression
2023-10-10 13:02:04 | INFO     | hyperimpute.logger:log_and_print:65 |      >>> Column 1 <-- score -0.21670750510884584 <-- Model linear_regression
2023-10-10 13:02:04 | INFO     | hyperimpute.logger:log_and_print:65 |   > Imputation iter 2
2023-10-10 13:02:

Missing value count: 0


Unnamed: 0_level_0,0,1,2,3
sample_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.374540,0.950714,0.731994,0.598658
1,0.156019,0.155995,0.058084,0.866176
2,0.601115,0.708073,0.020584,0.969910
3,0.832443,0.450438,0.181825,0.183405
4,0.304242,0.524756,0.431945,0.291229
...,...,...,...,...
95,0.498806,0.696737,0.628943,0.509994
96,0.735071,0.803481,0.282035,0.503886
97,0.750615,0.806835,0.990505,0.412618
98,0.372018,0.776413,0.340804,0.930757


## Using a temporal data imputation plugin

In [5]:
from tempor.data.datasources import SineDataSource
from tempor import plugin_loader

dataset = SineDataSource(with_missing=True, random_state=42).load()
print(dataset)

model = plugin_loader.get("preprocessing.imputation.temporal.bfill")
print(model)

OneOffPredictionDataset(
    time_series=TimeSeriesSamples([100, *, 5]),
    static=StaticSamples([100, 4]),
    predictive=OneOffPredictionTaskData(targets=StaticSamples([100, 1]))
)
BFillImputer(
    name='bfill',
    category='preprocessing.imputation.temporal',
    plugin_type='method',
    params={}
)


In [6]:
# Note missingness in temporal data.

print("Missing value count:", dataset.time_series.dataframe().isnull().sum().sum())

dataset.time_series

Missing value count: 500


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4
sample_idx,time_idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,-0.955338,0.016053,-0.995752,0.948138,0.738158
0,1,-0.896718,0.717189,-0.497625,0.962001,0.968258
0,2,-0.346466,0.999920,0.423104,0.639780,0.972469
0,3,0.393737,0.699299,0.984517,0.094046,0.749807
0,4,0.918072,-0.009290,,,
...,...,...,...,...,...,...
99,5,0.904284,-0.939985,0.994099,-0.984349,0.688521
99,6,0.990911,-0.518593,0.908681,-0.801263,0.813486
99,7,0.757745,0.131791,,-0.110629,0.908965
99,8,,0.723981,0.476023,0.650082,0.971498


In [7]:
# Note no more missingness in temporal data.

dataset = model.fit_transform(dataset)  # Or call fit() then transform().

print("Missing value count:", dataset.time_series.dataframe().isnull().sum().sum())

dataset.time_series

Missing value count: 0


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4
sample_idx,time_idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,-0.955338,0.016053,-0.995752,0.948138,0.738158
0,1,-0.896718,0.717189,-0.497625,0.962001,0.968258
0,2,-0.346466,0.999920,0.423104,0.639780,0.972469
0,3,0.393737,0.699299,0.984517,0.094046,0.749807
0,4,0.918072,-0.009290,-0.167662,-0.893854,-0.127538
...,...,...,...,...,...,...
99,5,0.904284,-0.939985,0.994099,-0.984349,0.688521
99,6,0.990911,-0.518593,0.908681,-0.801263,0.813486
99,7,0.757745,0.131791,0.476023,-0.110629,0.908965
99,8,-0.288052,0.723981,0.476023,0.650082,0.971498


## 🎉 Congratulations!

Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the movement towards *Machine learning and AI for Medicine*, you can do so in the following ways!



### ⭐ Star [TemporAI](https://github.com/vanderschaarlab/temporai) on GitHub

- The easiest way to help our community is by just starring the repos! This helps raise awareness of the tools we're building.



### Check out other projects from [vanderschaarlab](https://github.com/vanderschaarlab)
- 📝 [HyperImpute](https://github.com/vanderschaarlab/hyperimpute)
- 📊 [AutoPrognosis](https://github.com/vanderschaarlab/autoprognosis)
- 🤖 [SynthCity](https://github.com/vanderschaarlab/synthcity)
 