### Import Libraries, Mount Storage, Load Data

In [1]:
import json
import pandas as pd
import numpy as np

from collections import defaultdict, Counter

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# Cam be 'add', 'onset' or 'hist'
target_var = 'onset'

In [3]:
########## Personal ##########
# !pip install ts2vec
# from ts2vec import TS2Vec
# from google.colab import drive
# drive.mount('/content/drive')
# with open(f'/content/drive/MyDrive/datasets/dataset_{target_var}.json') as f:
#     content = f.read()
#     data = json.loads(content)

########## Enterprise ##########
# !pip install ts2vec
# from ts2vec import TS2Vec
# import gcsfs
# fs = gcsfs.GCSFileSystem()
# with fs.open('gs://modoo-eod/users/datasets/dataset_hist.json') as f:
#     content = f.read()
#     data = json.loads(content)

######### Local ##########
from ts2vec import TS2Vec
with open(f"../datasets/dataset_{target_var}.json") as f:
    content=f.read()
    data=json.loads(content)

### Data Cleaning

* There are null values present in the `static` field ; null values come from missing "Age" and "BMI"

* Replace null values with the respective means

In [4]:
df = pd.DataFrame.from_records(data)

print(len(df), "Measurements")

age_mean = np.mean([i[0] for i in df['static'] if pd.notna(i[0])])
bmi_mean = np.mean([i[1] for i in df['static'] if pd.notna(i[1])])

cleaned_data = []
for _, m in enumerate(data):

    # Remove measurements with empty windows
    if len(m['uc_windows']) == 0 or len(m['fhr_windows']) == 0:
        continue

    # Handle NaN values
    static = m['static'].copy()
    if pd.isna(m['static'][0]):
        static[0] = age_mean
    if pd.isna(m['static'][1]):
        static[1] = bmi_mean

    copy = m.copy()
    copy['static'] = static
    cleaned_data.append(copy)

cleaned_df = pd.DataFrame(cleaned_data)

print(len(cleaned_df), "Cleaned Measurements")

3681 Measurements
3661 Cleaned Measurements


### Train-Test Split (Stratified)

* Stratified splitting of Train-Test groups ; ensures similar gestational age distribution in Train and Test groups

In [5]:
cleaned_df["gest_age_weeks"] = [(i[-1]//7)+1 for i in cleaned_df["static"]]

df_train = cleaned_df.groupby(
    "gest_age_weeks",
    group_keys=False
).apply(lambda x: x.sample(frac=0.8, random_state=42), include_groups=True)

df_test = cleaned_df.drop(df_train.index)

train = df_train.to_dict(orient='records')
test  = df_test.to_dict(orient='records')

print(f"Train : {len(train)} measurements")
print(f"Test  : {len(test)} measurements")

print()
print("Fields:")
for _, k in enumerate(train[0]):
    print(f"{_+1}: {k}")

Train : 2928 measurements
Test  : 733 measurements

Fields:
1: mobile
2: measurement_date
3: static
4: uc_raw
5: fhr_raw
6: uc_windows
7: fhr_windows
8: target
9: gest_age_weeks


  ).apply(lambda x: x.sample(frac=0.8, random_state=42), include_groups=True)


### Pre-Compute TS2Vec Embeddings

* Each measurement (2048 seconds) is converted to a 320-dimension embedding

* Measurements must be passed to TS2Vec in the format `(n_instances x n_timestamps x n_features)`

In [None]:
# Train UC, FHR
train_uc  = np.expand_dims(np.array([i['uc_raw'] for i in train]), 2)
train_fhr = np.expand_dims(np.array([i['fhr_raw'] for i in train]), 2)
# print("Train UC  :", train_uc.shape)
# print("Train FHR :", train_fhr.shape)

# Test UC, FHR
test_uc  = np.expand_dims(np.array([i['uc_raw'] for i in test]), 2)
test_fhr = np.expand_dims(np.array([i['fhr_raw'] for i in test]), 2)
# print("Test UC   :", test_uc.shape)
# print("Test FHR  :", test_fhr.shape)

ts_model = TS2Vec(
    input_dims = 1,
    output_dims = 320,
    device = 0,
    batch_size = 32
)

# Train UC, FHR embeddings
train_uc_embed    = ts_model.encode(train_uc, encoding_window="full_series")
train_fhr_embed   = ts_model.encode(train_fhr, encoding_window="full_series")

# Test UC, FHR embeddings
test_uc_embed     = ts_model.encode(test_uc, encoding_window="full_series")
test_fhr_embed    = ts_model.encode(test_fhr, encoding_window="full_series")

for idx, e in enumerate(train_uc_embed):
    train[idx]['uc_raw'] = e

for idx, e in enumerate(train_fhr_embed):
    train[idx]['fhr_raw'] = e

for idx, e in enumerate(test_uc_embed):
    test[idx]['uc_raw'] = e

for idx, e in enumerate(test_fhr_embed):
    test[idx]['fhr_raw'] = e

### Aggregate Windows

* Each measurement has different number of UC windows and FHR windows

* Each UC window has 20 features whereas each FHR window has 24 features

* Aggregate UC/FHR windows from each measurement since LGBM expects input of consistent size

In [51]:
for i in train:

    uc_w  = np.array([[v for _, v in w.items()] for w in i['uc_windows']], dtype=np.float32)
    fhr_w = np.array([[v for _, v in w.items()] for w in i['fhr_windows']], dtype=np.float32)

    i['uc_windows']  = uc_w.mean(axis=0)
    i['fhr_windows'] = fhr_w.mean(axis=0)

    assert len(i['uc_windows']) == 20
    assert len(i['fhr_windows']) == 24

for i in test:

    uc_w = np.array([[v for _, v in w.items()] for w in i['uc_windows']], dtype=np.float32)
    fhr_w = np.array([[v for _, v in w.items()] for w in i['fhr_windows']], dtype=np.float32)

    i['uc_windows']  = uc_w.mean(axis=0)
    i['fhr_windows'] = fhr_w.mean(axis=0)

    assert len(i['uc_windows']) == 20
    assert len(i['fhr_windows']) == 24

### Build Dataset

* Concatenate features from each measurement together to form a single vector

In [52]:
FEATURE_ORDER = ["uc_raw", "fhr_raw", "fhr_windows", "uc_windows", "static"]
FEATURE_DIMS  = {"uc_raw":320, "fhr_raw":320, "fhr_windows":24, "uc_windows":20, "static":8}

def to_feature_vec(m):

    parts = []

    for k in FEATURE_ORDER:

        v = np.asarray(m[k], dtype=np.float32).ravel()

        assert v.size == FEATURE_DIMS[k]

        parts.append(v)

    x = np.concatenate(parts).astype(np.float32, copy=False)

    return x

def make_xy(measurements):

    X = np.stack([to_feature_vec(m) for m in measurements], axis=0)

    y = np.asarray([m['target'] for m in measurements], dtype=np.float32).ravel()

    groups = [m['gest_age_weeks'] for m in measurements]

    return X, y, groups

X_train, y_train, train_groups  = make_xy(train)
X_test, y_test, test_groups     = make_xy(test)

print(f"Train: {X_train.shape}")
train_groups_count = dict(Counter(sorted(train_groups)))
for k, v in train_groups_count.items():
    print(f"{k}w: {v}")

print()

print(f"Test: {X_test.shape}")
test_groups_count = dict(Counter(sorted(test_groups)))
for k, v in test_groups_count.items():
    print(f"{k}w: {v}")

Train: (3189, 692)
29w: 20
30w: 58
31w: 85
32w: 135
33w: 225
34w: 288
35w: 325
36w: 379
37w: 442
38w: 474
39w: 407
40w: 272
41w: 78
42w: 1

Test: (797, 692)
29w: 5
30w: 14
31w: 21
32w: 34
33w: 56
34w: 72
35w: 81
36w: 95
37w: 110
38w: 119
39w: 102
40w: 68
41w: 20


### Training

* Train LGBM on default parameters (can tune later on)

In [53]:
lgbm = LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

lgbm.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019632 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170994
[LightGBM] [Info] Number of data points in the train set: 3189, number of used features: 679
[LightGBM] [Info] Start training from score 26.056892


### Evaluation

In [54]:
train_pred = lgbm.predict(X_train)
test_pred  = lgbm.predict(X_test)

print("Train MAE :", mean_absolute_error(y_train, train_pred))
print("Test  MAE :", mean_absolute_error(y_test,  test_pred))
print()

test_error_groups = defaultdict(list)
test_abs_error = np.abs(test_pred-y_test)

for g, e in zip(test_groups, test_abs_error):

    test_error_groups[g].append(e)

print("Test MAE by Gestational Age Weeks:")
for k, v in sorted(test_error_groups.items(), key=lambda x: x[0]):

    print(f"{k}w: {np.mean(v):.3f} ({len(v)} measurements)")

Train MAE : 0.017865323029245907
Test  MAE : 2.3567425158123476

Test MAE by Gestational Age Weeks:
29w: 5.032 (5 measurements)
30w: 4.962 (14 measurements)
31w: 3.270 (21 measurements)
32w: 5.211 (34 measurements)
33w: 2.570 (56 measurements)
34w: 2.152 (72 measurements)
35w: 1.966 (81 measurements)
36w: 1.740 (95 measurements)
37w: 1.900 (110 measurements)
38w: 2.496 (119 measurements)
39w: 2.408 (102 measurements)
40w: 1.735 (68 measurements)
41w: 2.231 (20 measurements)


