### Import Libraries, Mount Storage, Load Data

In [None]:
import json
import pandas as pd
import numpy as np

from collections import defaultdict, Counter

from lightgbm import LGBMRegressor
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error

In [None]:
# Cam be 'add', 'onset' or 'hist'
target_var = 'onset'
seed = 42

In [None]:
########## For Colab ##########
!pip install ts2vec
from ts2vec import TS2Vec

########## Personal ##########
from google.colab import drive
drive.mount('/content/drive')
with open(f'/content/drive/MyDrive/datasets/dataset_{target_var}.json') as f:
    content = f.read()
    data = json.loads(content)

########## Enterprise ##########
# import gcsfs
# fs = gcsfs.GCSFileSystem()
# with fs.open('gs://modoo-eod/users/datasets/dataset_hist.json') as f:
#     content = f.read()
#     data = json.loads(content)

########## Local ##########
# with open("../../datasets/dataset_hist.json") as f:
#     content=f.read()
#     data=json.loads(content)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Data Cleaning

* There are null values present in the `static` field ; null values come from missing "Age" and "BMI"

* Replace null values with the respective means

In [None]:
df = pd.DataFrame.from_records(data)

print(len(df), "Measurements")

age_mean = np.mean([i[0] for i in df['static'] if pd.notna(i[0])])
bmi_mean = np.mean([i[1] for i in df['static'] if pd.notna(i[1])])

cleaned_data = []
for _, m in enumerate(data):

    # Remove measurements with empty windows
    if len(m['uc_windows']) == 0 or len(m['fhr_windows']) == 0:
        continue

    # Handle NaN values
    static = m['static'].copy()
    if pd.isna(m['static'][0]):
        static[0] = age_mean
    if pd.isna(m['static'][1]):
        static[1] = bmi_mean

    copy = m.copy()
    copy['static'] = static
    cleaned_data.append(copy)

cleaned_df = pd.DataFrame(cleaned_data)

print(len(cleaned_df), "Cleaned Measurements")

3681 Measurements
3661 Cleaned Measurements


### Train-Test Split (Stratified)

* Stratified splitting of Train-Test groups ; ensures similar gestational age distribution in Train and Test groups

In [None]:
########## DATASET 2A ##########
# cleaned_df["gest_age_weeks"] = [(i[-1]//7)+1 for i in cleaned_df["static"]]

# df_train = cleaned_df.groupby(
#     "gest_age_weeks",
#     group_keys=False
# ).apply(lambda x: x.sample(frac=0.8, random_state=seed), include_groups=True)

# df_test = cleaned_df.drop(df_train.index)

# train = df_train.to_dict(orient='records')
# test  = df_test.to_dict(orient='records')

# print(f"Train : {len(train)} measurements")
# print(f"Test  : {len(test)} measurements")
##############################

########## DATASET 2B ##########
cleaned_df["gest_age_weeks"] = [(i[-1]//7)+1 for i in cleaned_df["static"]]

patient_ids = cleaned_df['mobile'].unique()

rng = np.random.RandomState(seed)
rng.shuffle(patient_ids)

n_train_patients = int(0.8 * len(patient_ids))
train_patients = patient_ids[:n_train_patients]
test_patients  = patient_ids[n_train_patients:]

df_train = cleaned_df[cleaned_df['mobile'].isin(train_patients)].copy()
df_test  = cleaned_df[cleaned_df['mobile'].isin(test_patients)].copy()

train = df_train.to_dict(orient="records")
test  = df_test.to_dict(orient="records")

print(f"Train : {len(train)} measurements from {len(train_patients)} patients")
print(f"Test  : {len(test)} measurements from {len(test_patients)} patients")
##############################

print()
print("Fields:")
for _, k in enumerate(train[0]):
    print(f"{_+1}: {k}")

Train : 2698 measurements from 45 patients
Test  : 963 measurements from 12 patients

Fields:
1: mobile
2: measurement_date
3: static
4: uc_raw
5: fhr_raw
6: uc_windows
7: fhr_windows
8: target
9: gest_age_weeks


### Pre-Compute TS2Vec Embeddings

* Each measurement (2048 seconds) is converted to a 320-dimension embedding

* Measurements must be passed to TS2Vec in the format `(n_instances x n_timestamps x n_features)`

In [None]:
# Train UC, FHR
train_uc  = np.expand_dims(np.array([i['uc_raw'] for i in train]), 2)
train_fhr = np.expand_dims(np.array([i['fhr_raw'] for i in train]), 2)
# print("Train UC  :", train_uc.shape)
# print("Train FHR :", train_fhr.shape)

# Test UC, FHR
test_uc  = np.expand_dims(np.array([i['uc_raw'] for i in test]), 2)
test_fhr = np.expand_dims(np.array([i['fhr_raw'] for i in test]), 2)
# print("Test UC   :", test_uc.shape)
# print("Test FHR  :", test_fhr.shape)

ts_model = TS2Vec(
    input_dims = 1,
    output_dims = 320,
    device = 0,
    batch_size = 32
)

# Train UC, FHR embeddings
train_uc_embed    = ts_model.encode(train_uc, encoding_window="full_series")
train_fhr_embed   = ts_model.encode(train_fhr, encoding_window="full_series")

# Test UC, FHR embeddings
test_uc_embed     = ts_model.encode(test_uc, encoding_window="full_series")
test_fhr_embed    = ts_model.encode(test_fhr, encoding_window="full_series")

for idx, e in enumerate(train_uc_embed):
    train[idx]['uc_raw'] = e

for idx, e in enumerate(train_fhr_embed):
    train[idx]['fhr_raw'] = e

for idx, e in enumerate(test_uc_embed):
    test[idx]['uc_raw'] = e

for idx, e in enumerate(test_fhr_embed):
    test[idx]['fhr_raw'] = e

### Aggregate Windows

* Each measurement has different number of UC windows and FHR windows

* Each UC window has 20 features whereas each FHR window has 24 features

* Aggregate UC/FHR windows from each measurement since LGBM expects input of consistent size

In [None]:
for i in train:

    uc_w  = np.array([[v for _, v in w.items()] for w in i['uc_windows']], dtype=np.float32)
    fhr_w = np.array([[v for _, v in w.items()] for w in i['fhr_windows']], dtype=np.float32)

    i['uc_windows']  = uc_w.mean(axis=0)
    i['fhr_windows'] = fhr_w.mean(axis=0)

    assert len(i['uc_windows']) == 20
    assert len(i['fhr_windows']) == 24

for i in test:

    uc_w = np.array([[v for _, v in w.items()] for w in i['uc_windows']], dtype=np.float32)
    fhr_w = np.array([[v for _, v in w.items()] for w in i['fhr_windows']], dtype=np.float32)

    i['uc_windows']  = uc_w.mean(axis=0)
    i['fhr_windows'] = fhr_w.mean(axis=0)

    assert len(i['uc_windows']) == 20
    assert len(i['fhr_windows']) == 24

### Build Dataset

* Concatenate features from each measurement together to form a single vector

In [None]:
FEATURE_ORDER = ["uc_raw", "fhr_raw", "fhr_windows", "uc_windows", "static"]
FEATURE_DIMS  = {"uc_raw":320, "fhr_raw":320, "fhr_windows":24, "uc_windows":20, "static":8}

def to_feature_vec(m):

    parts = []

    for k in FEATURE_ORDER:

        v = np.asarray(m[k], dtype=np.float32).ravel()

        assert v.size == FEATURE_DIMS[k]

        parts.append(v)

    x = np.concatenate(parts).astype(np.float32, copy=False)

    return x

def make_xy(measurements):

    X = np.stack([to_feature_vec(m) for m in measurements], axis=0)

    y = np.asarray([m['target'] for m in measurements], dtype=np.float32).ravel()

    groups = [m['gest_age_weeks'] for m in measurements]

    return X, y, groups

X_train, y_train, train_groups  = make_xy(train)
X_test, y_test, test_groups     = make_xy(test)

print(f"Train: {X_train.shape}")
train_groups_count = dict(Counter(sorted(train_groups)))
for k, v in train_groups_count.items():
    print(f"{k}w: {v}")

print()

print(f"Test: {X_test.shape}")
test_groups_count = dict(Counter(sorted(test_groups)))
for k, v in test_groups_count.items():
    print(f"{k}w: {v}")

Train: (2698, 692)
29w: 24
30w: 51
31w: 59
32w: 94
33w: 177
34w: 227
35w: 260
36w: 319
37w: 390
38w: 429
39w: 368
40w: 232
41w: 67
42w: 1

Test: (963, 692)
29w: 1
30w: 21
31w: 46
32w: 63
33w: 77
34w: 86
35w: 111
36w: 124
37w: 113
38w: 115
39w: 95
40w: 89
41w: 22


### Hyperparameter Tuning

In [None]:
# train_patient_ids = [m["mobile"] for m in train]
# cv = GroupKFold(n_splits=5)

# base_lgbm = LGBMRegressor(
#     objective="regression",
#     random_state=seed,
#     n_jobs=-1,
#     verbose=-1
# )

# param_dist = {
#     "n_estimators"      : np.arange(500, 4001, 250),
#     "learning_rate"     : np.logspace(-3, -1, 10),
#     "subsample"         : np.linspace(0.6, 1.0, 5),
#     "colsample_bytree"  : np.linspace(0.6, 1.0, 5),
#     "num_leaves"        : np.arange(16, 256, 8),
#     "max_depth"         : [-1, 4, 6, 8, 10, 12],
# }

# search = RandomizedSearchCV(
#     estimator=base_lgbm,
#     param_distributions=param_dist,
#     n_iter=50,
#     scoring="neg_mean_absolute_error",
#     cv=cv,
#     verbose=3,
#     random_state=seed
# )

# search.fit(X_train, y_train, groups=train_patient_ids)

# print("Best CV MAE:", -search.best_score_)
# print("Best params:")
# for k, v in search.best_params_.items():
#     print(f"  {k}: {v}")

### Training

In [None]:
# Tuned 2 LGBM
lgbm = LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.01,
    subsample=0.6,
    colsample_bytree=0.7,
    num_leaves=32,
    max_depth=8,
    min_child_samples=200,
    reg_alpha=5.0,
    reg_lambda=2.0,
    random_state=seed
)

# # Tuned LGBM
# lgbm = LGBMRegressor(
#     n_estimators=750,
#     learning_rate=0.021544346900318832,
#     subsample=0.8,
#     colsample_bytree=0.9,
#     num_leaves=144,
#     max_depth=-1,
#     random_state=seed
# )

# # Default LGBM
# lgbm = LGBMRegressor(
#     n_estimators=2000,
#     learning_rate=0.03,
#     subsample=0.9,
#     colsample_bytree=0.8,
#     num_leaves=31,
#     max_depth=-1,
#     random_state=seed
# )

lgbm.fit(X_train, y_train)

train_pred = lgbm.predict(X_train)
test_pred  = lgbm.predict(X_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010283 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170972
[LightGBM] [Info] Number of data points in the train set: 2698, number of used features: 678
[LightGBM] [Info] Start training from score 24.640443




### Evaluation

In [None]:

print("Train MAE :", mean_absolute_error(y_train, train_pred))
print("Test  MAE :", mean_absolute_error(y_test,  test_pred))
print()

test_error_groups = defaultdict(list)
test_abs_error = np.abs(test_pred-y_test)

for g, e in zip(test_groups, test_abs_error):

    test_error_groups[g].append(e)

print("Test MAE by Gestational Age Weeks:")
for k, v in sorted(test_error_groups.items(), key=lambda x: x[0]):

    print(f"{k}w: {np.mean(v):.3f} ({len(v)} measurements)")

Train MAE : 1.6819221174826064
Test  MAE : 7.234949692007967

Test MAE by Gestational Age Weeks:
29w: 23.926 (1 measurements)
30w: 12.108 (21 measurements)
31w: 13.678 (46 measurements)
32w: 13.873 (63 measurements)
33w: 8.880 (77 measurements)
34w: 6.206 (86 measurements)
35w: 6.403 (111 measurements)
36w: 6.794 (124 measurements)
37w: 7.254 (113 measurements)
38w: 6.739 (115 measurements)
39w: 5.417 (95 measurements)
40w: 3.097 (89 measurements)
41w: 1.367 (22 measurements)
