In [1]:
from utils_prediction.database import (gbq_connect, gbq_query)
from utils_prediction.dataloader.mimic4 import dataloader
from utils_prediction.preprocessor import (fill_missing,discretizer,binary_discretizer,one_hot_encoder,prune_features)

from utils_prediction.nn.models import FixedWidthModel

from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, plot_confusion_matrix

import os
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt

#### Get example features from GBQ and save to disk

In [2]:
## Establish connection with GBQ
# The service-account key location can be overridden with the
# GBQ_SERVICE_ACCOUNT_JSON environment variable, so the notebook does not have
# to be edited per user; the original path is kept as the default.
c = gbq_connect(
    service_account_json_path = os.environ.get(
        'GBQ_SERVICE_ACCOUNT_JSON',
        '/hpf/projects/lsung/creds/gbq/mimic.json'
        ),
    project_id = 'mimic-iv-ches'
    )

## Grab example data
df = gbq_query(c, """
    select * from `mimic-iv-ches.demo.mimic4_slice`
    """, verbose = False)

## Save to disk
path = 'data/analysis_id=demo'

# exist_ok=True replaces the check-then-create pattern: it is a no-op when the
# directory is already there and cannot fail if it appears between the check
# and the creation.
os.makedirs(path, exist_ok=True)

pq.write_table(
    pa.Table.from_pandas(df),
    os.path.join(path, 'features.parquet')
    )

Google Big Query Connection Established


#### Load data & split into train,val,test

In [3]:
# Build the loader for the 'demo' analysis (features located under 'data'),
# load its features, then split the result into the train/val/test parts
# used by the cells below.
demo_loader = dataloader(analysis_id='demo', features_fpath='data')
data = demo_loader.load_features().split()

In [4]:
# Size of each split, shown in (train, val, test) order.
tuple(len(split) for split in (data.X_train, data.X_val, data.X_test))

(1400, 300, 300)

#### Preprocessing pipeline

In [5]:
## Pipeline
# Named (label, transformer) steps; Pipeline applies them in the listed order.
preprocessing_steps = [
    ('fill missings', fill_missing(config={'count': 0, 'marital_status': 'None'})),
    ('prune features', prune_features(special_cols={'count': 0})),
    ('discretize counts', binary_discretizer(feature_tags_to_include=['count'])),
    ('discretize measurements', discretizer(feature_tags_to_include=['measurement'])),
    ('one hot encode', one_hot_encoder(feature_tags_to_exclude=['count'])),
]
pipe = Pipeline(preprocessing_steps)

#### Preprocess data

In [6]:
# Fit the pipeline on the training split only ...
data.X_train = pipe.fit_transform(data.X_train)

# ... then apply the already-fitted pipeline to the held-out splits,
# val first and test second.
for held_out in ('X_val', 'X_test'):
    setattr(data, held_out, pipe.transform(getattr(data, held_out)))

#### Generate torch dataloaders

In [7]:
# Convert the preprocessed splits into torch dataloaders. The result is
# indexed by split name further down (e.g. loaders['train']).
loaders = data.to_torch()

In [8]:
#next(iter(loaders['train']))

#### Torch model

In [9]:
# Pull one batch from the training loader to read off the feature width
# (second dimension of the 'features' entry), then size the model's input to it.
first_batch = next(iter(loaders['train']))
n_features = first_batch['features'].shape[1]
m = FixedWidthModel(input_dim=n_features)
# Uncomment to inspect the underlying network's parameters:
#m.model.parameters

cpu


  return torch._C._cuda_getDeviceCount() > 0


In [10]:
# Run training over the 'train' and 'val' phases. The call is left as the
# cell's last expression so its return value is displayed; in the recorded
# output that is a dict with a 'performance' table of per-phase, per-epoch metrics.
m.train(loaders,phases=['train','val'])

Epoch 0/9
----------
Phase: train:
            metric  performance
0              auc     0.522115
1            auprc     0.080049
2            brier     0.130385
3         loss_bce     0.428469
4  specificity_0.5     0.912230
5    precision_0.5     0.079681
6       recall_0.5     0.094093
0             loss     0.428469
Phase: val:
            metric  performance
0              auc     0.626880
1            auprc     0.207200
2            brier     0.069916
3         loss_bce     0.272492
4  specificity_0.5     1.000000
5    precision_0.5     0.000000
6       recall_0.5     0.000000
0             loss     0.272492
Best model updated
Epoch 1/9
----------
Phase: train:
            metric  performance
0              auc     0.624914
1            auprc     0.118459
2            brier     0.066705
3         loss_bce     0.256275
4  specificity_0.5     1.000000
5    precision_0.5     0.000000
6       recall_0.5     0.000000
0             loss     0.256275
Phase: val:
            metric  per

{'performance':      phase  epoch           metric  performance
 0    train      0              auc     0.522115
 1    train      0            auprc     0.080049
 2    train      0            brier     0.130385
 3    train      0         loss_bce     0.428469
 4    train      0  specificity_0.5     0.912230
 ..     ...    ...              ...          ...
 155    val      9         loss_bce     0.230662
 156    val      9  specificity_0.5     1.000000
 157    val      9    precision_0.5     0.000000
 158    val      9       recall_0.5     0.000000
 159    val      9             loss     0.230662
 
 [160 rows x 4 columns]}