# Train MOTOR

This tutorial walks through the various steps to train a MOTOR model.

Training MOTOR is a four step process:

- Training a tokenizer
- Prefitting MOTOR
- Preparing batches
- Training the model

In [1]:
import shutil
import os

# TARGET_DIR = '/data/processed_datasets/processed_datasets/zj2398/femr'

TARGET_DIR = '/user/zj2398/cache/motor'

if os.path.exists(TARGET_DIR):
    shutil.rmtree(TARGET_DIR)

os.mkdir(TARGET_DIR)

In [2]:
import meds_reader
import femr.splits

# First, we want to split our dataset into train, valid, and test
# We do this by calling our split functionality twice

# 
# database = meds_reader.SubjectDatabase('/data/processed_datasets/processed_datasets/ehr_foundation_data/ohdsi_cumc_deid/ohdsi_cumc_deid_2023q4r3_v3_mapped/post_transform_meds_reader')
database = meds_reader.SubjectDatabase('/user/zj2398/cache/hf_ehr/mimic/meds_v0.6_reader')

# use hash split to split the database into train and test (ratio = frac_test)
main_split = femr.splits.generate_hash_split(list(database), 97, frac_test=0.15)

os.mkdir(os.path.join(TARGET_DIR, 'motor_model'))
# Note that we want to save this to the target directory since this is important information

main_split.save_to_csv(os.path.join(TARGET_DIR, "motor_model", "main_split.csv"))

train_split = femr.splits.generate_hash_split(main_split.train_subject_ids, 87, frac_test=0.15)

main_database = database.filter(main_split.train_subject_ids)
train_database = main_database.filter(train_split.train_subject_ids)
val_database = main_database.filter(train_split.test_subject_ids)


In [None]:
import femr.models.tokenizer
import pickle

# First, we need to train a tokenizer
# Note, we need to use a hierarchical tokenizer for MOTOR

with open('input/ontology.pkl', 'rb') as f:
    ontology = pickle.load(f)


# with open('/user/zj2398/cache/motor/input/ontology.pkl', 'rb') as f:
#     ontology = pickle.load(f)

# NOTE: A vocab size of 128 is probably too low for a real model. 128 was chosen to make this tutorial quick to run
# NOTE: Normally you would train the tokenizer on only the train database, but for such a tiny dataset that's not enough
tokenizer = femr.models.tokenizer.HierarchicalTokenizer.train(
    database, vocab_size=1024 * 16, ontology=ontology, min_fraction=1e-9) # Normally min_fraction should be set higher, to 1e-4, but need a small min fraction to get enough codes

# Save the tokenizer to the same directory as the model
tokenizer.save_pretrained(os.path.join(TARGET_DIR, "motor_model"))

KeyboardInterrupt: 

In [None]:
# dict1 = {"a": 1, "b": 2}
# dict1.add("a",2)
# print(dict1)

import meds_reader
database = meds_reader.SubjectDatabase('/user/zj2398/cache/hf_ehr/mimic/meds_v0.6_reader')
print(database.properties)

{'code': DataType(string), 'doses_per_24_hrs': DataType(int64), 'drg_mortality': DataType(int64), 'drg_severity': DataType(int64), 'emar_id': DataType(large_string), 'emar_seq': DataType(int64), 'frequency': DataType(large_string), 'hadm_id': DataType(int64), 'icustay_id': DataType(int64), 'insurance': DataType(large_string), 'language': DataType(large_string), 'link_order_id': DataType(int64), 'marital_status': DataType(large_string), 'numeric_value': DataType(float), 'order_id': DataType(int64), 'ordercategorydescription': DataType(large_string), 'poe_id': DataType(large_string), 'priority': DataType(large_string), 'race': DataType(large_string), 'route': DataType(large_string), 'statusdescription': DataType(large_string), 'text_value': DataType(large_string), 'time': TimestampType(timestamp[us]), 'unit': DataType(large_string)}


In [4]:

import femr.models.tasks
import pickle

# Second, we need to prefit the MOTOR model. This is necessary because piecewise exponential models are unstable without an initial fit
with open('input/ontology.pkl', 'rb') as f:
    ontology = pickle.load(f)
load_path = '/user/zj2398/cache/motor/motor_model/dictionary.msgpack'
tokenizer = femr.models.tokenizer.HierarchicalTokenizer.from_pretrained(pretrained_model_name_or_path=load_path,ontology=ontology)

motor_task = femr.models.tasks.MOTORTask.fit_pretraining_task_info(
    train_database, tokenizer, num_tasks=2048, num_bins=4, final_layer_size=32, min_fraction=1e-9)  # Normally min_fraction should be set higher, to 1e-4, but need a small min fraction to get enough codes

# It's recommended to save this with pickle to avoid recomputing since it's an expensive operation

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/user/zj2398/cache/motor/motor_model/dictionary.msgpack'. Use `repo_type` argument if needed.

In [5]:
import femr.models.processor
import femr.models.tasks

# Third, we need to create batches. 

processor = femr.models.processor.FEMRBatchProcessor(tokenizer, motor_task)

example_subject_id = list(train_database)[0]
example_subject = train_database[example_subject_id]

# We can do this one subject at a time
print("Convert a single subject")
example_batch = processor.collate([processor.convert_subject(example_subject, tensor_type='pt')])

print("Convert batches")
# But generally we want to convert entire datasets
train_batches = processor.convert_dataset(train_database, tokens_per_batch=32, num_proc=4)

print("Convert batches to pytorch")
# Convert our batches to pytorch tensors
train_batches.set_format("pt")
print("Done")

val_batches = processor.convert_dataset(val_database, tokens_per_batch=32, num_proc=4)
# Convert our batches to pytorch tensors
val_batches.set_format("pt")

Convert a single subject
Convert batches
Got batches 46


Generating train split: 46 examples [00:00, 658.70 examples/s]


Convert batches to pytorch
Done
Got batches 9


Generating train split: 9 examples [00:00, 607.23 examples/s]


In [6]:
import transformers

import femr.models.transformer

# Finally, given the batches, we can train CLMBR.
# We can use huggingface's trainer to do this.

transformer_config = femr.models.config.FEMRTransformerConfig(
    vocab_size=tokenizer.vocab_size, 
    is_hierarchical=True, 
    use_normed_ages=True,
    use_bias=False,
    hidden_act='swiglu',
    n_layers=2,
    hidden_size=64, 
    intermediate_size=64*2,
    n_heads=8,
)

config = femr.models.config.FEMRModelConfig.from_transformer_task_configs(transformer_config, motor_task.get_task_config())

model = femr.models.transformer.FEMRModel(config)

collator = processor.collate

trainer_config = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,

    output_dir='tmp_trainer',
    remove_unused_columns=False,
    num_train_epochs=4,

    eval_steps=20,
    eval_strategy="steps",

    logging_steps=20,
    logging_strategy='steps',

    prediction_loss_only=True,
)

trainer = transformers.Trainer(
    model=model,
    data_collator=processor.collate,
    train_dataset=train_batches,
    eval_dataset=val_batches,
    args=trainer_config,
)

trainer.train()

model.save_pretrained(os.path.join(TARGET_DIR, 'motor_model'))

AssertionError: Can only have one batch when collating