In [1]:
import os

import yaml
from omegaconf import OmegaConf

In [2]:
# Root of the entity-linking example. It is stored in the config as
# `project_dir`, which the "${project_dir}/..." paths in the other cells refer to.
# Set the ENTITY_LINKING_PROJECT_DIR environment variable to point the notebook
# at another checkout without editing this cell; an unset or empty variable
# falls back to the path this notebook was originally written against.
PROJECT_DIR = (
    os.environ.get("ENTITY_LINKING_PROJECT_DIR")
    or "/home/vadams/Projects/entity-linking-research/NeMo/examples/nlp/entity_linking"
)
# File name of the generated config; it is written to f"{PROJECT_DIR}/conf/{SAVE_NAME}".
SAVE_NAME = "augmented_medical_entity_linking_config.yaml"

In [3]:
# Dataset settings for the concept index (nested under `index.dataset`).
# "${...}" values are OmegaConf interpolations: they are stored as written and
# refer to other keys of the final, assembled config.
index_ds = OmegaConf.create(dict(
    name="umls",
    data_file="${project_dir}/data/umls_index_concepts.txt",
    max_seq_length="${model.max_seq_length}",  # follows the model's setting
    batch_size=128,
    shuffle=False,
    num_workers=2,
    pin_memory=False,
    drop_last=False,
))

In [4]:
# PCA settings (nested under `index.pca`). `output_dim` mirrors `index.dims`,
# and the pickle file name embeds both dimensions via interpolation.
# NOTE(review): confirm that input_dim=756 matches the encoder's embedding
# width -- BERT-base encoders typically emit 768-dimensional vectors.
pca = OmegaConf.create(dict(
    input_dim=756,
    output_dim="${index.dims}",
    sample_fraction=0.5,
    pca_save_name="${project_dir}/${index.pca.input_dim}_to_${index.pca.output_dim}_pca_model.pkl",
))

In [5]:
# Top-level `index` section: index parameters, the index dataset defined
# earlier, the save locations of the generated artifacts, and the PCA options.
index = OmegaConf.create(dict(
    dims=256,
    nlist=300,
    query_num_factor=20,
    index_save_name="${project_dir}/medical_entity_linking_index",
    index_batch_size=1000,
    dataset=index_ds,
    # Artifact paths, all relative to the project's data directory.
    idx_to_id="${project_dir}/data/idx_to_id.pkl",
    id_to_string="${project_dir}/data/id_to_string.pkl",
    concept_id_save_name="${project_dir}/data/concept_ids.pkl",
    embedding_save_name="${project_dir}/data/medical_concept_embeddings.hdf5",
    pca_embeddings_save_name="${project_dir}/data/medical_concept_reduced_${index.dims}dim_embeddings.hdf5",
    apply_pca=True,
    pca=pca,
))

In [6]:
# `model.language_model` section: only the pretrained model name is set;
# the config/checkpoint overrides are left as null.
language_model = OmegaConf.create(dict(
    pretrained_model_name="bert-base-uncased",
    config_file=None,
    config=None,
    lm_checkpoint=None,
))

In [7]:
# `model.tokenizer` section. The tokenizer name is an interpolation of the
# language model's `pretrained_model_name` ("bert-base-uncased" in this
# notebook), so the two cannot drift apart.
tokenizer = OmegaConf.create(dict(
    tokenizer_name="${model.language_model.pretrained_model_name}",
    vocab_file=None,
    tokenizer_model=None,
    do_lower_case=True,
))

In [8]:
# `model.train_ds` section: training pairs file plus loader options.
# Shuffling is enabled here, unlike the validation and index datasets.
train_ds = OmegaConf.create(dict(
    data_file="${project_dir}/data/augmented_umls_train_pairs.txt",
    max_seq_length="${model.max_seq_length}",  # follows the model's setting
    batch_size=128,
    shuffle=True,
    num_workers=2,
    pin_memory=False,
    drop_last=False,
))

In [9]:
# `model.validation_ds` section: same loader options as the training set,
# but reading the validation pairs file and without shuffling.
validation_ds = OmegaConf.create(dict(
    data_file="${project_dir}/data/augmented_umls_validation_pairs.txt",
    max_seq_length="${model.max_seq_length}",  # follows the model's setting
    batch_size=128,
    shuffle=False,
    num_workers=2,
    pin_memory=False,
    drop_last=False,
))

In [10]:
# Learning-rate schedule, nested under `model.optim.sched`.
# `warmup_steps` is null while `warmup_ratio` is set to 0.1.
sched = OmegaConf.create(dict(
    name="CosineAnnealing",
    warmup_steps=None,
    warmup_ratio=0.1,
    min_lr=0.0,
    last_epoch=-1,
))

# `model.optim` section: optimizer settings plus the schedule defined above.
optim = OmegaConf.create(dict(
    name="adam",
    lr=3e-5,
    weight_decay=0.0,
    sched=sched,
))

In [11]:
# Top-level `model` section, assembled from the sub-configs built in the
# earlier cells. `max_seq_length` is the value the dataset sections reference
# through "${model.max_seq_length}".
model = OmegaConf.create(dict(
    nemo_path="${project_dir}/full_umls_sap_bert_model.nemo",
    max_seq_length=128,
    language_model=language_model,
    tokenizer=tokenizer,
    train_ds=train_ds,
    validation_ds=validation_ds,
    optim=optim,
))

In [12]:
# Top-level `trainer` section. `checkpoint_callback` and `logger` are turned
# off here; the `exp_manager` section of this notebook enables its own
# checkpoint callback and TensorBoard logger instead.
trainer = OmegaConf.create(dict(
    gpus=2,
    num_nodes=1,
    max_epochs=1,
    max_steps=None,
    accumulate_grad_batches=1,
    precision=16,
    amp_level="O1",
    accelerator="ddp",
    gradient_clip_val=0.0,
    log_every_n_steps=1,
    val_check_interval=5000,
    checkpoint_callback=False,
    logger=False,
))

In [13]:
# Top-level `exp_manager` section: experiment directory and name, with the
# TensorBoard logger and checkpoint callback both enabled.
exp_manager = OmegaConf.create(dict(
    exp_dir="${project_dir}/medical_entity_linking_experiments",
    name="SelfAlignmentPretrainingUMLS",
    create_tensorboard_logger=True,
    create_checkpoint_callback=True,
))

In [14]:
# Top-level `hydra` section: `run.dir` is set to "." and
# `job_logging.root.handlers` is set to null. The nested mappings are plain
# dicts; OmegaConf.create converts them to nested configs.
hydra = OmegaConf.create(dict(
    run=dict(dir="."),
    job_logging=dict(
        root=dict(handlers=None),
    ),
))

In [15]:
# Root config. `project_dir` is the key every "${project_dir}/..."
# interpolation in the sub-sections refers to; the remaining keys attach the
# sections built in the earlier cells.
config = OmegaConf.create(dict(
    project_dir=PROJECT_DIR,
    name="SelfAlignmentPretrainingForMedicalEntityLinking",
    trainer=trainer,
    model=model,
    index=index,
    exp_manager=exp_manager,
    hydra=hydra,
))

In [16]:
# Render the config to YAML text once, then show it and save the same text.
# `config` is deliberately NOT rebound to the rendered string, so it stays an
# OmegaConf object and this cell can be re-run safely.
config_yaml = OmegaConf.to_yaml(config)
print(config_yaml)

# Write the YAML text as-is. Passing the already-rendered string through
# yaml.dump would serialise it a second time, leaving the file with a single
# quoted string scalar instead of the config mapping. The `with` block also
# guarantees the file is flushed and closed before the success message.
config_path = f"{PROJECT_DIR}/conf/{SAVE_NAME}"
with open(config_path, "w", encoding="utf-8") as config_file:
    config_file.write(config_yaml)

print("\nConfig file updated and saved.\n")

project_dir: /home/vadams/Projects/entity-linking-research/NeMo/examples/nlp/entity_linking
name: SelfAlignmentPretrainingForMedicalEntityLinking
trainer:
  gpus: 2
  num_nodes: 1
  max_epochs: 1
  max_steps: null
  accumulate_grad_batches: 1
  precision: 16
  amp_level: O1
  accelerator: ddp
  gradient_clip_val: 0.0
  log_every_n_steps: 1
  val_check_interval: 5000
  checkpoint_callback: false
  logger: false
model:
  nemo_path: ${project_dir}/full_umls_sap_bert_model.nemo
  max_seq_length: 128
  language_model:
    pretrained_model_name: bert-base-uncased
    config_file: null
    config: null
    lm_checkpoint: null
  tokenizer:
    tokenizer_name: ${model.language_model.pretrained_model_name}
    vocab_file: null
    tokenizer_model: null
    do_lower_case: true
  train_ds:
    data_file: ${project_dir}/data/augmented_umls_train_pairs.txt
    max_seq_length: ${model.max_seq_length}
    batch_size: 128
    shuffle: true
    num_workers: 2
    pin_memory: false
    drop_last: false
 