In [1]:
# Import py file to jupyter
import os, sys
# Root of the fast-coref checkout. Set the FAST_COREF_HOME environment variable to run
# this notebook on another machine; without it the original location is used.
PROJECT_HOME = os.environ.get(
    "FAST_COREF_HOME",
    "/home/yuxiangliao/PhD/workspace/git_clone_repos/fast-coref",
)
PROJECT_SRC = PROJECT_HOME + "/src"

# Make the project's source directory importable from the notebook.
# Guarded so that re-running this cell does not keep appending duplicate entries.
_project_src_abs = os.path.abspath(PROJECT_SRC)
if _project_src_abs not in sys.path:
    sys.path.append(_project_src_abs)

# Inference Model

In [2]:
# YAML configuration kept as a string and parsed with OmegaConf in a later cell.
# The `${...}` entries are OmegaConf interpolations that refer to other keys of this config.
# NOTE(review): the values written as `None` (model_name, max_ents, eval_max_ents) are plain
# strings in YAML, not nulls — confirm what the consuming code expects before relying on them.
yamlStr_inference = """
metrics:
- MUC
- Bcub
- CEAFE
keep_singletons: true
seed: 45
train: true
eval_all: false
use_wandb: true
paths:
  resource_dir: ${infra.work_dir}/../../coref_resources
  base_data_dir: ${paths.resource_dir}/data
  conll_scorer: ${paths.resource_dir}/reference-coreference-scorers/scorer.pl
  base_model_dir: ${infra.work_dir}/../models
  model_dir: .//../models/coref_ontonotes_0eee8b4bcf686c0970cfa299fe9790f0
  best_model_dir: .//../models/coref_ontonotes_0eee8b4bcf686c0970cfa299fe9790f0/best
  model_filename: model.pth
  model_name: None
  model_name_prefix: coref_
  model_path: /share/data/speech/shtoshni/research/fast-coref/models/coref_ontonotes_0eee8b4bcf686c0970cfa299fe9790f0/model.pth
  best_model_path: /share/data/speech/shtoshni/research/fast-coref/models/coref_ontonotes_0eee8b4bcf686c0970cfa299fe9790f0/best/model.pth
  doc_encoder_dirname: doc_encoder
datasets:
  ontonotes:
    name: OntoNotes
    cluster_threshold: 2
    canonical_cluster_threshold: 2
    targeted_eval: false
    num_train_docs: 2802
    num_dev_docs: 343
    num_test_docs: 348
    has_conll: true
    singleton_file: ontonotes/ment_singletons_longformer_speaker/60.jsonlines
model:
  doc_encoder:
    transformer:
      name: longformer
      model_size: large
      model_str: allenai/longformer-large-4096
      max_encoder_segment_len: 4096
      max_segment_len: 4096
    chunking: independent
    finetune: true
    add_speaker_tokens: true
    speaker_start: '[SPEAKER_START]'
    speaker_end: '[SPEAKER_END]'
  memory:
    mem_type:
      name: unbounded
      max_ents: None
      eval_max_ents: None
    emb_size: 20
    mlp_size: 3000
    mlp_depth: 1
    sim_func: hadamard
    entity_rep: wt_avg
    num_feats: 2
  mention_params:
    max_span_width: 20
    ment_emb: attn
    use_gold_ments: false
    use_topk: false
    top_span_ratio: 0.4
    emb_size: 20
    mlp_size: 3000
    mlp_depth: 1
    ment_emb_to_size_factor:
      attn: 3
      endpoint: 2
      max: 1
  metadata_params:
    use_genre_feature: false
    default_genre: nw
    genres:
    - bc
    - bn
    - mz
    - nw
    - pt
    - tc
    - wb
optimizer:
  init_lr: 0.0003
  fine_tune_lr: 1.0e-05
  max_gradient_norm: 1.0
  lr_decay: linear
trainer:
  dropout_rate: 0.3
  label_smoothing_wt: 0.1
  ment_loss: all
  normalize_loss: false
  max_evals: 20
  to_save_model: true
  log_frequency: 250
  patience: 10
  eval_per_k_steps: 5000
  num_training_steps: 100000
infra:
  is_local: false
  job_time: 14280
  job_id: 72519194
  work_dir: ./
"""

In [3]:
import torch
from os import path
# from model.utils import action_sequences_to_clusters
from model.entity_ranking_model import EntityRankingModel
# from inference.tokenize_doc import tokenize_and_segment_doc, basic_tokenize_doc
from omegaconf import OmegaConf
from transformers import AutoModel, AutoTokenizer

# Parse the YAML string into an OmegaConf config object.
# NOTE(review): `inference_config` is not referenced by any later cell visible in this
# notebook; `Inference` builds its config from the checkpoint's "config" entry instead.
inference_config = OmegaConf.create(yamlStr_inference)

In [4]:
import torch
from data_processing.utils import split_into_segments, get_sentence_map


class DocumentState:
    """Accumulator for everything produced while tokenizing and segmenting one document.

    Every field starts as an empty list and is filled in by the tokenization
    helpers; ``finalize`` then packages the result as a plain dict.
    """

    def __init__(self):
        # One flag per subtoken: True on the last subtoken of a sentence.
        self.sentence_end = []
        # One flag per subtoken: True on the last subtoken of a word.
        self.token_end = []
        # The input words, in document order (both lists receive the same words).
        self.orig_tokens = []
        self.tokens = []
        # Subtoken ids of the whole document, before segmentation.
        self.subtokens = []
        # Subtoken ids grouped per segment.
        # NOTE(review): expected to be filled by split_into_segments, which is not visible here.
        self.segments = []
        # For each subtoken, the index of the word it came from.
        self.subtoken_map = []
        # Same mapping as subtoken_map, grouped per segment (see note on `segments`).
        self.segment_subtoken_map = []
        self.sentence_map = []
        # One (1, len + 2) tensor per segment, set during post-processing.
        self.tensorized_sent = []
        # Length of each segment, set during post-processing.
        self.sent_len_list = []

    def finalize(self):
        """Validate the segmented document and return it as a dict.

        Returns:
            dict with keys ``orig_tokens``, ``sentences``, ``sent_len_list``,
            ``tensorized_sent``, ``sentence_map`` (a tensor) and ``subtoken_map``
            (the per-segment maps flattened into one list).

        Raises:
            AssertionError: if the flattened segments and the flattened
                per-segment subtoken maps differ in length.
        """
        flat_subtoken_map = flatten(self.segment_subtoken_map)
        subtoken_count = len(flatten(self.segments))
        assert subtoken_count == len(flat_subtoken_map), (
            subtoken_count,
            len(flat_subtoken_map),
        )

        sentence_map = torch.tensor(get_sentence_map(self.segments, self.sentence_end))
        return dict(
            orig_tokens=self.orig_tokens,
            sentences=self.segments,
            sent_len_list=self.sent_len_list,
            tensorized_sent=self.tensorized_sent,
            sentence_map=sentence_map,
            subtoken_map=flat_subtoken_map,
        )


def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one new list (one level deep)."""
    merged = []
    for group in l:
        merged.extend(group)
    return merged


def get_tokenized_doc(doc, subword_tokenizer):
    """Run subword tokenization over a document that is already split into sentences and words.

    Args:
        doc: Iterable of sentences, each an iterable of word strings.
        subword_tokenizer: Object exposing ``tokenize(str)`` and
            ``convert_tokens_to_ids(tokens)``.

    Returns:
        DocumentState whose ``orig_tokens``/``tokens`` hold one entry per word and whose
        ``subtokens``, ``subtoken_map``, ``sentence_end`` and ``token_end`` hold one entry
        per subtoken.
    """
    document_state = DocumentState()

    word_idx = -1
    for sentence in doc:
        for word in sentence:
            document_state.orig_tokens.append(word)
            document_state.tokens.append(word)
            word_idx += 1

            # The word is tokenized with a leading space
            # (presumably so byte-level BPE tokenizers treat it as word-initial — confirm).
            subtokens = subword_tokenizer.convert_tokens_to_ids(
                subword_tokenizer.tokenize(" " + word)
            )
            if not subtokens:
                # A word may produce no subtokens at all (e.g. whitespace-only input with
                # some tokenizers). It keeps its slot in orig_tokens so word indices stay
                # valid, but it must not add a token_end flag: token_end has to stay
                # parallel to the other per-subtoken lists.
                continue

            document_state.token_end += [False] * (len(subtokens) - 1) + [True]
            for subtoken in subtokens:
                document_state.subtokens.append(subtoken)
                document_state.sentence_end.append(False)
                document_state.subtoken_map.append(word_idx)

        # Mark the last subtoken seen so far as a sentence boundary. Guarded because an
        # empty leading sentence would otherwise index into an empty list.
        if document_state.sentence_end:
            document_state.sentence_end[-1] = True

    return document_state


def basic_tokenize_doc(doc_str, basic_tokenizer):
    """Split raw text into sentences of word strings.

    Args:
        doc_str: The raw document text.
        basic_tokenizer: Callable that returns an object with a ``sents`` iterable
            when applied to ``doc_str``; each sentence is an iterable of tokens.

    Returns:
        A list of sentences, each a list with ``str(token)`` for every token.
    """
    return [
        [str(token) for token in sentence]
        for sentence in basic_tokenizer(doc_str).sents
    ]


def tokenize_and_segment_doc(
    basic_tokenized_doc, subword_tokenizer, max_segment_len=4096
):
    """Subword-tokenize a word-tokenized document, then segment and tensorize it.

    Args:
        basic_tokenized_doc: List of sentences, each a list of word strings.
        subword_tokenizer: Tokenizer handed to both processing steps.
        max_segment_len: Forwarded to ``post_tokenization_processing``.

    Returns:
        The dict produced by ``post_tokenization_processing``.
    """
    return post_tokenization_processing(
        get_tokenized_doc(basic_tokenized_doc, subword_tokenizer),
        subword_tokenizer,
        max_segment_len=max_segment_len,
    )


def post_tokenization_processing(
    document_state: DocumentState, subword_tokenizer, max_segment_len=4096
):
    """Segment a tokenized document, tensorize each segment and finalize it.

    Args:
        document_state: State returned by ``get_tokenized_doc``; updated in place.
        subword_tokenizer: Supplies ``cls_token_id`` and ``sep_token_id``, which
            are wrapped around every segment.
        max_segment_len: Passed to ``split_into_segments``.

    Returns:
        The dict returned by ``document_state.finalize()``.
    """
    split_into_segments(
        document_state,
        max_segment_len,
        document_state.sentence_end,
        document_state.token_end,
    )

    segments = document_state.segments
    document_state.sent_len_list = list(map(len, segments))
    document_state.segments_indices = segments

    # Each segment becomes its own (1, len + 2) tensor. Coreference is streamed one
    # window at a time, so the segments are never padded to a common length.
    cls_id = subword_tokenizer.cls_token_id
    sep_id = subword_tokenizer.sep_token_id
    tensorized = []
    for segment in segments:
        tensorized.append(torch.tensor([cls_id] + segment + [sep_id]).unsqueeze(0))
    document_state.tensorized_sent = tensorized

    return document_state.finalize()

In [5]:
def action_sequences_to_clusters(actions, mentions):
    """Replay per-mention actions and return the clusters they build.

    ``actions`` and ``mentions`` are matched positionally. Each action is a
    ``(cell_idx, action_type)`` pair:

    * ``"c"`` - add the mention to the cluster currently held in cell ``cell_idx``.
    * ``"o"`` - overwrite cell ``cell_idx`` with a new cluster holding only this
      mention; a cluster already in that cell is saved to the output first.
    * ``"n"`` - save the mention straight away as a one-element cluster.

    Any other action type is ignored.

    Example:
        mention [17, 17] with action (2, "o") gives cells {0: [[0, 14]], 1: [[5, 5]], 2: [[17, 17]]};
        mention [19, 19] with action (1, "c") then gives {0: [[0, 14]], 1: [[5, 5], [19, 19]], 2: [[17, 17]]}.

    Returns:
        List of clusters (lists of mentions): clusters saved during the replay come
        first, followed by those still held in cells.

    Raises:
        KeyError: if a ``"c"`` action targets a cell that holds no cluster.
    """
    finished = []
    open_cells = {}

    for mention, action in zip(mentions, actions):
        cell_idx, action_type = action

        if action_type == "n":
            finished.append([mention])
            continue

        if action_type == "c":
            open_cells[cell_idx].append(mention)
            continue

        if action_type == "o":
            if cell_idx in open_cells:
                # The cell is being reused: keep the cluster it held so far.
                finished.append(open_cells[cell_idx])
            open_cells[cell_idx] = [mention]

    # Whatever is still sitting in a cell at the end is a cluster too.
    finished.extend(open_cells.values())
    return finished

In [13]:
class Inference:
    """Wrap a trained ``EntityRankingModel`` for single-document coreference inference.

    The checkpoint supplies both the weights and the configuration used to build
    the model. The document encoder and its tokenizer may be replaced by a
    separately saved Hugging Face model.
    """

    def __init__(self, model_path, encoder_name=None):
        """Load the checkpoint, build the model and switch it to eval mode.

        Args:
            model_path: Directory containing ``model.pth``.
            encoder_name: Optional Hugging Face model name or directory. When given,
                it overrides ``config.model.doc_encoder.transformer.model_str`` and is
                used as the source of the encoder and tokenizer.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # spaCy pipeline for raw-string input; loaded on first use and then reused.
        self._basic_tokenizer = None

        # Load model
        # NOTE(security): torch.load unpickles the file, so only load checkpoints
        # that come from a trusted source.
        checkpoint = torch.load(
            path.join(model_path, "model.pth"), map_location=self.device
        )
        self.config = OmegaConf.create(checkpoint["config"])
        if encoder_name is not None:
            self.config.model.doc_encoder.transformer.model_str = encoder_name
        self.model = EntityRankingModel(self.config.model, self.config.trainer)
        self._load_model(checkpoint, model_path, encoder_name=encoder_name)

        self.max_segment_len = self.config.model.doc_encoder.transformer.max_segment_len
        self.tokenizer = self.model.mention_proposer.doc_encoder.tokenizer

    def _load_model(self, checkpoint, model_path, encoder_name=None):
        """Load weights into ``self.model``, move it to ``self.device`` and set eval mode.

        Args:
            checkpoint: Mapping whose ``"model"`` entry is the state dict.
            model_path: Directory searched for the saved encoder when
                ``encoder_name`` is None.
            encoder_name: Optional encoder/tokenizer source that takes precedence
                over the directory inside ``model_path``.
        """
        # strict=False tolerates keys that are missing from or unexpected in the state dict.
        self.model.load_state_dict(checkpoint["model"], strict=False)

        if self.config.model.doc_encoder.finetune:
            # Load the document encoder params if encoder is finetuned
            if encoder_name is None:
                doc_encoder_dir = path.join(
                    model_path, self.config.paths.doc_encoder_dirname
                )
            else:
                doc_encoder_dir = encoder_name
            # Load the encoder
            print(f"Loading [{doc_encoder_dir}] as mention_proposer.doc_encoder encoder and tokenizer")
            self.model.mention_proposer.doc_encoder.lm_encoder = (
                AutoModel.from_pretrained(
                    pretrained_model_name_or_path=doc_encoder_dir
                )
            )
            self.model.mention_proposer.doc_encoder.tokenizer = (
                AutoTokenizer.from_pretrained(
                    pretrained_model_name_or_path=doc_encoder_dir
                )
            )

        # Device placement applies to every model, not only those with a fine-tuned
        # encoder. It runs after the encoder swap so the freshly loaded encoder is
        # moved as well.
        self.model.to(self.device)

        self.model.eval()

    def _get_basic_tokenizer(self):
        """Return the spaCy pipeline used for raw strings, loading it on first use."""
        if getattr(self, "_basic_tokenizer", None) is None:
            import spacy

            self._basic_tokenizer = spacy.load("en_core_web_md")
        return self._basic_tokenizer

    @torch.no_grad()
    def perform_coreference(self, document):
        """Run coreference resolution on one document.

        Args:
            document: One of

                * ``list`` - sentences, each a list of word strings (already word-tokenized);
                * ``str``  - raw text, split into sentences and words with spaCy
                  ``en_core_web_md`` first;
                * ``dict`` - an already tokenized and segmented document, used as-is.

        Returns:
            dict with the keys

            * ``tokenized_doc`` - the tokenized document that was fed to the model;
            * ``clusters`` - per cluster, a list of ``((start, end), text)`` where
              ``start``/``end`` are subtoken indices and ``text`` is the mention's words
              joined by spaces;
            * ``subtoken_idx_clusters`` - the clusters as ``[start, end]`` subtoken spans;
            * ``actions`` - the predicted ``(cell_idx, action_type)`` pairs;
            * ``mentions`` - the predicted mention spans;
            * ``coref_group_list`` - per cluster, per mention, the list of
              ``orig_tokens`` indices the mention covers.

        Raises:
            ValueError: if ``document`` is not a list, str or dict.
        """
        if isinstance(document, list):
            # Document is already tokenized
            tokenized_doc = tokenize_and_segment_doc(
                document, self.tokenizer, max_segment_len=self.max_segment_len
            )
        elif isinstance(document, str):
            # Raw document string. First perform basic tokenization before further tokenization.
            basic_tokenizer = self._get_basic_tokenizer()
            basic_tokenized_doc = basic_tokenize_doc(document, basic_tokenizer)
            print(f"subword_tokenizer:{self.tokenizer}")
            tokenized_doc = tokenize_and_segment_doc(
                basic_tokenized_doc,
                self.tokenizer,
                max_segment_len=self.max_segment_len,
            )
        elif isinstance(document, dict):
            tokenized_doc = document
        else:
            raise ValueError(
                f"Unsupported document type {type(document).__name__}; "
                "expected list, str or dict"
            )

        # pred_mentions holds [start, end] index pairs into tokenized_doc["subtoken_map"], e.g.
        #   [[0, 14], [5, 5], [17, 17], [19, 19], ...]
        # pred_actions holds one (cell_idx, action_type) pair per mention, e.g.
        #   [(0, 'o'), (1, 'o'), (2, 'o'), (1, 'c'), ...]
        # where 'c' adds the mention to the cluster in that cell, 'o' starts a new cluster
        # in that cell (saving any cluster already there) and 'n' saves the mention
        # directly as a one-mention cluster.
        pred_mentions, _mention_scores, _gt_actions, pred_actions = self.model(tokenized_doc)

        # e.g. [[[0, 14]], [[5, 5], [19, 19], [35, 35], [48, 53]], [[17, 17], [25, 25]], ...]
        idx_clusters = action_sequences_to_clusters(pred_actions, pred_mentions)

        # The values in idx_clusters/pred_mentions are indices into tokenized_doc["subtoken_map"],
        # while the values in tokenized_doc["subtoken_map"] are indices into tokenized_doc["orig_tokens"].
        subtoken_map = tokenized_doc["subtoken_map"]
        orig_tokens = tokenized_doc["orig_tokens"]
        clusters = []
        coref_group_list = []
        for idx_cluster in idx_clusters:
            cur_cluster = []
            coref_group = []
            for (ment_start, ment_end) in idx_cluster:
                # Convert the inclusive subtoken span into a half-open word range.
                word_start = subtoken_map[ment_start]
                word_stop = subtoken_map[ment_end] + 1
                coref_group.append(list(range(word_start, word_stop)))
                cur_cluster.append(
                    (
                        (ment_start, ment_end),
                        " ".join(orig_tokens[word_start:word_stop]),
                    )
                )
            coref_group_list.append(coref_group)
            clusters.append(cur_cluster)

        return {
            "tokenized_doc": tokenized_doc,
            "clusters": clusters,
            "subtoken_idx_clusters": idx_clusters,
            "actions": pred_actions,
            "mentions": pred_mentions,
            "coref_group_list": coref_group_list,
        }


In [7]:
import pandas as pd

# Report dump in JSON Lines form: one record per line.
REPORT_PATH = "/home/yuxiangliao/PhD/data/mimic_cxr_reports_core.json"
df = pd.read_json(REPORT_PATH, lines=True, orient="records")
print(df)

# Pull each column of interest out as a plain Python list.
# All lists share the row order of `df`, so the same position refers to the same report.
(
    pid_list,
    sid_list,
    findings_list,
    impression_list,
    pfi_list,
    fai_list,
) = (
    df[column].to_list()
    for column in (
        "pid",
        "sid",
        "findings",
        "impression",
        "provisional_findings_impression",
        "findings_and_impression",
    )
)

DATA_SIZE = len(sid_list)

2022-08-26 22:18:56,463 - Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-08-26 22:18:56,463 - NumExpr defaulting to 8 threads.


              pid        sid  \
0       p10000032  s50414267   
1       p10000032  s53189527   
2       p10000032  s53911762   
3       p10000032  s56699142   
4       p10000764  s57375967   
...           ...        ...   
227830  p19999442  s58708861   
227831  p19999733  s57132437   
227832  p19999987  s55368167   
227833  p19999987  s58621812   
227834  p19999987  s58971208   

                                                 findings  \
0       There is no focal consolidation, pleural effus...   
1       The cardiac, mediastinal and hilar contours ar...   
2       Single frontal view of the chest provided. \n ...   
3       The lungs are clear of focal consolidation, pl...   
4       PA and lateral views of the chest provided.   ...   
...                                                   ...   
227830  ET tube ends 4.7 cm above the carina.  NG tube...   
227831  The lungs are clear, and the cardiomediastinal...   
227832  There has been interval extubation and improve...   
22783

In [15]:
# Directory that holds the trained coreference checkpoint (`model.pth`).
model_path = "/home/yuxiangliao/PhD/workspace/git_clone_repos/fast-coref/models/joint_best"
# Passed as `encoder_name`: the document encoder and its tokenizer are loaded from this directory.
doc_path = "/home/yuxiangliao/PhD/workspace/git_clone_repos/fast-coref/models/longformer_coreference_joint"
model = Inference(model_path, doc_path)

# Alternative input: a raw text file joined into a single string.
# doc = " ".join(open("/home/shtoshni/Research/coref_resources/data/ccarol/doc.txt").readlines())
# Use the "impression" text of the report whose `sid` is s57195248.
doc = impression_list[sid_list.index("s57195248")]
output_dict = model.perform_coreference(doc)
print(f"\noutput_dict:{output_dict}")
print(f"\nclusters:{output_dict['clusters']}")
# Example of the `clusters` structure, taken from a different sample text
# (each entry is ((start_subtoken, end_subtoken), mention text)):
# [[((0, 14), 'The practice of referring to Voldemort as " He Who Must Not Be Named "')], 
# [((5, 5), 'Voldemort'), ((19, 19), 'he'), ((35, 35), 'his'), ((48, 53), 'the Dark Lord ’s')], 
# [((17, 17), 'begun'), ((25, 25), 'This')], 
# [((32, 32), 'Dumbledore'), ((60, 60), 'he')], 
# [((35, 37), 'his proper name'), ((43, 44), 'the name'), ((48, 54), 'the Dark Lord ’s name')], 
# [((47, 47), 'saying'), ((65, 65), 'it')]]

Loading [/home/yuxiangliao/PhD/workspace/git_clone_repos/fast-coref/models/longformer_coreference_joint] as mention_proposer.doc_encoder encoder and tokenizer
subword_tokenizer:PreTrainedTokenizerFast(name_or_path='/home/yuxiangliao/PhD/workspace/git_clone_repos/fast-coref/models/longformer_coreference_joint', vocab_size=50265, model_max_len=4096, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False), 'additional_special_tokens': ['[SPEAKER_START]', '[SPEAKER_END]']})

output_dict:{'tokenized_doc': {'orig_tokens': ['1', '.', ' ', 'Increased', 'diffuse', 'interstitial', 'abnormality', ',', 'likely', 'reflecting', 'worsening', 'mild', 'interstitial', 'pulmonary', 'edema', '.', '\n ', '2', '.', ' ', 'Decreased', 'bibasilar', 'minimal', 'atelecta

In [11]:
# Show only the clusters that link two or more mentions; singletons are skipped.
for cluster in output_dict["clusters"]:
    if len(cluster) <= 1:
        continue
    print(cluster)

In [12]:
# Print every predicted cluster, singletons included.
# Each mention is shown as ((start_subtoken, end_subtoken), mention text).
for cluster in output_dict["clusters"]:
  print(cluster)

[((13, 19), 'worsening mild interstitial pulmonary edema')]
[((30, 37), 'bibasilar minimal atelectasis')]
[((46, 52), 'No evidence of pneumothorax')]
[((49, 52), 'pneumothorax')]
[((58, 66), 'new right IJ central venous catheter')]


In [16]:
# Print the word-level view of every cluster: one inner list per mention, holding the
# indices into tokenized_doc["orig_tokens"] that the mention covers.
for cluster in output_dict["coref_group_list"]:
  print(cluster)

[[10, 11, 12, 13, 14]]
[[21, 22, 23]]
[[29, 30, 31, 32]]
[[32]]
[[38, 39, 40, 41, 42, 43]]


# Command

Basic command on mylinux

In [None]:
# Shell command, not Python: run it in a terminal from the directory that contains main.py,
# or prefix it with `!` to run it from the notebook. Overrides are passed as key=value arguments.
python main.py infra=mylinux experiment=mylinux_test paths.model_name=i2b2_test

Specify the model_name and continue training on that model (for when we update the trainer config and want to continue from that checkpoint)

In [None]:
# Shell command, not Python (run in a terminal or prefix with `!`).
# `paths.model_name` names the model to resume; `continue_training=True` requests the resume.
python main.py infra=mylinux experiment=mylinux_test paths.model_name=joint_mylinux_quizbowl continue_training=True

Train on arcca

12h = 42500

In [None]:
# `!` runs this as a shell command from the notebook. The two *_OFFLINE=1 variables are set
# for this command only (presumably to keep the Hugging Face libraries off the network — confirm).
!HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
python3 /scratch/c.c21051562/workspace/fast-coref/src/main.py infra=arcca experiment=arcca_test infra.job_time=50000 infra.job_id=1001

Eval on mylinux

In [None]:
# Shell command, not Python (run in a terminal or prefix with `!`).
# Evaluation only: `train=False`, with `paths.model_dir` pointing at an existing model directory.
python main.py infra=mylinux experiment=mylinux_test train=False paths.model_dir=/home/yuxiangliao/PhD/workspace/git_clone_repos/fast-coref/models/joint_best

Eval scorer.pl

In [None]:
# Shell command, not Python (run in a terminal or prefix with `!`).
# NOTE(review): arguments appear to be <metric> <key file> <response file> <name>: metric `muc`,
# the i2b2 dev.conll as key and the model's dev.conll output as response — confirm against the scorer's usage text.
/home/yuxiangliao/PhD/workspace/git_clone_repos/fast-coref/coref_resources/reference-coreference-scorers/scorer.pl muc /home/yuxiangliao/PhD/workspace/git_clone_repos/fast-coref/coref_resources/data/i2b2/conll/0/dev.conll /home/yuxiangliao/PhD/workspace/git_clone_repos/fast-coref/models/joint_best/i2b2/dev.conll none