In [None]:
from datetime import datetime
import pytz
import os

# Notebook configuration. Every path below is a placeholder to be replaced before running.

# Name of this run (not referenced by the other cells visible in this notebook).
run_name = "clinical-xlm-roberta-crf-strict"

# Checkpoint from which both the tokenizer and the CRF model are loaded.
model_checkpoint = "checkpoint_path"
# Number of tag classes; the post-processing cells work with LABEL_0, LABEL_1 and LABEL_2.
num_labels = 3

data_path = "path_to_dataset_split_into_sentences"  # directory with the .txt files to annotate
original_texts_path = "path_to_original_symptemist_texts"  # directory with the source reports
offsets_path = "path_to_sentence_offsets"  # CSV with `file_name` and `start_offset` columns
output_root_path = "output_path"  # plain string: the former f-prefix had no placeholders
output_per_file_path = f"{output_root_path}/per_file" # inferred labels by file

In [None]:
# If running in Colab: mount Google Drive so the configured paths can point into it.
# Outside Colab the `google.colab` package cannot be imported, so the mount is skipped
# instead of aborting "Run All" with an ImportError.
try:
  from google.colab import drive
except ImportError:
  drive = None

if drive is not None:
  drive.mount('/content/drive')

In [None]:
%%capture
# transformers and accelerate are installed straight from their GitHub repositories with no
# version, tag or commit pinned, so the code that gets installed depends on when this cell runs.
!pip install git+https://github.com/huggingface/transformers.git
!pip install git+https://github.com/huggingface/accelerate.git
# pytorch-crf is the package behind the `torchcrf` import used by the model cell.
!pip install pytorch-crf

In [None]:
from transformers import AutoTokenizer, pipeline, TokenClassificationPipeline

# Load the tokenizer from the same checkpoint as the model. The pipeline below is created with a
# `stride`, and its `_sanitize_parameters` raises unless `tokenizer.is_fast` is true.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

### Model

In [None]:
import torch
from torch import nn
from torchcrf import CRF
from transformers import XLMRobertaPreTrainedModel, XLMRobertaModel, TrainingArguments, Trainer
from transformers.modeling_outputs import  TokenClassifierOutput

class XLMRobertaWithCRF(XLMRobertaPreTrainedModel):
    """XLM-RoBERTa encoder followed by dropout, a linear emission layer and a CRF.

    Unlike the stock token-classification heads, ``forward`` does not return per-label
    logits: the first prediction element is the tag-id sequence decoded by the CRF.
    ``forward`` always returns a plain tuple, whatever the value of ``return_dict``.
    """

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.crf = CRF(num_tags=self.num_labels, batch_first=True)
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``. Positions labelled ``-100`` are masked out of the CRF likelihood (except the first position of
            every sequence, see below).

        Returns:
            A tuple ``(loss, tags, *extra)`` when ``labels`` is given, otherwise ``(tags, *extra)``. ``tags`` is an
            int32 tensor built from ``self.crf.decode`` (called without a mask, so every position is decoded) and
            ``extra`` is ``outputs[2:]`` of the encoder.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Positional access works for both the ModelOutput object and the plain tuple returned when
        # return_dict is False; `outputs.last_hidden_state` would raise AttributeError on a tuple.
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # Ignore positions labelled -100 in the likelihood. The first timestep is forced on
            # ("shameless hack" in the original code), see
            # https://github.com/kmkurn/pytorch-crf/issues/41
            mask = labels != -100
            mask[:, 0] = True
            # -100 is not a valid tag index, so masked positions get the placeholder tag 0.
            hacked_labels = labels.masked_fill(labels == -100, 0)
            log_likelihood = self.crf(logits, hacked_labels, mask=mask)
            loss = 0 - log_likelihood

        # Decoding is identical with and without labels. No mask is passed, so every sequence
        # yields one tag per position and the nested list is rectangular.
        tags = torch.tensor(self.crf.decode(logits), dtype=torch.int32)

        # NOTE(review): `outputs[2:]` mirrors the stock heads' tuple layout; confirm it selects the
        # intended extras (hidden states / attentions) when the encoder returns a ModelOutput.
        output = (tags,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

In [None]:
# Load the fine-tuned CRF model; the label count comes from the configuration cell instead of a
# duplicated literal, so the two cannot drift apart.
model = XLMRobertaWithCRF.from_pretrained(model_checkpoint, num_labels=num_labels)

In [None]:
import types
import warnings
from typing import List, Optional, Tuple, Union
from transformers import pipeline
import numpy as np

from transformers.pipelines.base import ChunkPipeline
from transformers.pipelines.token_classification import TokenClassificationArgumentHandler, AggregationStrategy
from transformers.models.bert.tokenization_bert import BasicTokenizer

In [None]:
# Having a custom model means we cannot use the regular NER pipeline, so we will modify it
class CustomTokenClassificationPipeline(ChunkPipeline):
    """
    Token-classification (NER) pipeline for a model whose first output already holds one decoded tag id per token.

    This is a modified copy of the `transformers` token-classification pipeline. `_forward` stores the model's first
    output under the `"logits"` key, and `postprocess` / `aggregate` use each per-token value directly as a key into
    `model.config.id2label`; no softmax or argmax is applied. As a consequence:

    - the `"score"` field of the returned entities is the tag id itself (or the mean of the tag ids of the grouped
      tokens when entities are grouped), not a probability;
    - only the `"none"` and `"simple"` aggregation strategies are supported. `"first"`, `"max"` and `"average"`
      need a vector of per-label scores for every token and are rejected in `_sanitize_parameters`.
    """

    default_input_names = "sequences"

    def __init__(self, args_parser=TokenClassificationArgumentHandler(), *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        self._args_parser = args_parser

    def _sanitize_parameters(
        self,
        ignore_labels=None,
        grouped_entities: Optional[bool] = None,
        ignore_subwords: Optional[bool] = None,
        aggregation_strategy: Optional[AggregationStrategy] = None,
        offset_mapping: Optional[List[Tuple[int, int]]] = None,
        stride: Optional[int] = None,
    ):
        """
        Split the user-facing keyword arguments into `(preprocess, forward, postprocess)` parameter dicts.

        Raises:
            ValueError: if the resolved aggregation strategy is `first`, `max` or `average`; if `stride` is not
                smaller than `tokenizer.model_max_length`; if `stride` is combined with the `none` strategy; or if
                `stride` is used with a slow tokenizer.
        """
        preprocess_params = {}
        if offset_mapping is not None:
            preprocess_params["offset_mapping"] = offset_mapping

        postprocess_params = {}
        if grouped_entities is not None or ignore_subwords is not None:
            if grouped_entities and ignore_subwords:
                aggregation_strategy = AggregationStrategy.FIRST
            elif grouped_entities and not ignore_subwords:
                aggregation_strategy = AggregationStrategy.SIMPLE
            else:
                aggregation_strategy = AggregationStrategy.NONE

            if grouped_entities is not None:
                warnings.warn(
                    "`grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to"
                    f' `aggregation_strategy="{aggregation_strategy}"` instead.'
                )
            if ignore_subwords is not None:
                warnings.warn(
                    "`ignore_subwords` is deprecated and will be removed in version v5.0.0, defaulted to"
                    f' `aggregation_strategy="{aggregation_strategy}"` instead.'
                )

        if aggregation_strategy is not None:
            if isinstance(aggregation_strategy, str):
                aggregation_strategy = AggregationStrategy[aggregation_strategy.upper()]
            if aggregation_strategy in {
                AggregationStrategy.FIRST,
                AggregationStrategy.MAX,
                AggregationStrategy.AVERAGE,
            }:
                # `aggregate_word` calls `argmax()` on, and then indexes into, each token's "scores". Here that
                # value is a single tag id, so these strategies used to fail there with an opaque IndexError.
                # Reject them up front with an actionable message instead.
                raise ValueError(
                    f"`aggregation_strategy={aggregation_strategy}` is not supported by this pipeline: the model"
                    " returns one decoded tag id per token rather than per-label scores."
                    ' Please use "none" or "simple" instead.'
                )
            postprocess_params["aggregation_strategy"] = aggregation_strategy
        if ignore_labels is not None:
            postprocess_params["ignore_labels"] = ignore_labels
        if stride is not None:
            if stride >= self.tokenizer.model_max_length:
                raise ValueError(
                    "`stride` must be less than `tokenizer.model_max_length` (or even lower if the tokenizer adds special tokens)"
                )
            if aggregation_strategy == AggregationStrategy.NONE:
                raise ValueError(
                    "`stride` was provided to process all the text but `aggregation_strategy="
                    f'"{aggregation_strategy}"`, please select another one instead.'
                )
            else:
                if self.tokenizer.is_fast:
                    # Any stride (including 0) switches on overflow handling, so a long input is split into
                    # several chunks instead of being truncated to the first one.
                    tokenizer_params = {
                        "return_overflowing_tokens": True,
                        "padding": True,
                        "stride": stride,
                    }
                    preprocess_params["tokenizer_params"] = tokenizer_params
                else:
                    raise ValueError(
                        "`stride` was provided to process all the text but you're using a slow tokenizer."
                        " Please use a fast tokenizer."
                    )
        return preprocess_params, {}, postprocess_params

    def __call__(self, inputs: Union[str, List[str]], **kwargs):
        """
        Classify each token of the text(s) given as inputs.

        Args:
            inputs (`str` or `List[str]`):
                One or several texts (or one list of texts) for token classification.

        Return:
            A list or a list of list of `dict`: Each result comes as a list of dictionaries (one for each token in the
            corresponding input, or each entity if this pipeline was instantiated with an aggregation_strategy) with
            the following keys:

            - **word** (`str`) -- The token/word classified. This is obtained by decoding the selected tokens. If you
              want to have the exact string in the original sentence, use `start` and `end`.
            - **score** -- In this pipeline, the tag id taken from the model output (for grouped entities, the mean
              of the tag ids of the grouped tokens). It is not a probability.
            - **entity** (`str`) -- The entity predicted for that token/word (it is named *entity_group* when
              *aggregation_strategy* is not `"none"`.
            - **index** (`int`, only present when `aggregation_strategy="none"`) -- The index of the corresponding
              token in the sentence.
            - **start** (`int`, *optional*) -- The index of the start of the corresponding entity in the sentence. Only
              exists if the offsets are available within the tokenizer
            - **end** (`int`, *optional*) -- The index of the end of the corresponding entity in the sentence. Only
              exists if the offsets are available within the tokenizer
        """

        # The argument handler is only used to extract an optional user-provided offset mapping;
        # the untouched `inputs` are what gets forwarded to the base pipeline.
        _inputs, offset_mapping = self._args_parser(inputs, **kwargs)
        if offset_mapping:
            kwargs["offset_mapping"] = offset_mapping

        return super().__call__(inputs, **kwargs)

    def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
        """Tokenize `sentence` and yield one single-row model input dict per tokenizer chunk."""
        tokenizer_params = preprocess_params.pop("tokenizer_params", {})
        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
        inputs = self.tokenizer(
            sentence,
            return_tensors=self.framework,
            truncation=truncation,
            return_special_tokens_mask=True,
            return_offsets_mapping=self.tokenizer.is_fast,
            **tokenizer_params,
        )
        inputs.pop("overflow_to_sample_mapping", None)
        num_chunks = len(inputs["input_ids"])

        for i in range(num_chunks):
            model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}

            if offset_mapping is not None:
                model_inputs["offset_mapping"] = offset_mapping
            # Only the first chunk carries the sentence; `postprocess` reads it from `all_outputs[0]`.
            model_inputs["sentence"] = sentence if i == 0 else None
            model_inputs["is_last"] = i == num_chunks - 1

            yield model_inputs

    def _forward(self, model_inputs):
        """Run the model on one chunk and return its first output together with the bookkeeping fields."""
        # Strip everything the model does not accept before calling it.
        special_tokens_mask = model_inputs.pop("special_tokens_mask")
        offset_mapping = model_inputs.pop("offset_mapping", None)
        sentence = model_inputs.pop("sentence")
        is_last = model_inputs.pop("is_last")
        if self.framework == "tf":
            logits = self.model(**model_inputs)[0]
        else:
            output = self.model(**model_inputs)
            # For a tuple output this takes the first element. The key is still called "logits" for
            # compatibility with the rest of the pipeline; `aggregate` treats the values as tag ids.
            logits = output["logits"] if isinstance(output, dict) else output[0]

        return {
            "logits": logits,
            "special_tokens_mask": special_tokens_mask,
            "offset_mapping": offset_mapping,
            "sentence": sentence,
            "is_last": is_last,
            **model_inputs,
        }

    def postprocess(self, all_outputs, aggregation_strategy=AggregationStrategy.NONE, ignore_labels=None):
        """Turn the per-chunk model outputs into a flat list of entity dicts, dropping `ignore_labels`."""
        if ignore_labels is None:
            ignore_labels = ["O"]
        all_entities = []
        for model_outputs in all_outputs:
            logits = model_outputs["logits"][0].numpy()
            sentence = all_outputs[0]["sentence"]
            input_ids = model_outputs["input_ids"][0]
            offset_mapping = (
                model_outputs["offset_mapping"][0] if model_outputs["offset_mapping"] is not None else None
            )
            special_tokens_mask = model_outputs["special_tokens_mask"][0].numpy()

            # No softmax here: the model output is used as-is (see `aggregate`).
            scores = logits

            if self.framework == "tf":
                input_ids = input_ids.numpy()
                offset_mapping = offset_mapping.numpy() if offset_mapping is not None else None

            pre_entities = self.gather_pre_entities(
                sentence, input_ids, scores, offset_mapping, special_tokens_mask, aggregation_strategy
            )

            grouped_entities = self.aggregate(pre_entities, aggregation_strategy)

            # Filter anything that is in self.ignore_labels
            entities = [
                entity
                for entity in grouped_entities
                if entity.get("entity", None) not in ignore_labels
                and entity.get("entity_group", None) not in ignore_labels
            ]
            all_entities.extend(entities)
        num_chunks = len(all_outputs)
        if num_chunks > 1:
            all_entities = self.aggregate_overlapping_entities(all_entities)
        return all_entities

    def aggregate_overlapping_entities(self, entities):
        """Among entities whose spans overlap, keep the longest one (ties broken by the higher score)."""
        if len(entities) == 0:
            return entities
        entities = sorted(entities, key=lambda x: x["start"])
        aggregated_entities = []
        previous_entity = entities[0]
        for entity in entities:
            if previous_entity["start"] <= entity["start"] < previous_entity["end"]:
                current_length = entity["end"] - entity["start"]
                previous_length = previous_entity["end"] - previous_entity["start"]
                if current_length > previous_length:
                    previous_entity = entity
                elif current_length == previous_length and entity["score"] > previous_entity["score"]:
                    previous_entity = entity
            else:
                aggregated_entities.append(previous_entity)
                previous_entity = entity
        aggregated_entities.append(previous_entity)
        return aggregated_entities

    def gather_pre_entities(
        self,
        sentence: str,
        input_ids: np.ndarray,
        scores: np.ndarray,
        offset_mapping: Optional[List[Tuple[int, int]]],
        special_tokens_mask: np.ndarray,
        aggregation_strategy: AggregationStrategy,
    ) -> List[dict]:
        """Fuse various numpy arrays into dicts with all the information needed for aggregation"""
        pre_entities = []
        for idx, token_scores in enumerate(scores):
            # Filter special_tokens
            if special_tokens_mask[idx]:
                continue

            word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))
            if offset_mapping is not None:
                start_ind, end_ind = offset_mapping[idx]
                if not isinstance(start_ind, int):
                    if self.framework == "pt":
                        start_ind = start_ind.item()
                        end_ind = end_ind.item()
                word_ref = sentence[start_ind:end_ind]
                if getattr(self.tokenizer, "_tokenizer", None) and getattr(
                    self.tokenizer._tokenizer.model, "continuing_subword_prefix", None
                ):
                    # This is a BPE, word aware tokenizer, there is a correct way
                    # to fuse tokens
                    is_subword = len(word) != len(word_ref)
                else:
                    # This is a fallback heuristic. This will fail most likely on any kind of text +
                    # punctuation mixtures that will be considered "words". Non word aware models cannot
                    # do better than this unfortunately.
                    if aggregation_strategy in {
                        AggregationStrategy.FIRST,
                        AggregationStrategy.AVERAGE,
                        AggregationStrategy.MAX,
                    }:
                        warnings.warn(
                            "Tokenizer does not support real words, using fallback heuristic",
                            UserWarning,
                        )
                    is_subword = start_ind > 0 and " " not in sentence[start_ind - 1 : start_ind + 1]

                if int(input_ids[idx]) == self.tokenizer.unk_token_id:
                    word = word_ref
                    is_subword = False
            else:
                start_ind = None
                end_ind = None
                is_subword = False

            pre_entity = {
                "word": word,
                "scores": token_scores,
                "start": start_ind,
                "end": end_ind,
                "index": idx,
                "is_subword": is_subword,
            }
            pre_entities.append(pre_entity)
        return pre_entities

    def aggregate(self, pre_entities: List[dict], aggregation_strategy: AggregationStrategy) -> List[dict]:
        """Label every pre-entity and, unless the strategy is NONE, merge adjacent ones via `group_entities`."""
        if aggregation_strategy in {AggregationStrategy.NONE, AggregationStrategy.SIMPLE}:
            entities = []
            for pre_entity in pre_entities:
                # The per-token value is used directly as the key into `id2label` (no argmax), and it is
                # also what gets reported as "score".
                entity = {
                    "entity": self.model.config.id2label[pre_entity["scores"]],
                    "score": pre_entity["scores"],
                    "index": pre_entity["index"],
                    "word": pre_entity["word"],
                    "start": pre_entity["start"],
                    "end": pre_entity["end"],
                }
                entities.append(entity)
        else:
            entities = self.aggregate_words(pre_entities, aggregation_strategy)

        if aggregation_strategy == AggregationStrategy.NONE:
            return entities
        return self.group_entities(entities)

    def aggregate_word(self, entities: List[dict], aggregation_strategy: AggregationStrategy) -> dict:
        """
        Merge the tokens of one word into a single entity (FIRST / MAX / AVERAGE strategies).

        Kept from the stock pipeline. It needs `entity["scores"]` to be a per-label vector, so it is not reachable
        through `__call__` in this pipeline (`_sanitize_parameters` rejects those strategies).
        """
        word = self.tokenizer.convert_tokens_to_string([entity["word"] for entity in entities])
        if aggregation_strategy == AggregationStrategy.FIRST:
            scores = entities[0]["scores"]
            idx = scores.argmax()
            score = scores[idx]
            entity = self.model.config.id2label[idx]
        elif aggregation_strategy == AggregationStrategy.MAX:
            max_entity = max(entities, key=lambda entity: entity["scores"].max())
            scores = max_entity["scores"]
            idx = scores.argmax()
            score = scores[idx]
            entity = self.model.config.id2label[idx]
        elif aggregation_strategy == AggregationStrategy.AVERAGE:
            scores = np.stack([entity["scores"] for entity in entities])
            average_scores = np.nanmean(scores, axis=0)
            entity_idx = average_scores.argmax()
            entity = self.model.config.id2label[entity_idx]
            score = average_scores[entity_idx]
        else:
            raise ValueError("Invalid aggregation_strategy")
        new_entity = {
            "entity": entity,
            "score": score,
            "word": word,
            "start": entities[0]["start"],
            "end": entities[-1]["end"],
        }
        return new_entity

    def aggregate_words(self, entities: List[dict], aggregation_strategy: AggregationStrategy) -> List[dict]:
        """
        Override tokens from a given word that disagree to force agreement on word boundaries.

        Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be rewritten with first strategy as microsoft|
        company| B-ENT I-ENT
        """
        if aggregation_strategy in {
            AggregationStrategy.NONE,
            AggregationStrategy.SIMPLE,
        }:
            raise ValueError("NONE and SIMPLE strategies are invalid for word aggregation")

        word_entities = []
        word_group = None
        for entity in entities:
            if word_group is None:
                word_group = [entity]
            elif entity["is_subword"]:
                word_group.append(entity)
            else:
                word_entities.append(self.aggregate_word(word_group, aggregation_strategy))
                word_group = [entity]
        # Last item
        if word_group is not None:
            word_entities.append(self.aggregate_word(word_group, aggregation_strategy))
        return word_entities

    def group_sub_entities(self, entities: List[dict]) -> dict:
        """
        Group together the adjacent tokens with the same entity predicted.

        Args:
            entities (`dict`): The entities predicted by the pipeline.
        """
        # Get the first entity in the entity group
        entity = entities[0]["entity"].split("-")[-1]
        # Mean of the members' "score" values (tag ids in this pipeline, see `aggregate`).
        scores = np.nanmean([entity["score"] for entity in entities])
        tokens = [entity["word"] for entity in entities]

        entity_group = {
            "entity_group": entity,
            "score": np.mean(scores),
            "word": self.tokenizer.convert_tokens_to_string(tokens),
            "start": entities[0]["start"],
            "end": entities[-1]["end"],
        }
        return entity_group

    def get_tag(self, entity_name: str) -> Tuple[str, str]:
        """Split a label into its `B`/`I` prefix and tag; labels without a prefix are treated as `I`."""
        if entity_name.startswith("B-"):
            bi = "B"
            tag = entity_name[2:]
        elif entity_name.startswith("I-"):
            bi = "I"
            tag = entity_name[2:]
        else:
            # It's not in B-, I- format
            # Default to I- for continuation.
            bi = "I"
            tag = entity_name
        return bi, tag

    def group_entities(self, entities: List[dict]) -> List[dict]:
        """
        Find and group together the adjacent tokens with the same entity predicted.

        Args:
            entities (`dict`): The entities predicted by the pipeline.
        """

        entity_groups = []
        entity_group_disagg = []

        for entity in entities:
            if not entity_group_disagg:
                entity_group_disagg.append(entity)
                continue

            # If the current entity is similar and adjacent to the previous entity,
            # append it to the disaggregated entity group
            # The split is meant to account for the "B" and "I" prefixes
            # Shouldn't merge if both entities are B-type
            bi, tag = self.get_tag(entity["entity"])
            last_bi, last_tag = self.get_tag(entity_group_disagg[-1]["entity"])

            if tag == last_tag and bi != "B":
                # Modify subword type to be previous_type
                entity_group_disagg.append(entity)
            else:
                # If the current entity is different from the previous entity
                # aggregate the disaggregated entity group
                entity_groups.append(self.group_sub_entities(entity_group_disagg))
                entity_group_disagg = [entity]
        if entity_group_disagg:
            # it's the last entity, add it to the entity groups
            entity_groups.append(self.group_sub_entities(entity_group_disagg))

        return entity_groups

### Utilities

In [None]:
# Build the NER pipeline around the CRF model with the custom pipeline class.
# Passing a stride (even 0) makes the pipeline's `_sanitize_parameters` enable
# `return_overflowing_tokens`, so inputs are tokenized into chunks.
ner_pipe = pipeline(
  'ner',
  model=model,
  tokenizer=tokenizer,
  pipeline_class=CustomTokenClassificationPipeline,
  aggregation_strategy="simple",
  stride=0,
)

In [None]:
# Smoke test: run the pipeline on one sample sentence; the cell output shows the grouped entities.
ner_pipe("Paciente varón de 35 años con tumoración en polo superior de teste derecho hallada de manera casual durante una autoexploración, motivo por el cual acude a consulta de urología donde se realiza exploración física, apreciando masa de 1cm aproximado de diámetro dependiente de epidídimo, y ecografía testicular, que se informa como lesión nodular sólida en cabeza de epidídimo derecho.")

In [None]:
# group annotations around a clinical procedure mention, based on the annotation label
def group_annotations_strict(annotations):
  """Split pipeline annotations into mention groups using the strict strategy.

  A group opens on a 'LABEL_1' annotation, absorbs every directly following
  'LABEL_1' annotation and then every directly following 'LABEL_2' annotation
  (there may be none). Anything else outside a group is dropped: 'LABEL_0'
  entries and 'LABEL_2' entries that do not follow a 'LABEL_1' run.

  Returns a list of groups, each a list of the original annotation dicts.
  """
  total = len(annotations)
  groups = []
  cursor = 0
  while cursor < total:
    # Only a LABEL_1 annotation may open a group; skip everything else.
    if annotations[cursor]['entity_group'] != 'LABEL_1':
      cursor += 1
      continue

    group_start = cursor
    # Consume the run of LABEL_1 annotations, then the run of LABEL_2 annotations.
    for wanted_label in ('LABEL_1', 'LABEL_2'):
      while cursor < total and annotations[cursor]['entity_group'] == wanted_label:
        cursor += 1

    groups.append([annotations[position] for position in range(group_start, cursor)])

  return groups

In [None]:
# merge grouped annotations to form a complete entity mention
def merge_annotation_group_entries(annotation_group):
  """Collapse one annotation group into a single mention dict.

  The mention spans from the first annotation's 'start' to the last
  annotation's 'end'; its 'text' is the annotations' words joined by spaces.
  """
  first_entry = annotation_group[0]
  last_entry = annotation_group[-1]
  words = [entry['word'] for entry in annotation_group]
  return {
    'start': first_entry['start'],
    'end': last_entry['end'],
    'text': ' '.join(words),
  }

In [None]:
def get_mentions(sentence):
  """Run the NER pipeline on `sentence` and return one merged mention dict per annotation group."""
  mentions = []
  for group in group_annotations_strict(ner_pipe(sentence)):
    mentions.append(merge_annotation_group_entries(group))
  return mentions

In [None]:
# Smoke test: extract the merged mentions (start, end, text) from one sample sentence.
get_mentions("En la última realizada (4 años después del diagnóstico) se aprecia alteración en la morfología del polo inferior del riñón izquierdo con disminución de la cortical y calcificación abigarrada asociada en el parénquima, todo ello en relación con su diagnóstico de TBC.")

In [None]:
import pandas as pd

# Offsets table, indexed by file name so a file's row can be fetched with `.loc`.
offsets = pd.read_csv(offsets_path).set_index("file_name")

def apply_offset_to_mentions(mentions, file_name, offsets_table=None):
  """Shift the spans of `mentions` in place by the start offset recorded for `file_name`.

  Args:
    mentions: list of mention dicts with numeric 'start' and 'end' entries; modified in place.
    file_name: index label of the file's row in the offsets table.
    offsets_table: optional DataFrame indexed by file name with a 'start_offset' column.
      Defaults to the notebook-level `offsets` table.

  Returns:
    None.
  """
  if not mentions:
    # Nothing to shift; as before, no lookup happens when there are no mentions.
    return
  if offsets_table is None:
    offsets_table = offsets

  # The offset is the same for every mention of the file, so look it up once.
  offset = offsets_table.loc[file_name, "start_offset"]
  for mention in mentions:
    mention["start"] += offset
    mention["end"] += offset

In [None]:
def save_mentions_tsv(processed_annotations, file_name, output_dir=None):
  """Write the mentions of one file to `<output_dir>/<file_name without .txt>.tsv`.

  Args:
    processed_annotations: iterable of mention dicts with 'start', 'end' and 'text' keys.
    file_name: name of the source file; a trailing ".txt" extension is removed.
    output_dir: directory to write into. Defaults to the notebook-level `output_per_file_path`.

  The TSV has the columns filename, label, start_span, end_span and text; the label is always
  "SINTOMA". The header row is written even when there are no mentions.
  """
  # Remove the exact ".txt" suffix. `str.rstrip(".txt")` strips any trailing run of the characters
  # '.', 't' and 'x', which mangles names such as "text.txt" (it would become "te").
  if file_name.endswith(".txt"):
    file_name = file_name[:-len(".txt")]
  if output_dir is None:
    output_dir = output_per_file_path

  rows = [
    {
      "filename": file_name,
      "label": "SINTOMA",
      "start_span": annotation["start"],
      "end_span": annotation["end"],
      "text": annotation["text"],
    }
    for annotation in processed_annotations
  ]
  # Explicit columns keep the header stable when `rows` is empty.
  df = pd.DataFrame(rows, columns=["filename", "label", "start_span", "end_span", "text"])
  df.to_csv(f"{output_dir}/{file_name}.tsv", sep="\t", index=False)

### Processing

In [None]:
# Every .txt file in the input directory is a file to annotate; show how many there are.
file_names = [entry for entry in os.listdir(data_path) if entry.endswith(".txt")]
len(file_names)

In [None]:
# Make sure the per-file output directory exists before listing it or writing into it.
os.makedirs(output_per_file_path, exist_ok=True)

# Names (without extension) of the files that already have a TSV, so an interrupted run can resume.
# `str.strip('.tsv')` removes any of the characters '.', 't', 's', 'v' from BOTH ends of the name
# rather than the extension, so the extension is split off explicitly. A set gives O(1) lookups.
already_processed_files = {
  os.path.splitext(output_name)[0]
  for output_name in os.listdir(output_per_file_path)
  if output_name.endswith(".tsv")
}

for file_name in file_names:
  print(file_name)
  # Exact suffix removal (`rstrip(".txt")` would also eat trailing 't', 'x' and '.' characters).
  file_name_no_ext = file_name[:-len(".txt")] if file_name.endswith(".txt") else file_name

  if file_name_no_ext in already_processed_files: # this helps in case processing has been interrupted
    continue

  mentions_in_file = []

  # Read as UTF-8, like the original reports, instead of the platform default encoding.
  with open(f"{data_path}/{file_name}", encoding="utf-8") as txt_file:
    for line in txt_file:
      # Lines keep their trailing newline, so `not line` never fires; test the stripped text.
      if not line.strip():
        continue

      # NOTE(review): mention spans are relative to the start of `line`, while a single per-file
      # offset is applied below. This is only exact if every file holds one line - confirm.
      mentions_in_file.extend(get_mentions(line))

  # after processing all lines
  apply_offset_to_mentions(mentions_in_file, file_name)
  save_mentions_tsv(mentions_in_file, file_name)

In [None]:
# restore the original mention text from the source report, as encoding errors may occur when merging tokens in the pipeline
import pandas as pd

def restore_mention_text(row):
  """Return the slice [start_span:end_span] of the original report named by row['filename']."""
  source_path = f"{original_texts_path}/{row['filename']}.txt"

  with open(source_path, "r", encoding="utf-8") as report:
    full_text = report.read()

  span_start, span_end = row["start_span"], row["end_span"]
  return full_text[span_start:span_end]

In [None]:
# merge data frames for the separate files into a single result df
dfs = []

# Only the per-file TSVs are read (other directory entries are skipped), in sorted order so the
# merged file does not depend on the arbitrary order returned by os.listdir.
for filename in sorted(os.listdir(output_per_file_path)):
  if not filename.endswith(".tsv"):
    continue
  # Read `filename` as text so that purely numeric file names are not parsed as numbers.
  df = pd.read_csv(
    f"{output_per_file_path}/{filename}", index_col=None, header=0, sep="\t", dtype={"filename": str}
  )
  dfs.append(df)

if not dfs:
  raise ValueError(f"No per-file .tsv results found in {output_per_file_path}")

results_df = pd.concat(dfs, axis=0, ignore_index=True)

# Per-file names look like "<report>-b-<suffix>"; keep the part before the first "-b-".
results_df["filename"] = results_df["filename"].str.split("-b-").str[0]
# Re-read every mention from the source report. Iterating over records (instead of a row-wise
# `apply`) also works when there are no mentions at all.
results_df["text"] = [restore_mention_text(record) for record in results_df.to_dict("records")]
results_df.to_csv(f"{output_root_path}/results.tsv", sep="\t", index=False)