# Google Colab

In [None]:
# %%capture
# # uncomment to install required libraries in Google Colab
# ! pip install transformers
# ! pip install datasets
# ! pip install seqeval
# ! pip install mendelai-brat-parser
# ! pip install spacy==3.2.0
# ! pip install nervaluate

# ! python -m spacy download en_core_web_sm

In [None]:
# Colab-specific cell: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# cd /content/drive/MyDrive/TAISTI/WP1/NER

# Libraries

In [None]:
import pandas as pd
import re

import prepare_data_utils
from model import NERTaisti

Downloading:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
# Every annotation entity type used in this notebook. The list supplies the
# keys of ENTITIES_MAP and is passed as `entity_hierarchy` to
# prepare_data_utils.collect_recipes_with_annotations.
# NOTE(review): the name suggests that the order encodes a priority between
# entity types — confirm against prepare_data_utils before reordering.
ALL_ENTITIES_HIERARCHY = [
    "quantity",
    "unit",
    "food_product_with_unit",
    "food_product_without_unit_countable",
    "food_product_without_unit_uncountable",
    "food_product_whole",
    "process",
    "physical_quality",
    "color",
    "trade_name",
    "example",
    "taste",
    "purpose",
    "diet",
    "part",
    "possible_substitute",
    "excluded",
    "exclusive"
]

In [None]:
# Collapse the fine-grained annotation labels into a smaller tag set:
# every label containing "food" becomes "FOOD", everything else becomes "O" ...
ENTITIES_MAP = {
    label: ("FOOD" if "food" in label else "O")
    for label in ALL_ENTITIES_HIERARCHY
}

# ... except for these labels, which keep their own upper-cased tag.
ENTITIES_MAP.update({
    label: label.upper()
    for label in ("quantity", "unit", "color", "physical_quality", "process")
})

In [None]:
# TODO: speed-up the process with multiprocessing
# Preprocessing settings shared by every split.
choose_span_func = prepare_data_utils.choose_food_span
entities_map = ENTITIES_MAP
entity_hierarchy = ALL_ENTITIES_HIERARCHY


def _split_paths(indices):
    """Build the file paths for one data split.

    Args:
        indices: iterable of recipe indices.

    Returns:
        A ``(recipe_paths, ann_paths)`` tuple of lists, where index ``idx``
        maps to ``annotations/{idx}.txt`` and ``annotations/{idx}.ann``.
    """
    recipe_paths = [f"annotations/{idx}.txt" for idx in indices]
    ann_paths = [f"annotations/{idx}.ann" for idx in indices]
    return recipe_paths, ann_paths


def _collect_split(recipe_paths, ann_paths):
    """Load one data split with the notebook-wide preprocessing settings.

    Thin wrapper around
    ``prepare_data_utils.collect_recipes_with_annotations`` so the same
    keyword arguments are not repeated for every split. The module-level
    ``entities_map``, ``choose_span_func`` and ``entity_hierarchy`` are read
    at call time.

    Args:
        recipe_paths: list of recipe ``.txt`` paths.
        ann_paths: list of the matching ``.ann`` paths.

    Returns:
        Whatever ``collect_recipes_with_annotations`` returns; the callers
        below unpack it as ``(recipes, entities)``.
    """
    return prepare_data_utils.collect_recipes_with_annotations(
        annotations_paths=ann_paths, recipes_paths=recipe_paths,
        scheme_func=prepare_data_utils.bio_scheme,
        map_entity_func=prepare_data_utils.map_entity,
        entities_map=entities_map,
        choose_span_func=choose_span_func,
        entity_hierarchy=entity_hierarchy
    )


# Index ranges that define the train / validation / test splits.
train_indices = list(range(240)) + list(range(300, 400)) + list(range(500, 600))
val_indices = list(range(240, 300))
# Indices left out of the test range.
# NOTE(review): presumably no files exist for these two — confirm.
missing_indices = [417, 443]
test_indices = [idx for idx in range(400, 500) if idx not in missing_indices]

train_recipe_paths, train_ann_paths = _split_paths(train_indices)
val_recipe_paths, val_ann_paths = _split_paths(val_indices)
test_recipe_paths, test_ann_paths = _split_paths(test_indices)

train_recipes, train_entities = _collect_split(train_recipe_paths, train_ann_paths)
val_recipes, val_entities = _collect_split(val_recipe_paths, val_ann_paths)
test_recipes, test_entities = _collect_split(test_recipe_paths, test_ann_paths)

# Train

In [None]:
# cross-validation was used hence we can train on everything
# Concatenate the three splits (train, then validation, then test).
# NOTE(review): the `model.train(...)` call in this notebook receives the
# separate train/val splits, not these combined lists, and `recipes` is
# reassigned in the prediction section — confirm which data the final model
# is meant to be trained on.
recipes = train_recipes + val_recipes + test_recipes
entities = train_entities + val_entities + test_entities

In [None]:
# Build the label -> integer id mapping: "O" is 0, then each distinct tag in
# `entities_map` gets a consecutive "B-<tag>" / "I-<tag>" pair of ids.
label2id = {"O": 0}
idx = 1

# Iterate the tags in sorted order. A bare set of strings is iterated in hash
# order, which changes between interpreter runs (string hash randomization),
# so without sorting the same label could get a different id on every run.
# NOTE(review): ENTITIES_MAP maps many entities to "O", so "B-O" and "I-O"
# entries are created here as well — confirm that this is intended.
for entity in sorted(set(entities_map.values())):
    label2id[f"B-{entity}"] = idx
    idx += 1
    label2id[f"I-{entity}"] = idx
    idx += 1

# Ids are assigned in increasing order and dicts keep insertion order, so
# `label2id` is already ordered by id; no re-sorting is needed.

In [None]:
# TODO: config should be a json file

# Settings handed to NERTaisti below. The exact meaning of the top-level keys
# is defined by the project's `model` module, which is not shown here.
CONFIG = {
    "_name_or_path": "bert-base-cased",
    "model_pretrained_path": "",
    "save_dir": "../res/bert-base-cased",  # or any choice
    "num_of_tokens": 128,  # presumably the maximum number of tokens per input — TODO confirm in model.py
    "only_first_token": True,  # presumably only the first sub-token of a word is labelled — TODO confirm in model.py

    # for more details see https://huggingface.co/docs/transformers/v4.15.0/en/main_classes/trainer#transformers.TrainingArguments
    "training_args": {
        "output_dir": '../checkpoints',
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "save_total_limit": 2,
        "learning_rate": 2e-5,
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 32,
        "num_train_epochs": 10,
        "weight_decay": 0.01,
        "load_best_model_at_end": True,
        "seed": 62
    },

    # label -> id mapping built in the previous cell
    "label2id" : label2id
}

# Build the model wrapper from the in-memory config dict.
model = NERTaisti(config=CONFIG)

In [None]:
# Train on the train split; the validation split is passed as the third and
# fourth arguments.
model.train(train_recipes, train_entities, val_recipes, val_entities)

# Evaluate

In [None]:
# Re-create the model from a config file on disk instead of the CONFIG dict.
# NOTE(review): this path ("../res/ner_model") differs from CONFIG["save_dir"]
# ("../res/bert-base-cased") — confirm that it points at the model you trained.
model = NERTaisti(config="../res/ner_model/config.json")

In [None]:
# Evaluate on the validation split; the bare `results` on the last line makes
# the value the cell's displayed output.
# NOTE: the name `results` is rebound to a DataFrame in the prediction section.
results = model.evaluate(val_recipes, val_entities)
results

# Prediction

In [None]:
# Load the model from its saved config so this section can be run without
# executing the training cells first.
model = NERTaisti(config="../res/ner_model/config.json")

In [None]:
# Predict entities for the validation recipes; the result is flattened next to
# `val_recipes` / `val_entities` in the DataFrame built below.
pred_entities = model.predict(val_recipes)

To analyse the predictions, you can inspect the following DataFrame:

In [None]:
def flatten_list(deep_list):
    """Flatten one level of nesting.

    Args:
        deep_list: iterable whose elements are themselves iterables.

    Returns:
        A new list with the items of every inner iterable, in order.
        Only the outermost level is flattened; deeper nesting is kept.
    """
    return [item for inner in deep_list for item in inner]

In [None]:
# Recipe id of every validation file: the last run of digits in its .ann path.
val_recipe_ids = [re.findall(r"\d+", ann_path)[-1] for ann_path in val_ann_paths]

# One row per word: the recipe it belongs to, the word itself, and its
# annotated and predicted entity tags.
results = pd.DataFrame({
    "recipe_idx": flatten_list(
        [[recipe_id] * len(val_recipes[i]) for i, recipe_id in enumerate(val_recipe_ids)]
    ),
    "word": flatten_list(val_recipes),
    "true_entity": flatten_list(val_entities),
    "pred_entity": flatten_list(pred_entities),
})

In [None]:
# Write the per-word table to a CSV in the current working directory, without
# the DataFrame index column.
results.to_csv("manual_review_val_set.csv", index=False)

To get predictions for files that have not been annotated yet:

In [None]:
# Paths of the recipe texts to predict on.
# NOTE(review): range(240, 300) is the same range as `val_indices`, so this
# only serves as an example — replace it with the indices of the unannotated
# files.
recipe_paths = [f"annotations/{idx}.txt" for idx in range(240, 300)]

# Load the recipe texts only; no .ann files are passed here.
# NOTE: this rebinds `recipes`, which earlier held the combined splits.
recipes = prepare_data_utils.collect_recipes_without_annotations(
    recipes_paths=recipe_paths
)

In [None]:
# Predict entities for the recipes loaded without annotations.
# NOTE: this overwrites the `pred_entities` computed for the validation set.
pred_entities = model.predict(recipes)