# Google Colab

In [None]:
# %%capture
# # to install the required libraries in Google Colab
# ! pip install transformers
# ! pip install datasets
# ! pip install seqeval
# ! pip install mendelai-brat-parser
# ! pip install spacy==3.2.0
# ! pip install nervaluate

# ! python -m spacy download en_core_web_sm

In [None]:
# Mount Google Drive at /content/drive. This cell is meant for Google Colab,
# since it imports from the `google.colab` package.
from google.colab import drive
drive.mount('/content/drive')

# Libraries

In [None]:
import pandas as pd
import re

import prepare_data_utils
from model import NERTaisti
from mappings import ENTITIES_MAP

In [None]:
# TODO: speed-up the process with multiprocessing

# Files 0-239 are used as the training split, files 240-299 as the validation split.
# Each recipe text file (.txt) has an annotation file (.ann) with the same index.
train_recipe_paths = [f"annotations/{idx}.txt" for idx in range(240)]
train_ann_paths = [f"annotations/{idx}.ann" for idx in range(240)]
val_recipe_paths = [f"annotations/{idx}.txt" for idx in range(240, 300)]
val_ann_paths = [f"annotations/{idx}.ann" for idx in range(240, 300)]

# Keyword arguments passed identically to both collection calls below.
collect_kwargs = dict(
    scheme_func=prepare_data_utils.bio_scheme,
    map_entity_func=prepare_data_utils.map_entity,
    entities_map=ENTITIES_MAP,
    choose_span_func=prepare_data_utils.choose_food_span,
)

train_recipes, train_entities = prepare_data_utils.collect_recipes_with_annotations(
    annotations_paths=train_ann_paths,
    recipes_paths=train_recipe_paths,
    **collect_kwargs,
)

val_recipes, val_entities = prepare_data_utils.collect_recipes_with_annotations(
    annotations_paths=val_ann_paths,
    recipes_paths=val_recipe_paths,
    **collect_kwargs,
)

# Train

In [None]:
# TODO: config should be a json file

# Configuration used for training. "model_pretrained_path" is left empty here,
# whereas the evaluation and prediction cells set it to a saved model directory.
CONFIG = {
    "bert_type": "bert-base-cased",
    "model_pretrained_path": "",
    "max_length": 128,
    "only_first_token": True,

    # for more details see https://huggingface.co/docs/transformers/v4.15.0/en/main_classes/trainer#transformers.TrainingArguments
    "training_args": {
        # Relative path, matching the evaluation and prediction configs; the
        # previous '/checkpoints' pointed at the filesystem root.
        "output_dir": './checkpoints',
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "learning_rate": 2e-5,
        "per_device_train_batch_size": 32,
        "per_device_eval_batch_size": 32,
        "num_train_epochs": 10,
        "weight_decay": 0.01,
        "load_best_model_at_end": True,
        "seed": 62
    }
}

# Instantiate the model wrapper from the training configuration defined above.
model = NERTaisti(config=CONFIG)

In [None]:
# Run training; the validation recipes and entities are passed alongside the training ones.
model.train(train_recipes, train_entities, val_recipes, val_entities)

# Evaluate

In [None]:
# Configuration used for evaluation: "model_pretrained_path" points at a saved model directory.
CONFIG = dict(
    bert_type="bert-base-cased",
    model_pretrained_path="../res/ner_model/",  # load pretrained model
    max_length=128,
    only_first_token=True,
    # for more details see https://huggingface.co/docs/transformers/v4.15.0/en/main_classes/trainer#transformers.TrainingArguments
    training_args=dict(
        output_dir='./checkpoints',
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=10,
        weight_decay=0.01,
        load_best_model_at_end=True,
        seed=62,
    ),
)

# Re-create the model wrapper from the evaluation configuration defined above.
model = NERTaisti(config=CONFIG)

In [None]:
# Evaluate on the validation recipes and entities and keep the returned value.
results = model.evaluate(val_recipes, val_entities)

In [None]:
# Display the value returned by model.evaluate.
results

# Prediction

In [None]:
# Configuration used for prediction: "model_pretrained_path" points at a saved model directory.
CONFIG = dict(
    bert_type="bert-base-cased",
    model_pretrained_path="../res/ner_model",  # load pretrained model
    max_length=128,
    only_first_token=True,
    # for more details see https://huggingface.co/docs/transformers/v4.15.0/en/main_classes/trainer#transformers.TrainingArguments
    training_args=dict(
        output_dir='./checkpoints',
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=10,
        weight_decay=0.01,
        load_best_model_at_end=True,
        seed=62,
    ),
)

# Re-create the model wrapper from the prediction configuration defined above.
model = NERTaisti(config=CONFIG)

In [None]:
# Predict entities for the validation recipes.
pred_entities = model.predict(val_recipes)

To analyse the predictions, inspect the following DataFrame, which lists every word together with its true and predicted entity.

In [None]:
def flatten_list(deep_list):
    """Concatenate the items of every inner iterable into a single flat list.

    Only one level of nesting is removed; items of the inner iterables are
    kept as they are.

    Args:
        deep_list: An iterable whose elements are themselves iterables.

    Returns:
        A new list with the items of each inner iterable, in their original order.
    """
    return [item for inner in deep_list for item in inner]

In [None]:
# For each validation recipe, take the last run of digits in its annotation
# path as the recipe id and repeat it once per element of that recipe, so the
# ids line up with the flattened word column.
recipe_ids_per_word = [
    [re.findall(r"\d+", ann_path)[-1]] * len(val_recipes[i])
    for i, ann_path in enumerate(val_ann_paths)
]

# One row per word: recipe id, the word, its annotated entity and the predicted entity.
# NOTE: this rebinds `results`, which previously held the output of model.evaluate.
review_columns = {
    "recipe_idx": flatten_list(recipe_ids_per_word),
    "word": flatten_list(val_recipes),
    "true_entity": flatten_list(val_entities),
    "pred_entity": flatten_list(pred_entities),
}
results = pd.DataFrame(review_columns)

In [None]:
# Save the per-word table to CSV without the DataFrame index column.
results.to_csv("manual_review_val_set.csv", index=False)

If you want to get predictions for files that have not been annotated yet:

In [None]:
# Recipe text files to label; this uses the same index range (240-299) as the
# validation split above.
recipe_paths = [f"annotations/{idx}.txt" for idx in range(240, 300)]

# Only the recipe texts are loaded here; no annotation paths are passed.
recipes = prepare_data_utils.collect_recipes_without_annotations(recipes_paths=recipe_paths)

In [None]:
# Predict entities for the recipes loaded without annotations.
# NOTE: this rebinds `pred_entities`, which earlier held the validation predictions.
pred_entities = model.predict(recipes)