<a href="https://colab.research.google.com/github/wallybeamm/neural_entity_recognition/blob/feature%2Finit/enitity_recognation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Entity Recognition

## Academic Background

## Practical Implementations

### Set parameters

In [None]:
# --- Data and base model -----------------------------------------------------
# JSON Lines file with the annotated texts used for fine-tuning.
json_path = '/content/train.jsonl'
# Name of the pretrained spaCy pipeline that gets loaded and fine-tuned.
model_name = 'en_core_web_trf'

# --- Training ----------------------------------------------------------------
# Directory the fine-tuned pipeline is written to.
output_dir = "/content/Model"
# Number of passes over the training data.
n_iter = 100
# Base learning rate; the cyclic schedule is bounded by learn_rate / 3 and learn_rate * 3.
learn_rate = 2e-5


### Install missing packages

In [None]:
# Download the English transformer pipeline that `model_name` refers to.
!python -m spacy download en_core_web_trf
# Install or upgrade the Weights & Biases client (-q: quiet, -U: upgrade).
!pip3 install -qU wandb


### Import libraries

In [None]:
import os
from pathlib import Path
import wandb
import spacy
from spacy.tokens import DocBin
import json
import random
from spacy.training.example import Example
import thinc
import torch
from spacy.util import minibatch
from tqdm.auto import tqdm
import unicodedata
import wasabi
import numpy
from collections import Counter
import gc 
from spacy.scorer import Scorer


### Initialize Loggers

In [None]:
# Log this session in to Weights & Biases.
wandb.login()
# W&B Artifact naming convention: `wandb_entity/wandb_project/artifact_name:version`
spacy_artifact = 'wandb/spacy/spacy_demo:v3'

# Local directory the artifact is downloaded into.
spacy_dir = Path("my_spacy_demo") 

# Open a run in the 'spacy_demo' project for the duration of the `with` block.
with wandb.init(project='spacy_demo') as run: # "config" is optional here
    # Register the artifact as an input of this run, then fetch its files.
    artifact = run.use_artifact(spacy_artifact)
    # The value returned by download() is not used afterwards, hence the throwaway name.
    _ = artifact.download(spacy_dir)

## Train the model by using spaCy's train function

### Reformat dataset for Spacy's train function

In [None]:

def load_dataset(path):
    """Read a JSON Lines annotation file into ``(text, label)`` pairs.

    Every non-blank line is parsed as one JSON object. Its ``'data'`` value
    becomes the text and its ``'label'`` value is passed through unchanged.
    Newlines inside the text are replaced by single spaces, a one-for-one
    substitution that leaves the length of the string unchanged.

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list of ``(text, label)`` tuples in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``'data'`` or ``'label'`` key.
    """
    data = []
    # The context manager closes the file even when a line fails to parse.
    with open(path, 'r', encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                # Tolerate empty lines instead of failing inside json.loads.
                continue
            line_dict = json.loads(line)
            data.append((line_dict['data'].replace('\n', ' '), line_dict['label']))
    return data


# A blank English pipeline is enough here: it is only used to turn text into Doc objects.
nlp = spacy.blank("en")
training_data = load_dataset('./train.jsonl')
# the DocBin will store the example documents
db = DocBin()
# Count the annotations that cannot be converted so the loss is reported, not silent.
skipped_spans = 0
for text, annotations in training_data:
    doc = nlp(text)
    ents = []
    for start, end, label in annotations:
        # Annotations without a label cannot become entities.
        if label is None or label == '':
            continue

        span = doc.char_span(start, end, label=label)
        if span is None:
            # char_span gave no span for these character offsets.
            skipped_spans += 1
            continue
        ents.append(span)

    doc.ents = ents
    db.add(doc)
db.to_disk("./train.spacy")
print(f"Wrote {len(training_data)} docs to ./train.spacy ({skipped_spans} spans skipped)")

### Create the config file

In [None]:
# Auto-fill the remaining settings of base_config.cfg and save the result as config.cfg.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


### Investigate the dataset

In [None]:
# Run spaCy's `debug data` command against config.cfg before training.
!python -m spacy debug data config.cfg

[1m
Downloading config.json: 100% 481/481 [00:00<00:00, 445kB/s]
Downloading vocab.json: 100% 878k/878k [00:00<00:00, 5.12MB/s]
Downloading merges.txt: 100% 446k/446k [00:00<00:00, 3.12MB/s]
Downloading tokenizer.json: 100% 1.29M/1.29M [00:00<00:00, 6.39MB/s]
Downloading pytorch_model.bin: 100% 478M/478M [00:06<00:00, 75.0MB/s]
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Bert

### Train

In [None]:
# Train from config.cfg on GPU 0 and write the output to my_spacy_demo/training/cnn.
!python -m spacy train config.cfg -o my_spacy_demo/training/cnn --gpu-id 0

[38;5;2m✔ Created output directory: my_spacy_demo/training/cnn[0m
[38;5;4mℹ Saving to output directory: my_spacy_demo/training/cnn[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-10-22 19:36:50,967] [INFO] Set up nlp object from config
INFO:spacy:Set up nlp object from config
[2022-10-22 19:36:51,748] [INFO] Pipeline: ['transformer', 'ner']
INFO:spacy:Pipeline: ['transformer', 'ner']
[2022-10-22 19:36:51,753] [INFO] Created vocabulary
INFO:spacy:Created vocabulary
[2022-10-22 19:36:51,755] [INFO] Finished initializing nlp object
INFO:spacy:Finished initializing nlp object
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceCla

## Define our functions for training

In [None]:
# Load the dataset

def load_dataset(path):
    """Read a JSON Lines annotation file into ``(text, label)`` pairs.

    Every non-blank line is parsed as one JSON object. Its ``'data'`` value
    becomes the text and its ``'label'`` value is passed through unchanged.
    Newlines inside the text are replaced by single spaces, a one-for-one
    substitution that leaves the length of the string unchanged.

    The file is read as UTF-8, matching the loader defined earlier in this
    notebook, which this definition replaces once the cell has run.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        A list of ``(text, label)`` tuples in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``'data'`` or ``'label'`` key.
    """
    data = []
    # The context manager closes the file even when a line fails to parse.
    with open(path, 'r', encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                # Tolerate empty lines instead of failing inside json.loads.
                continue
            line_dict = json.loads(line)
            data.append((line_dict['data'].replace('\n', ' '), line_dict['label']))
    return data

# Display entity info
def show_ents(doc):
  """Render the entities of ``doc`` with displaCy's "ent" style inside the notebook."""
  # jupyter=True is for notebook use; outside a notebook displacy.serve(doc, style="ent") is the usual call.
  render_options = {"style": "ent", "jupyter": True}
  spacy.displacy.render(doc, **render_options)

def cyclic_triangular_rate(min_lr, max_lr, period):
    """Endlessly yield learning rates following a triangular wave.

    Counting steps from 1, the rate climbs linearly towards ``max_lr``, which it
    reaches at step ``period``, then falls back to ``min_lr`` at step
    ``2 * period``, and repeats.

    Args:
        min_lr: Lowest rate of the cycle.
        max_lr: Highest rate of the cycle.
        period: Number of steps in each rising (and each falling) half-cycle.

    Yields:
        The learning rate for the next step.
    """
    # https://towardsdatascience.com/adaptive-and-cyclical-learning-rates-using-pytorch-2bf904d18dee
    step = 0
    while True:
        step += 1
        cycle_index = numpy.floor(1 + step / (2 * period))
        distance = numpy.abs(step / period - 2 * cycle_index + 1)
        # `weight` is 1 at the peak of a cycle and 0 at its ends.
        weight = max(0, 1 - distance)
        yield min_lr + (max_lr - min_lr) * weight

def train(data, model):
    """Fine-tune the NER component of ``model`` on character-offset annotations.

    Args:
        data: Sequence of ``(text, spans)`` pairs in which every span is
            indexable as ``(start_char, end_char, label)``. The sequence is
            not modified; shuffling happens on a private copy.
        model: A loaded spaCy ``Language`` pipeline. An ``ner`` component is
            added when the pipeline does not have one yet.

    Returns:
        The same ``model`` object, updated in place.

    Notes:
        Reads the notebook-level settings ``n_iter`` (passes over ``data``)
        and ``learn_rate``.
    """
    from thinc.api import set_gpu_allocator

    # Use the GPU, with memory allocations directed via PyTorch.
    # This prevents out-of-memory errors that would otherwise occur from competing
    # memory pools.
    set_gpu_allocator("pytorch")

    # spaCy v3: add_pipe takes the factory name and returns the new component.
    if "ner" not in model.pipe_names:
        ner = model.add_pipe("ner")
    else:
        ner = model.get_pipe("ner")

    # Update the label list
    for record in data:
        for ent in record[1]:
            ner.add_label(ent[2])

    # NOTE(review): this schedule is built but never handed to the optimizer,
    # so `learn_rate` currently has no effect on training - confirm the intent.
    learn_rates = cyclic_triangular_rate(
        learn_rate / 3, learn_rate * 3, 2 * len(data)
    )

    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(data)

    with model.select_pipes(enable=['ner', 'transformer']):  # only train NER
        optimizer = model.resume_training()
        for itn in range(n_iter):
            random.shuffle(shuffled)
            losses = {}
            for batch in spacy.util.minibatch(shuffled, size=8):
                batch_examples = []
                for text, annotations in batch:
                    doc = model.make_doc(text)
                    example = Example.from_dict(doc, {'entities': annotations})
                    batch_examples.append(example)
                    # Examples are fed to update() one at a time, as before.
                    model.update([example], sgd=optimizer, drop=0.1, losses=losses)

                # evaluate() runs the pipeline on the examples before scoring;
                # the examples built above carry no predictions of their own.
                scores = model.evaluate(batch_examples)
                print(scores)
            print("Iteration", itn + 1, "of", n_iter, "losses", losses)

    return model

def split(data, train_percantage):
  """Cut ``data`` into a leading training part and a trailing test part.

  Args:
      data: Sliceable sequence of records.
      train_percantage: Fraction of ``data`` placed in the training part; the
          cut index is this fraction of ``len(data)`` truncated by ``int``.

  Returns:
      A ``(train_data, test_data)`` tuple; no shuffling is performed.
  """
  cut = int(len(data) * train_percantage)
  return data[:cut], data[cut:]

def test(test_data, model):
  for text, _ in test_data:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

def save_model(model, output_dir):
  """Write ``model`` to ``output_dir``; do nothing when ``output_dir`` is None.

  Args:
      model: Object exposing ``to_disk(path)``.
      output_dir: Target directory, or ``None`` to skip saving.
  """
  if output_dir is not None:
      # Save the pipeline that was passed in, not the notebook-level `nlp`.
      model.to_disk(output_dir)
      print("Saved model to", output_dir)


In [None]:
# Main
from thinc.api import set_gpu_allocator, require_gpu

# Use the GPU, with memory allocations directed via PyTorch.
# This prevents out-of-memory errors that would otherwise occur from competing
# memory pools.#
set_gpu_allocator("pytorch")
require_gpu(0)
data = load_dataset(json_path)

nlp = spacy.load(model_name)

train_data, test_data = split(data, 1)
#nlp.max_length = 100000
#nlp.max_split_size_mb = 100
finetuned_model = train(train_data, nlp)

if output_dir is not None:
    finetuned_model.to_disk(output_dir)
    print("Saved model to", output_dir)