# Named Entity Annotation with BERT Transformers

The cookbook loads a dataset of social media posts and news articles in JSON form and then applies BERT-based named entity recognition to the texts. Based on the URL of the original post, the script also retrieves the full page body from the website and parses it with BeautifulSoup (BS4).

In [None]:
#%pip install --upgrade transformers beautifulsoup4

In [2]:
#Load Huggingface data
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import LukeForEntityClassification
from transformers import pipeline
import json, random

In [3]:
#Data Augmentation
from bs4 import BeautifulSoup
import urllib
from urllib.request import Request
from urllib.error import HTTPError, URLError
import socket

In [30]:
# Run configuration.
# sample_only / sample_size: when sample_only is True only sample_size entries
# are processed, which is the recommended setting for testing.
# append_mode: when True each annotated entry is appended to the output file
# as soon as it has been processed (useful when the input file is long).
sample_size = 5
sample_only = True
append_mode = False

### Data sources

In [16]:
# INPUT: path of the JSON dataset that gets annotated
input_file = 'ner_pre_dataset20230925.json'

# OUTPUT: path the annotated entries are written to.
# A result looks like this https://github.com/ternary-ai/datasets/blob/main/ner_sentences_2.5M_en_sample10K.csv
output_file = "ner_annotated_2.5M.json"



In [18]:
#BS4 page parser functions
def get_page(url):
    """Fetch a page and return the text of its <body> element.

    Args:
        url (string): Fully qualified URL of a page. A JSON-encoded string
            (the URL wrapped in double quotes) is decoded first; a plain
            URL is used as-is.

    Returns:
        string | bool: Text of the page body with the text fragments joined
        by ' | ', or False when the page could not be retrieved or has no
        <body> element.
    """
    # Decode JSON-encoded URLs, but fall back to the raw value for plain URLs
    # instead of failing with a JSONDecodeError.
    try:
        target = json.loads(url)
    except ValueError:
        target = url
    req = Request(f"{target}", headers={'User-Agent': 'Mozilla/5.0'})

    try:
        # Keep the raw bytes and the declared charset: the response headers
        # are only reachable on the response object, not on decoded text.
        with urllib.request.urlopen(req, timeout=10) as response:
            raw_html = response.read()
            charset = response.info().get_param('charset')
    except HTTPError as error:
        print(f'Data not retrieved because {error}\nURL: {url}')
        return False
    except URLError as error:
        if isinstance(error.reason, socket.timeout):
            print(f'socket timed out - URL {url}')
        else:
            print('some other error happened %s ' % error)
        return False
    except socket.timeout:
        # A timeout while reading the body is raised directly rather than
        # wrapped in URLError.
        print(f'socket timed out - URL {url}')
        return False

    # Passing bytes lets BeautifulSoup honour the declared charset and fall
    # back to its own detection when none is given.
    soup = BeautifulSoup(raw_html, 'html.parser', from_encoding=charset)

    element = soup.find('body')
    if element is None:
        # No <body> to extract text from; callers treat False as "no content".
        return False

    return element.get_text(' | ', strip=True)


In [19]:
def write_json(new_data, filename='data.json'):
    """Append one item to the JSON list stored in a file.

    The whole file is read, the item is appended to the list and the file is
    rewritten. A missing or empty file is treated as an empty list, so the
    first append creates the file.

    Args:
        new_data: Item to append. Values that are not JSON serializable are
            written as their str() representation.
        filename (string): Path of the JSON file holding a list.
    """
    try:
        with open(filename, 'r') as file:
            existing_text = file.read()
    except FileNotFoundError:
        existing_text = ''

    # An empty file has no JSON document yet; start a new list.
    file_data = json.loads(existing_text) if existing_text.strip() else []
    file_data.append(new_data)

    # Serialize before opening for writing so a serialization error cannot
    # leave a truncated file behind; mode 'w' drops any stale trailing bytes.
    serialized = json.dumps(file_data, indent=4, default=str)
    with open(filename, 'w') as file:
        file.write(serialized)


In [20]:
# Read the whole dataset into memory. The file is opened in binary mode so
# json.load detects the JSON encoding (UTF-8/16/32) itself instead of relying
# on the platform's default text encoding.
with open(input_file, 'rb') as json_file:
    data = json.load(json_file)

In [21]:
# Report how many records were loaded
n_observations = len(data)
print(f" data has {n_observations} observations")


 data has 257648 observations


### Set random index and run the model

In [None]:
# Pick a random valid position in the dataset
random_index = random.randrange(len(data))

In [None]:
# Show the record at the randomly chosen index (displayed as the cell output)
data[random_index]

In [23]:
# Identifier of the pretrained NER checkpoint that is loaded further down
model_name = 'dslim/bert-large-NER'

In [26]:
# Load the tokenizer and the token-classification model for `model_name`.
# This might take a while (presumably because the checkpoint has to be
# fetched on first use - verify against the local cache setup).
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)


model.safetensors:  84%|########4 | 1.12G/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [36]:
#Setup pipeline
import torch

# Run on the first CUDA GPU when one is available, otherwise fall back to the
# CPU (-1) so the notebook also works on machines without a GPU.
device = 0 if torch.cuda.is_available() else -1
nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=device)


In [37]:
# Take a contiguous sample of `sample_size` entries around the random index,
# or use the whole dataset when sampling is switched off.
full_data = []
if sample_only:
    # Clamp the start so a random index close to the end of the dataset still
    # yields a full-sized sample; the entry at random_index stays included.
    sample_start = min(random_index, max(len(data) - sample_size, 0))
    tdata = data[sample_start:sample_start + sample_size]
else:
    tdata = data


In [38]:
# Number of entries selected for processing (displayed as the cell output)
len(tdata)

5

In [45]:
# Annotate every selected entry: choose the text to analyse, run NER on the
# title and on that text, and collect the enriched entry.
social_markers = ('youtu', 'redd.it', 'reddit', 'twitter', 'finclout')

for index, entry in enumerate(tdata):

    print(f" processing {index+1}/{len(tdata)} => {round(((index+1)/len(tdata)*100),5)}% complete", end='\r', flush=True)
    origin = entry['_source'].lower()

    if any(marker in origin for marker in social_markers):
        # Social posts are analysed as stored; no page is fetched for them.
        _content = entry['_content']
    else:
        # Other sources: prefer the scraped page body and fall back to the
        # stored content when scraping returns nothing or raises.
        try:
            page_text = get_page(entry['_url'])
            _content = page_text if page_text else entry['_content']
        except Exception:
            print(f"***** BS4 failed for URL: **** => {entry['_url']} *****")
            _content = entry['_content']

    # Entities found in the title
    ner_results = nlp(entry['_title'])
    entry["_tEntities"] = ner_results

    # Entities found in the text, plus the text that was actually analysed
    ner_results = nlp(_content)
    entry["_cEntities"] = ner_results
    entry["_expContent"] = _content

    full_data.append(entry)

    # In append mode every entry is persisted right away
    if append_mode:
        write_json(entry, output_file)

   

 processing 5/5 => 100.0% complete

{"entity": "I-PER", "score": "0.98287416", "index": 6, "word": "##asa", "start": 11, "end": 14}, 
{"entity": "I-PER", "score": "0.971494", "index": 7, "word": "##nt", "start": 14, "end": 16}, 
{"entity": "I-PER", "score": "0.9994259", "index": 8, "word": "P", "start": 17, "end": 18}, 
{"entity": "I-PER", "score": "0.9972548", "index": 9, "word": "##rab", "start": 18, "end": 21}, {"entity": "I-PER", "score": "0.99404114", "index": 10, "word": "##hu", "start": 21, "end": 23}, {"entity": "B-PER", "score": "0.8141747", "index": 31, "word": "Ali", "start": 126, "end": 129}

### Now store the dataset in a json file

In [46]:
# Outside append mode nothing has been written yet, so dump all annotated
# entries in one go; values that are not JSON serializable are stored as str().
if not append_mode:
    with open(output_file, mode='w') as out_handle:
        json.dump(full_data, fp=out_handle, default=str)
