Sets up the necessary dependencies and language models for using spaCy in Spanish and Catalan

In [None]:
!pip install -U spacy
!python -m spacy download es_core_news_md
!python -m spacy download ca_core_news_md

In [None]:
!pip install python-crfsuite

In [None]:
import spacy
import json
import pycrfsuite as crfs

Parse the JSON train and test files

In [None]:
def parse_json(file_path):
  """Load a JSON document from disk.

  Args:
    file_path: Path of the JSON file to read.

  Returns:
    The decoded JSON content, or ``None`` when the file does not exist
    (an error message is printed in that case instead of raising).
  """
  try:
    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as json_file:
      parsed_file = json.load(json_file)
  except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    # Previously the function fell through to `return parsed_file` with the
    # name unbound, raising UnboundLocalError.
    return None

  print("JSON data parsed successfully!")
  return parsed_file

In [None]:
# Load both corpora from the working directory; parse_json prints a status
# line for each file.
training_set = parse_json("./train_data.json")
test_set = parse_json("./test_data.json")

JSON data parsed successfully!
JSON data parsed successfully!


In [None]:
# Number of documents in the training corpus.
print(len(training_set))

254


We will start by loading a Spanish and Catalan language model that splits the texts into sentences and tags the words accordingly.

Upon thorough analysis of the data sets, we came across some inconsistencies regarding tagging. For example, in some instances the same word containing a "-" is interpreted as two separate words, and in other instances as one whole word.

One such example would be for the word "ex-fumador".

This inconsistency is creating errors in the parsing stage, so we will avoid the texts that contain such words.

In [None]:
# 0-based positions in `training_set` of the documents that are skipped by the
# preprocessing and annotation-extraction loops (the progress print shows i+1).
skip_list = [16,45,106,116,119,120,163,173,177,179,198,215,216,226,229,230,236,243,248]

In [None]:
text_words_info = []
nlp = spacy.load('es_core_news_md')  # Load the Spanish language model

# Tokens dropped before tagging. The backslash has to be spelled "\\": the
# former "\"," entry was the two-character string '",' rather than a
# backslash, so backslash tokens slipped through the filter.
ignored_tokens = {"*", "(", ",", "?", "!", ":", ";", ".", "&", "\\", "/", "-"}

for i, document in enumerate(training_set):
  if i in skip_list:
    continue
  text = document['data']['text']

  # Parse the text
  doc = nlp(text)
  print("text_number:", i+1)

  # One record per kept token: sentence index and span, surface form,
  # character offsets, lemma, POS, a blank tag placeholder (filled in by the
  # tagging cells) and spaCy's prefix/suffix attributes.
  words_info = []
  for idx, sent in enumerate(doc.sents):
    for token in sent:
      if token.is_space or token.text in ignored_tokens:
        continue
      words_info.append({
        "sent_idx": idx,
        "sent": sent,
        "word": token.text,
        "start": token.idx,
        "end": token.idx + len(token),
        "lemma": token.lemma_,
        "pos": token.pos_,
        "tag": ' ',
        "prefix": token.prefix_,
        "suffix": token.suffix_,
      })
  text_words_info.append(words_info)

Preprocess the test data in a similar fashion.

In [None]:
# 0-based positions in `test_set` of the documents that are skipped by the
# preprocessing and annotation-extraction loops.
skip_list_test = [3,13,21]

In [None]:
text_words_info_test = []
nlp = spacy.load('es_core_news_md')  # Load the Spanish language model

# Tokens dropped before tagging. The backslash has to be spelled "\\": the
# former "\"," entry was the two-character string '",' rather than a
# backslash, so backslash tokens slipped through the filter.
ignored_tokens = {"*", "(", ",", "?", "!", ":", ";", ".", "&", "\\", "/", "-"}

for i, document in enumerate(test_set):
  if i in skip_list_test:
    continue
  text = document['data']['text']

  # Parse the text
  doc = nlp(text)
  print("text_number:", i+1)

  # One record per kept token: sentence index and span, surface form,
  # character offsets, lemma, POS, a blank tag placeholder (filled in by the
  # tagging cells) and spaCy's prefix/suffix attributes.
  words_info = []
  for idx, sent in enumerate(doc.sents):
    for token in sent:
      if token.is_space or token.text in ignored_tokens:
        continue
      words_info.append({
        "sent_idx": idx,
        "sent": sent,
        "word": token.text,
        "start": token.idx,
        "end": token.idx + len(token),
        "lemma": token.lemma_,
        "pos": token.pos_,
        "tag": ' ',
        "prefix": token.prefix_,
        "suffix": token.suffix_,
      })
  text_words_info_test.append(words_info)

We discarded 8% of the train set and 5% of the test set.

In [None]:
# Number of training documents kept after applying the skip list.
print(len(text_words_info))

235


In [None]:
# Number of test documents kept after applying the skip list.
print(len(text_words_info_test))

61


 Extract and sort annotations labeled as "NEG", "NSCO", "UNC", and "USCO" from training set.

In [None]:
# One list per label; each receives, per kept document, that document's
# annotation values for the label ordered by start offset.
text_neg, text_nsco, text_unc, text_usco = [], [], [], []
label_buckets = (
  ("NEG", text_neg),
  ("NSCO", text_nsco),
  ("UNC", text_unc),
  ("USCO", text_usco),
)

for idx, document in enumerate(training_set):
  if idx in skip_list:
    continue
  results = document["predictions"][0]["result"]
  for label, bucket in label_buckets:
    matching = [element['value'] for element in results if label in element["value"]["labels"]]
    matching.sort(key=lambda value: value['start'])
    bucket.append(matching)



 Extract and sort annotations labeled as "NEG", "NSCO", "UNC", and "USCO" from test set.

In [None]:
# One list per label; each receives, per kept document, that document's
# annotation values for the label ordered by start offset.
text_neg_test, text_nsco_test, text_unc_test, text_usco_test = [], [], [], []
label_buckets_test = (
  ("NEG", text_neg_test),
  ("NSCO", text_nsco_test),
  ("UNC", text_unc_test),
  ("USCO", text_usco_test),
)

for idx, document in enumerate(test_set):
  if idx in skip_list_test:
    continue
  results = document["predictions"][0]["result"]
  for label, bucket in label_buckets_test:
    matching = [element['value'] for element in results if label in element["value"]["labels"]]
    matching.sort(key=lambda value: value['start'])
    bucket.append(matching)



Tagging words in a document with negation labels ("B-NEG", "I-NEG", "E-NEG").

For "NEG" - Train set

In [None]:
# Words that are tagged B-NEG directly and consume the next annotation without
# comparing character offsets.
ex_smoker_forms = ['exfumador','exfumadora','ex-fumador','ex-fumadora']

for negs, words_info in zip(text_neg, text_words_info):
  neg_counter = 0
  words_counter = 0

  # Also stop when the words run out, so an annotation that cannot be aligned
  # with any token no longer raises IndexError.
  while neg_counter < len(negs) and words_counter < len(words_info):
    neg = negs[neg_counter]
    word = words_info[words_counter]

    if word['word'] in ex_smoker_forms:
      word['tag'] = 'B-NEG'
      neg_counter += 1
      words_counter += 1
      continue

    # Move to the next word if this one is not the beginning of the negation
    # (offsets are allowed to differ by one character).
    if abs(neg['start'] - word['start']) > 1:
      words_counter += 1
      continue

    word['tag'] = 'B-NEG'
    words_counter += 1

    # The beginning of the negation is also its end: single-word annotation.
    if abs(neg['end'] - word['end']) <= 1:
      neg_counter += 1
      continue

    # The negation has more than one word: tag the inside of it ...
    while words_counter < len(words_info) and abs(neg['end'] - words_info[words_counter]['end']) > 1:
      words_info[words_counter]['tag'] = 'I-NEG' # comment this for ignoring inside tagging
      words_counter += 1

    # ... and the end of it. The counter is left on the end word, exactly as
    # before; the next iteration moves past it.
    if words_counter < len(words_info):
      words_info[words_counter]['tag'] = 'E-NEG'
      neg_counter += 1

  if neg_counter < len(negs):
    print(f"Warning: {len(negs) - neg_counter} NEG annotation(s) could not be aligned with the tokens")





For "NEG" - Test set

In [None]:
# Words that are tagged B-NEG directly and consume the next annotation without
# comparing character offsets.
ex_smoker_forms = ['exfumador','exfumadora','ex-fumador','ex-fumadora']

for negs, words_info in zip(text_neg_test, text_words_info_test):
  neg_counter = 0
  words_counter = 0

  # Also stop when the words run out, so an annotation that cannot be aligned
  # with any token no longer raises IndexError.
  while neg_counter < len(negs) and words_counter < len(words_info):
    neg = negs[neg_counter]
    word = words_info[words_counter]

    if word['word'] in ex_smoker_forms:
      word['tag'] = 'B-NEG'
      neg_counter += 1
      words_counter += 1
      continue

    # Move to the next word if this one is not the beginning of the negation
    # (offsets are allowed to differ by one character).
    if abs(neg['start'] - word['start']) > 1:
      words_counter += 1
      continue

    word['tag'] = 'B-NEG'
    words_counter += 1

    # The beginning of the negation is also its end: single-word annotation.
    if abs(neg['end'] - word['end']) <= 1:
      neg_counter += 1
      continue

    # The negation has more than one word: tag the inside of it ...
    while words_counter < len(words_info) and abs(neg['end'] - words_info[words_counter]['end']) > 1:
      words_info[words_counter]['tag'] = 'I-NEG' # comment this for ignoring inside tagging
      words_counter += 1

    # ... and the end of it. The counter is left on the end word, exactly as
    # before; the next iteration moves past it.
    if words_counter < len(words_info):
      words_info[words_counter]['tag'] = 'E-NEG'
      neg_counter += 1

  if neg_counter < len(negs):
    print(f"Warning: {len(negs) - neg_counter} NEG annotation(s) could not be aligned with the tokens")





Tagging words in a document with negation labels ("B-NSCO", "I-NSCO", "E-NSCO").

For "NSCO" - Train set

In [None]:
# Words that are tagged B-NSCO directly and consume the next annotation
# without comparing character offsets.
ex_smoker_forms = ['exfumador','exfumadora','ex-fumador','ex-fumadora']

for scopes, words_info in zip(text_nsco, text_words_info):
  scope_counter = 0
  words_counter = 0

  # Also stop when the words run out, so an annotation that cannot be aligned
  # with any token no longer raises IndexError.
  while scope_counter < len(scopes) and words_counter < len(words_info):
    scope = scopes[scope_counter]
    word = words_info[words_counter]

    if word['word'] in ex_smoker_forms:
      word['tag'] = 'B-NSCO'
      scope_counter += 1
      words_counter += 1
      continue

    # Move to the next word if this one is not the beginning of the scope
    # (offsets are allowed to differ by one character).
    if abs(scope['start'] - word['start']) > 1:
      words_counter += 1
      continue

    word['tag'] = 'B-NSCO'
    words_counter += 1

    # The beginning of the scope is also its end: single-word annotation.
    if abs(scope['end'] - word['end']) <= 1:
      scope_counter += 1
      continue

    # The scope has more than one word: tag the inside of it ...
    while words_counter < len(words_info) and abs(scope['end'] - words_info[words_counter]['end']) > 1:
      words_info[words_counter]['tag'] = 'I-NSCO' # comment this for ignoring inside tagging
      words_counter += 1

    # ... and the end of it. The counter is left on the end word, exactly as
    # before; the next iteration moves past it.
    if words_counter < len(words_info):
      words_info[words_counter]['tag'] = 'E-NSCO'
      scope_counter += 1

  if scope_counter < len(scopes):
    print(f"Warning: {len(scopes) - scope_counter} NSCO annotation(s) could not be aligned with the tokens")


For "NSCO" - Test Set

In [None]:
# Words that are tagged B-NSCO directly and consume the next annotation
# without comparing character offsets.
ex_smoker_forms = ['exfumador','exfumadora','ex-fumador','ex-fumadora']

for scopes, words_info in zip(text_nsco_test, text_words_info_test):
  scope_counter = 0
  words_counter = 0

  # Also stop when the words run out, so an annotation that cannot be aligned
  # with any token no longer raises IndexError.
  while scope_counter < len(scopes) and words_counter < len(words_info):
    scope = scopes[scope_counter]
    word = words_info[words_counter]

    if word['word'] in ex_smoker_forms:
      word['tag'] = 'B-NSCO'
      scope_counter += 1
      words_counter += 1
      continue

    # Move to the next word if this one is not the beginning of the scope
    # (offsets are allowed to differ by one character).
    if abs(scope['start'] - word['start']) > 1:
      words_counter += 1
      continue

    word['tag'] = 'B-NSCO'
    words_counter += 1

    # The beginning of the scope is also its end: single-word annotation.
    if abs(scope['end'] - word['end']) <= 1:
      scope_counter += 1
      continue

    # The scope has more than one word: tag the inside of it ...
    while words_counter < len(words_info) and abs(scope['end'] - words_info[words_counter]['end']) > 1:
      words_info[words_counter]['tag'] = 'I-NSCO' # comment this for ignoring inside tagging
      words_counter += 1

    # ... and the end of it. The counter is left on the end word, exactly as
    # before; the next iteration moves past it.
    if words_counter < len(words_info):
      words_info[words_counter]['tag'] = 'E-NSCO'
      scope_counter += 1

  if scope_counter < len(scopes):
    print(f"Warning: {len(scopes) - scope_counter} NSCO annotation(s) could not be aligned with the tokens")


Tagging words in a document with uncertainty labels ("B-UNC", "I-UNC", "E-UNC").

For "UNC" - Train Set

In [None]:
for cues, words_info in zip(text_unc, text_words_info):
  cue_counter = 0
  words_counter = 0

  # Also stop when the words run out, so an annotation that cannot be aligned
  # with any token no longer raises IndexError.
  while cue_counter < len(cues) and words_counter < len(words_info):
    cue = cues[cue_counter]
    word = words_info[words_counter]

    # Move to the next word if this one is not the beginning of the cue
    # (offsets are allowed to differ by one character).
    if abs(cue['start'] - word['start']) > 1:
      words_counter += 1
      continue

    word['tag'] = 'B-UNC'
    words_counter += 1

    # The beginning of the cue is also its end: single-word annotation.
    if abs(cue['end'] - word['end']) <= 1:
      cue_counter += 1
      continue

    # The cue has more than one word: tag the inside of it ...
    while words_counter < len(words_info) and abs(cue['end'] - words_info[words_counter]['end']) > 1:
      words_info[words_counter]['tag'] = 'I-UNC' # comment this for ignoring inside tagging
      words_counter += 1

    # ... and the end of it. The counter is left on the end word, exactly as
    # before; the next iteration moves past it.
    if words_counter < len(words_info):
      words_info[words_counter]['tag'] = 'E-UNC'
      cue_counter += 1

  if cue_counter < len(cues):
    print(f"Warning: {len(cues) - cue_counter} UNC annotation(s) could not be aligned with the tokens")

For "UNC" - Test Set

In [None]:
for cues, words_info in zip(text_unc_test, text_words_info_test):
  cue_counter = 0
  words_counter = 0

  # Also stop when the words run out, so an annotation that cannot be aligned
  # with any token no longer raises IndexError.
  while cue_counter < len(cues) and words_counter < len(words_info):
    cue = cues[cue_counter]
    word = words_info[words_counter]

    # Move to the next word if this one is not the beginning of the cue
    # (offsets are allowed to differ by one character).
    if abs(cue['start'] - word['start']) > 1:
      words_counter += 1
      continue

    word['tag'] = 'B-UNC'
    words_counter += 1

    # The beginning of the cue is also its end: single-word annotation.
    if abs(cue['end'] - word['end']) <= 1:
      cue_counter += 1
      continue

    # The cue has more than one word: tag the inside of it ...
    while words_counter < len(words_info) and abs(cue['end'] - words_info[words_counter]['end']) > 1:
      words_info[words_counter]['tag'] = 'I-UNC' # comment this for ignoring inside tagging
      words_counter += 1

    # ... and the end of it. The counter is left on the end word, exactly as
    # before; the next iteration moves past it.
    if words_counter < len(words_info):
      words_info[words_counter]['tag'] = 'E-UNC'
      cue_counter += 1

  if cue_counter < len(cues):
    print(f"Warning: {len(cues) - cue_counter} UNC annotation(s) could not be aligned with the tokens")

Tagging words in a document with uncertainty labels ("B-USCO", "I-USCO", "E-USCO").

For "USCO" - Train Set

In [None]:
for scopes, words_info in zip(text_usco, text_words_info):
  scope_counter = 0
  words_counter = 0

  # Also stop when the words run out, so an annotation that cannot be aligned
  # with any token no longer raises IndexError.
  while scope_counter < len(scopes) and words_counter < len(words_info):
    scope = scopes[scope_counter]
    word = words_info[words_counter]

    # Move to the next word if this one is not the beginning of the scope
    # (offsets are allowed to differ by one character).
    if abs(scope['start'] - word['start']) > 1:
      words_counter += 1
      continue

    word['tag'] = 'B-USCO'
    words_counter += 1

    # The beginning of the scope is also its end: single-word annotation.
    if abs(scope['end'] - word['end']) <= 1:
      scope_counter += 1
      continue

    # The scope has more than one word: tag the inside of it ...
    while words_counter < len(words_info) and abs(scope['end'] - words_info[words_counter]['end']) > 1:
      words_info[words_counter]['tag'] = 'I-USCO' # comment this for ignoring inside tagging
      words_counter += 1

    # ... and the end of it. The counter is left on the end word, exactly as
    # before; the next iteration moves past it.
    if words_counter < len(words_info):
      words_info[words_counter]['tag'] = 'E-USCO'
      scope_counter += 1

  if scope_counter < len(scopes):
    print(f"Warning: {len(scopes) - scope_counter} USCO annotation(s) could not be aligned with the tokens")

For "USCO" - Test Set

In [None]:
for scopes, words_info in zip(text_usco_test, text_words_info_test):
  scope_counter = 0
  words_counter = 0

  # Also stop when the words run out, so an annotation that cannot be aligned
  # with any token no longer raises IndexError.
  while scope_counter < len(scopes) and words_counter < len(words_info):
    scope = scopes[scope_counter]
    word = words_info[words_counter]

    # Move to the next word if this one is not the beginning of the scope
    # (offsets are allowed to differ by one character).
    if abs(scope['start'] - word['start']) > 1:
      words_counter += 1
      continue

    word['tag'] = 'B-USCO'
    words_counter += 1

    # The beginning of the scope is also its end: single-word annotation.
    if abs(scope['end'] - word['end']) <= 1:
      scope_counter += 1
      continue

    # The scope has more than one word: tag the inside of it ...
    while words_counter < len(words_info) and abs(scope['end'] - words_info[words_counter]['end']) > 1:
      words_info[words_counter]['tag'] = 'I-USCO' # comment this for ignoring inside tagging
      words_counter += 1

    # ... and the end of it. The counter is left on the end word, exactly as
    # before; the next iteration moves past it.
    if words_counter < len(words_info):
      words_info[words_counter]['tag'] = 'E-USCO'
      scope_counter += 1

  if scope_counter < len(scopes):
    print(f"Warning: {len(scopes) - scope_counter} USCO annotation(s) could not be aligned with the tokens")

Tag the word outside of any named entity as 'O' - Train set

In [None]:
# Every word still carrying the blank placeholder tag becomes 'O'.
for doc_words in text_words_info:
  untagged = [entry for entry in doc_words if entry['tag'] == ' ']
  for entry in untagged:
    entry['tag'] = 'O'

Tag the word outside of any named entity as 'O' - Test set

In [None]:
# Every word still carrying the blank placeholder tag becomes 'O'.
for doc_words in text_words_info_test:
  untagged = [entry for entry in doc_words if entry['tag'] == ' ']
  for entry in untagged:
    entry['tag'] = 'O'

In [None]:
# Print the word records of the first kept training document, one per line.
for word_info in text_words_info[0]:
  print(word_info)

 Iterates through each document and groups words into sentences, appending each sentence to `text_sentences`. - Train set

In [None]:
text_sentences = []
for document in text_words_info:
  sentences = []
  sentence = []
  # Follow the sentence index actually stored on the words instead of assuming
  # it starts at 1 and grows by exactly one: `sent_idx` comes from
  # enumerate(), which starts at 0, and indices can be skipped when every
  # token of a sentence was filtered out.
  current_idx = None
  for word in document:
    if word['sent_idx'] != current_idx:
      if sentence:
        sentences.append(sentence)
      sentence = []
      current_idx = word['sent_idx']
    sentence.append(word)
  # The last sentence of the document used to be dropped; flush it here.
  if sentence:
    sentences.append(sentence)
  text_sentences.append(sentences)

 Iterates through each document and groups words into sentences, appending each sentence to `text_sentences_test`. - Test set

In [None]:
text_sentences_test = []
for document in text_words_info_test:
  sentences = []
  sentence = []
  # Follow the sentence index actually stored on the words instead of assuming
  # it starts at 1 and grows by exactly one: `sent_idx` comes from
  # enumerate(), which starts at 0, and indices can be skipped when every
  # token of a sentence was filtered out.
  current_idx = None
  for word in document:
    if word['sent_idx'] != current_idx:
      if sentence:
        sentences.append(sentence)
      sentence = []
      current_idx = word['sent_idx']
    sentence.append(word)
  # The last sentence of the document used to be dropped; flush it here.
  if sentence:
    sentences.append(sentence)
  text_sentences_test.append(sentences)

Flattens `text_sentences` into a single list of sentences - Train set

In [None]:
# Compute a list of sentences with every sentence of the text_sentences
sentences = []
for text in text_sentences:
  for sentence in text:
    # Copy each word without the bookkeeping keys instead of deleting them in
    # place: `del` mutated the dicts shared with text_words_info, so running
    # this cell (or the grouping cell) a second time raised KeyError.
    words_info = [
      {key: value for key, value in word_info.items() if key not in ('sent_idx', 'sent')}
      for word_info in sentence
    ]
    sentences.append(words_info)

Flattens `text_sentences_test` into a single list of sentences - Test set

In [None]:
# Compute a list of sentences with every sentence of the text_sentences_test
sentences_test = []
for text in text_sentences_test:
  for sentence in text:
    # Copy each word without the bookkeeping keys instead of deleting them in
    # place: `del` mutated the dicts shared with text_words_info_test, so
    # running this cell (or the grouping cell) a second time raised KeyError.
    words_info = [
      {key: value for key, value in word_info.items() if key not in ('sent_idx', 'sent')}
      for word_info in sentence
    ]
    sentences_test.append(words_info)

Processes a list of sentences to count the occurrences of different tags for each word. - Train set

In [None]:
# Tally every tag once, then read the individual counters out of the tally.
tag_counts = {}
total_words = 0
for sentence in sentences:
  for word in sentence:
    total_words += 1
    tag_counts[word['tag']] = tag_counts.get(word['tag'], 0) + 1

o = tag_counts.get('O', 0)
bneg, ineg, eneg = (tag_counts.get(f'{prefix}-NEG', 0) for prefix in 'BIE')
bnsco, insco, ensco = (tag_counts.get(f'{prefix}-NSCO', 0) for prefix in 'BIE')
bunc, iunc, eunc = (tag_counts.get(f'{prefix}-UNC', 0) for prefix in 'BIE')
busco, iusco, eusco = (tag_counts.get(f'{prefix}-USCO', 0) for prefix in 'BIE')

print(f"Total words: {total_words}")

print("\nB-Tags:")
print(f"  B-NEG:  {bneg}")
print(f"  B-NSCO: {bnsco}")
print(f"  B-UNC:  {bunc}")
print(f"  B-USCO: {busco}")

print("\nI-Tags:")
print(f"  I-NEG:  {ineg}")
print(f"  I-NSCO: {insco}")
print(f"  I-UNC:  {iunc}")
print(f"  I-USCO: {iusco}")

print("\nE-Tags:")
print(f"  E-NEG:  {eneg}")
print(f"  E-NSCO: {ensco}")
print(f"  E-UNC:  {eunc}")
print(f"  E-USCO: {eusco}")

print(f"\nO-Tags: {o}")

Total words: 163485

B-Tags:
  B-NEG:  3877
  B-NSCO: 3713
  B-UNC:  418
  B-USCO: 411

I-Tags:
  I-NEG:  0
  I-NSCO: 5253
  I-UNC:  4
  I-USCO: 879

E-Tags:
  E-NEG:  76
  E-NSCO: 2511
  E-UNC:  198
  E-USCO: 319

O-Tags: 145826


Processes a list of sentences to count the occurrences of different tags for each word. - Test set

In [None]:
# Tally every tag once, then read the individual counters out of the tally.
tag_counts = {}
total_words = 0
for sentence in sentences_test:
  for word in sentence:
    total_words += 1
    tag_counts[word['tag']] = tag_counts.get(word['tag'], 0) + 1

o = tag_counts.get('O', 0)
bneg, ineg, eneg = (tag_counts.get(f'{prefix}-NEG', 0) for prefix in 'BIE')
bnsco, insco, ensco = (tag_counts.get(f'{prefix}-NSCO', 0) for prefix in 'BIE')
bunc, iunc, eunc = (tag_counts.get(f'{prefix}-UNC', 0) for prefix in 'BIE')
busco, iusco, eusco = (tag_counts.get(f'{prefix}-USCO', 0) for prefix in 'BIE')

print(f"Total words: {total_words}")

print("\nB-Tags:")
print(f"  B-NEG:  {bneg}")
print(f"  B-NSCO: {bnsco}")
print(f"  B-UNC:  {bunc}")
print(f"  B-USCO: {busco}")

print("\nI-Tags:")
print(f"  I-NEG:  {ineg}")
print(f"  I-NSCO: {insco}")
print(f"  I-UNC:  {iunc}")
print(f"  I-USCO: {iusco}")

print("\nE-Tags:")
print(f"  E-NEG:  {eneg}")
print(f"  E-NSCO: {ensco}")
print(f"  E-UNC:  {eunc}")
print(f"  E-USCO: {eusco}")

print(f"\nO-Tags: {o}")


Total words: 41993

B-Tags:
  B-NEG:  1019
  B-NSCO: 971
  B-UNC:  116
  B-USCO: 115

I-Tags:
  I-NEG:  0
  I-NSCO: 1304
  I-UNC:  3
  I-USCO: 251

E-Tags:
  E-NEG:  18
  E-NSCO: 643
  E-UNC:  54
  E-USCO: 88

O-Tags: 37411


Split the sentences into those that have at least one word tagged differently from 'O' and those whose words are all tagged 'O' - Train set

In [None]:
# parsed_sentences: sentences containing at least one non-'O' tag.
# O_sentences: sentences whose words are all tagged 'O' (empty ones included).
parsed_sentences = []
O_sentences = []
for sentence in sentences:
  has_entity = any(word['tag'] != 'O' for word in sentence)
  target = parsed_sentences if has_entity else O_sentences
  target.append(sentence)

In [None]:
# Sentences with at least one non-'O' tag, then sentences tagged 'O' only.
print(len(parsed_sentences))
print(len(O_sentences))

3577
7573


Split the sentences into those that have at least one word tagged differently from 'O' and those whose words are all tagged 'O' - Test set

In [None]:
# parsed_sentences_test: sentences containing at least one non-'O' tag.
# O_sentences_test: sentences whose words are all tagged 'O' (empty ones included).
parsed_sentences_test = []
O_sentences_test = []
for sentence in sentences_test:
  has_entity = any(word['tag'] != 'O' for word in sentence)
  target = parsed_sentences_test if has_entity else O_sentences_test
  target.append(sentence)

In [None]:
# Sentences with at least one non-'O' tag, sentences tagged 'O' only, and the
# total number of test sentences.
print(len(parsed_sentences_test))
print(len(O_sentences_test))
print(len(sentences_test))

971
2088
3059


## Start CRF approach

Get CRF features

This function converts a sentence into a list of labels.

In [None]:
def sent2labels(sent):
    """Return the 'tag' value of every word record in `sent`, in order."""
    labels = []
    for token in sent:
        labels.append(token['tag'])
    return labels

Generate feature dictionaries for words in a sentence for use in CRF models

In [None]:

def get_word_to_crf_features(sentence, i):
    """Build the CRF feature dictionary for the word at position `i`.

    Args:
        sentence: List of word records with 'word', 'lemma', 'pos' and
            'suffix' keys.
        i: Index of the word inside `sentence`.

    Returns:
        A dict holding a bias term, the word's own features, the same
        features for up to three preceding and three following words
        (keys prefixed '-j:' / '+j:'), and the 'bos' / 'eos' flags.
    """
    def describe(token, tag=''):
        # Per-token features; `tag` is '' for the current word or an offset
        # prefix such as '-1:' for a neighbour.
        surface = token['word']
        return {
            f'{tag}word.lower()': surface.lower(),
            f'{tag}pos': token['pos'],
            f'{tag}lemma': token['lemma'],
            f'{tag}suffix': token['suffix'],
            f'{tag}word.isdigit()': surface.isdigit(),
        }

    last = len(sentence) - 1
    features = {'bias': 1.0, **describe(sentence[i])}

    # Previous three words first, then the next three, staying inside the
    # sentence boundaries.
    for sign, step in (('-', -1), ('+', 1)):
        for j in range(1, 4):
            neighbour = i + step * j
            if 0 <= neighbour <= last:
                features.update(describe(sentence[neighbour], f'{sign}{j}:'))

    features['bos'] = i == 0
    features['eos'] = i == last
    return features


def get_sent_to_crf_features(sentence):
    """Return one feature dict per word of `sentence`, in sentence order."""
    features = []
    for position in range(len(sentence)):
        features.append(get_word_to_crf_features(sentence, position))
    return features


Choose train_sents

In [None]:
# Train on every training sentence (same list object as `sentences`, not a copy).
train_sents = sentences

Choose test_sents

In [None]:
# Evaluate on every test sentence (same list object as `sentences_test`, not a copy).
test_sents = sentences_test

Compute X_train and y_train

Preparing training data for a CRF model by extracting features and labels from sentences

In [None]:

# Feature sequences and label sequences, one entry per training sentence.
X_train = list(map(get_sent_to_crf_features, train_sents))
y_train = list(map(sent2labels, train_sents))

# Instance a CRF trainer and stack the (features, labels) pairs into it.
trainer_crf = crfs.Trainer(verbose=False)
for seq_idx in range(len(X_train)):
    trainer_crf.append(X_train[seq_idx], y_train[seq_idx])


In [None]:
# Feature dictionary of the first word of the first training sentence.
print(X_train[0][0])

Compute X_test and y_test

In [None]:
# Feature sequences and gold label sequences, one entry per test sentence.
X_test = list(map(get_sent_to_crf_features, test_sents))
y_test = list(map(sent2labels, test_sents))

Set Parameters for CRF

In [None]:
# Training hyper-parameters passed to the CRF trainer.
trainer_crf.set_params({
    'c1': 1.0,   # Coefficient for L1 regularization
    'c2': 1e-3,  # Coefficient for L2 regularization
    'max_iterations': 100,  # Cap on the number of training iterations
    # NOTE(review): presumably also generates transition features for label
    # pairs that never occur in the training data - confirm in the CRFsuite manual.
    'feature.possible_transitions': True
})

Train CRF

In [None]:
trainer_crf.train('npl_ner_crf.crfsuite') # Train the model and save it locally.
tagger_crf = crfs.Tagger()
# open() is the last expression of the cell, so its return value is echoed as
# the cell output.
tagger_crf.open('npl_ner_crf.crfsuite') # Load the inference API

<contextlib.closing at 0x7e4c992cd450>

In [None]:
from sklearn.preprocessing import LabelBinarizer
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix

Generate a classification report for BIO-tagged sequence

In [None]:
from sklearn.preprocessing import LabelBinarizer
def bio_classification_report(y_true, y_pred):
    """Build a per-tag classification report for tagged sequences.

    Args:
        y_true: Iterable of gold tag sequences (one per sentence).
        y_pred: Iterable of predicted tag sequences, aligned with `y_true`.

    Returns:
        The text produced by sklearn's `classification_report`, restricted
        to the tags seen in `y_true` other than 'O' and ordered by entity
        name first, then by prefix.
    """
    binarizer = LabelBinarizer()
    # The binarizer learns its classes from the flattened gold tags only.
    true_matrix = binarizer.fit_transform(list(chain.from_iterable(y_true)))
    pred_matrix = binarizer.transform(list(chain.from_iterable(y_pred)))

    column_of = {}
    for column, cls in enumerate(binarizer.classes_):
        column_of[cls] = column

    def entity_then_prefix(tag):
        # 'B-NEG' -> ['NEG', 'B'], so tags group by entity name.
        return tag.split('-', 1)[::-1]

    reported_tags = sorted(set(binarizer.classes_) - {'O'}, key=entity_then_prefix)
    reported_columns = [column_of[tag] for tag in reported_tags]

    return classification_report(
        true_matrix,
        pred_matrix,
        labels = reported_columns,
        target_names = reported_tags,
    )

Here we generate predictions for the test set using a CRF model.

In [None]:
# Predict one tag sequence per test sentence, then score against the gold tags.
y_pred_crf = []
for sentence_features in X_test:
  y_pred_crf.append(tagger_crf.tag(sentence_features))
report_crf = bio_classification_report(y_test, y_pred_crf)




In [None]:
# Per-tag report (B-/I-/E- variants kept separate).
print(report_crf)

              precision    recall  f1-score   support

       B-NEG       0.97      0.96      0.97      1019
       E-NEG       0.94      0.89      0.91        18
      B-NSCO       0.95      0.92      0.94       971
      E-NSCO       0.86      0.86      0.86       643
      I-NSCO       0.88      0.88      0.88      1304
       B-UNC       0.91      0.71      0.80       116
       E-UNC       0.87      0.72      0.79        54
       I-UNC       0.00      0.00      0.00         3
      B-USCO       0.91      0.72      0.81       115
      E-USCO       0.61      0.49      0.54        88
      I-USCO       0.69      0.64      0.66       251

   micro avg       0.90      0.87      0.89      4582
   macro avg       0.78      0.71      0.74      4582
weighted avg       0.90      0.87      0.89      4582
 samples avg       0.10      0.10      0.10      4582



Calculate metrics for 'NEG', 'NSCO', 'UNC' and 'USCO'  
Preprocess the true and predicted tag sequences by removing the 'B-', 'I-' and 'E-' prefixes

In [None]:
# Strip the two-character position prefix from every non-'O' tag so that only
# the entity name remains; gold and predicted tags are processed pairwise.
prep_y_test = []
prep_y_pred_crf = []
for true_sent, pred_sent in zip(y_test, y_pred_crf):
  tag_pairs = list(zip(true_sent, pred_sent))
  prep_y_test.append([gold if gold == 'O' else gold[2:] for gold, _guess in tag_pairs])
  prep_y_pred_crf.append([guess if guess == 'O' else guess[2:] for _gold, guess in tag_pairs])



In [None]:
# Same report, computed on the tags with their position prefixes removed.
report_crf_2 = bio_classification_report(prep_y_test, prep_y_pred_crf)

In [None]:
# Per-entity report (NEG, NSCO, UNC, USCO).
print(report_crf_2)

              precision    recall  f1-score   support

         NEG       0.97      0.96      0.96      1037
        NSCO       0.92      0.91      0.92      2918
         UNC       0.90      0.70      0.79       173
        USCO       0.79      0.69      0.73       454

   micro avg       0.92      0.89      0.90      4582
   macro avg       0.89      0.81      0.85      4582
weighted avg       0.92      0.89      0.90      4582
 samples avg       0.10      0.10      0.10      4582



Compare results

In [None]:
# Print every test sentence in which at least one predicted tag differs from
# the gold tag, followed by the number of such sentences.
idx = 0
count = 0
for true_sent, pred_sent in zip(y_test, y_pred_crf):
  mismatch = any(gold != guess for gold, guess in zip(true_sent, pred_sent))
  if mismatch:
    count += 1
    print(idx)
    print("True: ",true_sent)
    print("Pred: ",pred_sent)
    print("---------------")
  idx += 1
print(count)

Tagging example True vs Pred

In [None]:

# Show (word, gold tag) against (word, predicted tag) for test sentences 412-415.
for predicted_tags, sentence_words in zip(y_pred_crf[412:416], test_sents[412:416]):
  aligned = list(zip(predicted_tags, sentence_words))
  sent_true = [(word['word'], word['tag']) for _tag, word in aligned]
  sent_pred = [(word['word'], tag) for tag, word in aligned]
  print("True: ",sent_true)
  print("Pred: ",sent_pred)
  print("----------------")

Calculate Precision, Recall and F1  
We calculate the false positives (FP), false negatives (FN), and true positives (TP) for different categories (NEG, UNC, USCO, NSCO)

In [None]:
# Count false positives, false negatives and true positives per category by
# walking the predicted and gold tag sequences in parallel. Results are
# accumulated in `d`.
#
# NOTE(review): the twelve per-category counters below are initialised but
# never updated or read in this cell; only `d` (and the debug counters
# `iddd`, `bla`, `fn`, `tp`) are touched.
neg_fp = 0
neg_fn = 0
neg_tp = 0

nsco_fp = 0
nsco_fn = 0
nsco_tp = 0

unc_fp = 0
unc_fn = 0
unc_tp = 0

usco_fp = 0
usco_fn = 0
usco_tp = 0

# category -> {'FP', 'FN', 'TP'} counters
d = {'NEG' : {'FP' : 0, "FN" : 0, 'TP' : 0}, 'UNC' : {'FP' : 0, "FN" : 0, 'TP' : 0}, 'USCO' : {'FP' : 0, "FN" : 0, 'TP' : 0}, 'NSCO' : {'FP' : 0, "FN" : 0, 'TP' : 0}}

iddd = 0  # number of sentences visited
bla = 0   # number of positions where a predicted B- tag differs from the gold tag
fn = 0    # NSCO false negatives counted in the "prediction is 'O'" branch
tp = 0    # NSCO true positives
for p_sen, gt_sen in zip(y_pred_crf, y_test):
  iddd += 1
  # NOTE(review): `i` is rebound by `for ... in range(...)` at the start of
  # every iteration, so the `i += 1` / `i -= 1` adjustments made inside the
  # body never change which position is visited next. Only the forward scan
  # in the "tags agree" branch actually uses the modified `i`.
  for i in range(len(p_sen)):
    category = ""
    # If the prediction is 'O'
    if p_sen[i] == 'O':
      # Only the beginning of a gold span counts as a miss; gold 'O', 'I-' and
      # 'E-' positions are ignored here.
      if gt_sen[i] == 'O' or gt_sen[i][0] == 'I' or gt_sen[i][0] == 'E':
        continue

      category = gt_sen[i][2:]

      d[category]['FN'] += 1
      if category == "NSCO":
        fn += 1
      i += 1

      # Go to the end of the tag. This makes sure that for a sequence of
      # B-NSCO, I-NSCO, E-NSCO only 1 fn is added
      # NOTE(review): this skip has no effect on the outer loop (see above);
      # the single count comes from the `continue` on 'I'/'E' gold tags.
      while i < len(p_sen) and gt_sen[i][2:] == category:
        i += 1
      i -= 1

    # Gold is 'O' but something was predicted: false positive.
    # NOTE(review): because the skip below does not affect the outer loop,
    # every predicted token over gold 'O' adds one FP, not one per span.
    elif gt_sen[i] == 'O':
      category = p_sen[i][2:]
      d[category]['FP'] += 1
      i += 1
      while i < len(p_sen) and p_sen[i][2:] == category:
        i += 1
      i -= 1

    # Both tags are non-'O' and the prediction starts a span.
    elif p_sen[i][0] == "B":
      p_category = p_sen[i][2:]
      # Different tags at the span start: one FP for the predicted category
      # and one FN for the gold category.
      if gt_sen[i] != p_sen[i]:
        bla += 1
        gt_category = gt_sen[i][2:]
        d[p_category]['FP'] += 1
        d[gt_category]['FN'] += 1
        i += 1
        while i < len(p_sen) and (gt_sen[i][2:] == gt_category or p_sen[i][2:] == p_category):
          i += 1
        i -= 1


      # Same B- tag: scan forward and require both sequences to stay in the
      # category for the same stretch of positions.
      elif gt_sen[i] == p_sen[i]:
        i += 1
        good = True
        while i < len(p_sen):
          # NOTE(review): for the plain tag 'O' the slice [2:] is the empty
          # string, so this test does not match it; the next test covers the
          # case where both sequences have left the category.
          if gt_sen[i][2:] == 'O' and p_sen[i][2:] == 'O':
            i -= 1
            break
          if gt_sen[i][2:] != p_category and p_sen[i][2:] != p_category:
            i -= 1
            break
          if gt_sen[i][2:] == p_category and p_sen[i][2:] == p_category:
            i += 1
            continue

          # Exactly one of the two sequences left the category: the span
          # boundaries disagree, which is counted as one FN.
          good = False
          d[p_category]['FN'] += 1
          if gt_sen[i][2:] != p_category:
            pass
          elif p_sen[i][2:] != p_category:
            pass
          else:
            print(f'{gt_sen[i]=} {p_sen[i]=}')
         # i-=1
          break
        if good:
          if p_category == "NSCO":
            tp += 1
          d[p_category]["TP"] += 1


In [None]:
# Print every counter of every category as "<category>: <metric> <count>".
for tag, metrics in d.items():
  for metric, nr in metrics.items():
    print(f'{tag}: {metric} {nr}')

NEG: FP 27
NEG: FN 43
NEG: TP 974
UNC: FP 10
UNC: FN 37
UNC: TP 82
USCO: FP 63
USCO: FN 61
USCO: TP 55
NSCO: FP 218
NSCO: FN 146
NSCO: TP 823


In [None]:
# Per-category precision, recall and F1 from the FP/FN/TP counters in `d`.
# Each list holds (category, value) pairs in the iteration order of `d`.
precision = []
recall = []
f1 = []

for tag, metrics in d.items():
  predicted_total = metrics['TP'] + metrics['FP']
  gold_total = metrics['TP'] + metrics['FN']

  # A category with no predictions (or no gold spans) used to raise
  # ZeroDivisionError; report 0.0 for it instead.
  p = metrics['TP'] / predicted_total if predicted_total else 0.0
  precision.append((tag, p))
  r = metrics['TP'] / gold_total if gold_total else 0.0
  recall.append((tag, r))
  f1.append((tag, (2 * p * r) / (p + r) if p + r else 0.0))

print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F1: " + str(f1))

Precision: [('NEG', 0.973026973026973), ('UNC', 0.8913043478260869), ('USCO', 0.4661016949152542), ('NSCO', 0.7905859750240154)]
Recall: [('NEG', 0.9577187807276303), ('UNC', 0.6890756302521008), ('USCO', 0.47413793103448276), ('NSCO', 0.849329205366357)]
F1: [('NEG', 0.9653121902874132), ('UNC', 0.7772511848341233), ('USCO', 0.47008547008547), ('NSCO', 0.8189054726368159)]
