training_set = Tot setul de date, este vectorul care contine toate documentele
document = Element din training_set. Fiecare document contine "data", "annotations" si "predictions"


In [1]:
import re
import json


Parse Training and Testing data from JSON

In [2]:
def parse_json(file_path):

  # Step 2: Open the file in read mode
  try:
    with open(file_path, "r") as json_file:
      # Step 3: Load the JSON data using json.load()
      parsed_file = json.load(json_file)
  except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
  else:
    print("JSON data parsed successfully!")
    # Step 4: Access and process the data
    # (See examples below based on data structure)
  return parsed_file

In [3]:
training_set = parse_json("./train_data.json")
testing_set = parse_json("./test_data.json")

JSON data parsed successfully!
JSON data parsed successfully!


In [4]:
print(len(training_set))
print(len(testing_set))

254
64


In [5]:
predictions = [document["predictions"] for document in training_set]
texts = [document["data"]["text"] for document in training_set]
test_texts = [document["data"]["text"] for document in testing_set]

Extract terms given by CUTEXT

In [6]:
def extract_terms_from_file(file_path):
    terms = []
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith("Term:"):
                term = line.split("Term:")[1].strip()
                terms.append(term)
    return terms

In [7]:
def parse_terms(extracted_terms):
  new_terms = []
  for term in extracted_terms:
    if term[0].isalpha() and term[-1].isalpha() and "**" not in term and "(" not in term and ")" not in term and len(term)>3:
      new_terms.append(term)
  return new_terms

In [40]:
file_path = "./terms_raw.txt"
# Extract terms from the file
cutext_terms = extract_terms_from_file(file_path)

In [41]:
print(len(cutext_terms))

21554


In [42]:
cutext_terms = parse_terms(cutext_terms)

Extract NEG, UNC, NSCO and USCO from Training Data

In [43]:
# Gets a list of tuples representing character offsets and returns list of words
def get_words(text, offsets):
  words = []
  for start, end in offsets:
    #words.append(text[start:end-1])
    #words.append(text[start-1:end])
    if text[start-1].isalpha():
      s=start-1
    else:
      s=start
    if text[end-1].isalpha():
      e=end
    else:
      e=end-1
    words.append(text[s:e])
  return words

In [13]:
# Parses a document and returns 4 lists of tuples representing words
def find_cues_and_scopes(document):
  neg_postitions_pairs = [(result_element["value"]["start"], result_element["value"]["end"]) for result_element in document["predictions"][0]["result"] if "NEG" in result_element["value"]["labels"]]
  unc_postitions_pairs = [(result_element["value"]["start"], result_element["value"]["end"]) for result_element in document["predictions"][0]["result"] if "UNC" in result_element["value"]["labels"]]
  nsco_postitions_pairs = [(result_element["value"]["start"], result_element["value"]["end"]) for result_element in document["predictions"][0]["result"] if "NSCO" in result_element["value"]["labels"]]
  usco_postitions_pairs = [(result_element["value"]["start"], result_element["value"]["end"]) for result_element in document["predictions"][0]["result"] if "USCO" in result_element["value"]["labels"]]
  neg_words = get_words(document["data"]["text"], neg_postitions_pairs)
  unc_words = get_words(document["data"]["text"], unc_postitions_pairs)
  nsco_words = get_words(document["data"]["text"], nsco_postitions_pairs)
  usco_words = get_words(document["data"]["text"], usco_postitions_pairs)
  return neg_words, unc_words, nsco_words, usco_words

In [44]:
NEG = set()
UNC = set()
NSCO = set()
USCO = set()

num_nsco = 0
num_usco = 0
for document in training_set:
  neg_words, unc_words, nsco_words, usco_words = find_cues_and_scopes(document)
  nsco_words_set = set(nsco_words)
  usco_words_set = set(usco_words)
  num_nsco += len(nsco_words)
  num_usco += len(usco_words)

  NEG.update(neg_words)
  UNC.update(unc_words)
  NSCO.update(nsco_words)
  USCO.update(usco_words)

# Removing spaces and punctation signs from the start and end of each string
NEG = {word.strip(" ,.!?;)") for word in NEG}
UNC = {word.strip(" ,.!?);") for word in UNC}
NSCO = {word.strip(" ,.!?;)") for word in NSCO}
USCO = {word.strip(" ,.!?);") for word in USCO}

# Remove negation from UNC
for word in NEG:
  if word in UNC:
    UNC.remove(word)

Combine USCO and NSCO in SCOPE_words

In [45]:
ALL_SCOPES = NSCO.union(USCO)

# A set with all individual words from the scopes
SCOPE_words = set()         # ['erc', '(29/05/18)', 'ser', 'visibles', 'extratono', 'inicia', 'valor', 'frialdad', 'medicamentoses', 'neoformativo']
for scope in ALL_SCOPES:
  SCOPE_words.update(scope.split())

print("NEG_UNC words before processing: ", len(SCOPE_words))

# Remove all symbols and numbers from the set
SCOPE_words = {word for word in SCOPE_words if word.isalpha()}
SCOPE_words = list(SCOPE_words)

NEG_UNC words before processing:  3184


Combine SCOPE_words with extracted_terms from CUTEXT

In [46]:
extracted_terms = list(set(cutext_terms+SCOPE_words))

In [47]:
print(len(extracted_terms))


19959


In [48]:
extracted_terms.sort(key=len,reverse=True)

Prepare REGEX

In [49]:
NEG_pattern = "|".join(NEG)
UNC_pattern = "|".join(UNC)
#SCOPE pattern with CUTEXT + NSCO+USCO
SCOPE_pattern = "|".join(extracted_terms)
#SCOPE pattern baselie
SCOPE_pattern_baseline = "|".join(SCOPE_words)

In [254]:
#SCOPE pattern just for CUTEXT
SCOPE_pattern_CUTEXT = "|".join(cutext_terms)

In [28]:

regex_neg_pos=rf"\b({SCOPE_pattern})\b\s\b({NEG_pattern})\b"

regex_unc_pos=rf"\b({SCOPE_pattern})\b\s\b({UNC_pattern})\b"


REGEX Baseline

In [278]:
regex_neg_pre =rf"\b({NEG_pattern})\b\s+((?:\b(?:{SCOPE_pattern_baseline})\b\s*){{0,5}})"
regex_unc_pre=rf"\b({UNC_pattern})\b\s+((?:\b(?:{SCOPE_pattern_baseline})\b\s*){{0,5}})"

REGEX CUTEXT

In [289]:
regex_neg_pre =rf"\b({NEG_pattern})\b\s+((?:\b(?:{SCOPE_pattern_CUTEXT})\b\s*){{0,5}})"
regex_unc_pre=rf"\b({UNC_pattern})\b\s+((?:\b(?:{SCOPE_pattern_CUTEXT})\b\s*){{0,5}})"

REGEX1

In [296]:
regex_neg_pre =rf"\b({NEG_pattern})\b\s+((?:\b(?:{SCOPE_pattern})\b\s*){{0,5}})"
regex_unc_pre=rf"\b({UNC_pattern})\b\s+((?:\b(?:{SCOPE_pattern})\b\s*){{0,5}})"

REGEX2

In [304]:
regex_neg_pre =rf"\b({NEG_pattern})\b\s*((?:\b(?:{SCOPE_pattern})\b\s*){{0,5}})"
regex_unc_pre=rf"\b({UNC_pattern})\b\s*((?:\b(?:{SCOPE_pattern})\b\s*){{0,5}})"

REGEX until the end of the proposition

In [50]:
regex_neg_pre = rf"\b({NEG_pattern})\b\s*(.*?)\."
regex_unc_pre = rf"\b({UNC_pattern})\b\s*(.*?)\."

In [51]:
print(len(SCOPE_pattern))

324031


Make Predictions

In [52]:
predictions = []
for i in range(len(test_texts)):
  dict = {"NEG":set(),"NSCO":set(),"UNC":set(),"USCO":set()}

  predictions.append(dict)


for id, test_text in enumerate(test_texts):
  neg_scopes_pre_matches = re.finditer(regex_neg_pre, test_text)
  neg_scopes_pos_matches = re.finditer(regex_neg_pos, test_text)
  unc_scopes_pre_matches = re.finditer(regex_unc_pre, test_text)
  unc_scopes_pos_matches = re.finditer(regex_unc_pos, test_text)

  if neg_scopes_pre_matches:
    for match in neg_scopes_pre_matches:
        #print("Whole match:", match.group(0))
        # Get the matched word and its starting/ending positions
        matched_word = match.group(1)
        start_pos = match.start(1)
        end_pos = match.end(1)+1
        #print(f"Found '{matched_word}' at positions ({start_pos}, {end_pos})")

        predictions[id]["NEG"].add((start_pos,end_pos,matched_word))

        # # Get the scope word
        scope_word = match.group(2)
        sc_start_pos = end_pos
        sc_end_pos = match.end(2)+1

        predictions[id]["NSCO"].add((sc_start_pos,sc_end_pos,scope_word))

        #print(f"Found scope '{scope_word}' at positions ({sc_start_pos}, {sc_end_pos})")
  '''
  if neg_scopes_pos_matches:
    for match in neg_scopes_pos_matches:
        #print("Whole match:", match.group(0))
        # Get the matched word and its starting/ending positions
        scope_word = match.group(1)
        sc_start_pos = match.start()
        sc_end_pos = match.end(1)+1


       #print(f"Found '{scope_word}' at positions ({sc_start_pos}, {sc_end_pos})")
        # # Get the scope word
        matched_word = match.group(2)
        start_pos = sc_end_pos
        end_pos = match.end(2)+1


        predictions[id]["NEG"].add((start_pos,end_pos,matched_word))
        predictions[id]["NSCO"].add((sc_start_pos,sc_end_pos,scope_word))

        #print(f"Found scope '{match_word}' at positions ({start_pos}, {end_pos})")
  '''
  if unc_scopes_pre_matches:
    for match in unc_scopes_pre_matches:
        #print("Whole match:", match.group(0))
        # Get the matched word and its starting/ending positions
        matched_word = match.group(1)
        start_pos = match.start()
        end_pos = match.end(1)+1
        #print(f"Found '{matched_word}' at positions ({start_pos}, {end_pos})")

        predictions[id]["UNC"].add((start_pos,end_pos,matched_word))


        # # Get the scope word
        scope_word = match.group(2)
        sc_start_pos = end_pos
        sc_end_pos = match.end(2)+1
        #print(f"Found scope '{scope_word}' at positions ({sc_start_pos}, {sc_end_pos})")

        predictions[id]["USCO"].add((sc_start_pos,sc_end_pos,scope_word))
    '''
    if unc_scopes_pos_matches:
      for match in unc_scopes_pos_matches:
          #print("Whole match:", match.group(0))
          # Get the matched word and its starting/ending positions
          scope_word = match.group(1)
          sc_start_pos = match.start()
          sc_end_pos = match.end(1)+1
          #print(f"Found '{scope_word}' at positions ({sc_start_pos}, {sc_end_pos})")
          # # Get the scope word
          matched_word = match.group(2)
          start_pos = sc_end_pos
          end_pos = match.end(2)+1
          #print(f"Found scope '{matched_word}' at positions ({start_pos}, {end_pos})")

          predictions[id]["UNC"].add((start_pos,end_pos,matched_word))
          predictions[id]["USCO"].add((sc_start_pos,sc_end_pos,scope_word))
    '''

Sort the text predictions by starting point

In [53]:
for dict in predictions:
    for key,value in dict.items():

      sorted_value=sorted(list(value), key=lambda x: x[0])
      dict[key] = sorted_value

In [54]:
print(predictions[0])

{'NEG': [(395, 398, 'no'), (1111, 1120, 'negativo'), (1141, 1144, 'no'), (1313, 1322, 'negativo'), (2118, 2122, 'sin')], 'NSCO': [(398, 563, 'alergias medicamentosas conocidas antcededentes medico-quirurgicos: protesis mamaria, adenoidectomia niega habitos toxicos medicacio habitual anafranil25 mg/ diario'), (1120, 1120, ''), (1144, 1204, 'inmune, toxoplasma no immune, lues vih, vhb y vhc negativos'), (1322, 1346, '- eco 1º t: crl:60 tn:1'), (2122, 2134, 'incidencias')], 'UNC': [(3460, 3466, 'puede')], 'USCO': [(3466, 3537, 'alternarse cada 4 horas con 1 comprimido de ibuprofeno 600mg si dolor)')]}


Get ground thruth from testing_set

In [55]:
def get_gt_format(document):
    neg_predictions, unc_predictions, nsco_predictions, usco_predictions = [], [], [], []
    text = document["data"]["text"]
    for result_element in document["predictions"][0]["result"]:
        start = result_element["value"]["start"]
        end = result_element["value"]["end"]
        if "NEG" in result_element["value"]["labels"]:
            neg_predictions.append((start, end, text[start:end]))
        if "UNC" in result_element["value"]["labels"]:
            unc_predictions.append((start, end, text[start:end]))
        if "NSCO" in result_element["value"]["labels"]:
            nsco_predictions.append((start, end, text[start:end]))
        if "USCO" in result_element["value"]["labels"]:
            usco_predictions.append((start, end, text[start:end]))

    return neg_predictions, unc_predictions, nsco_predictions, usco_predictions

In [35]:
# FORMAT : (NEG, START, END, WORD)
def get_ground_truth(document):
    neg_results, unc_results, nsco_results, usco_results = get_gt_format(document)

    neg_results_sorted = sorted(neg_results, key=lambda x: x[0])
    unc_results_sorted = sorted(unc_results, key=lambda x: x[0])
    nsco_results_sorted = sorted(nsco_results, key=lambda x: x[0])
    usco_results_sorted = sorted(usco_results, key=lambda x: x[0])

    ground_truth_dict = {"NEG": neg_results_sorted, "UNC": unc_results_sorted, "NSCO": nsco_results_sorted, "USCO": usco_results_sorted}

    return ground_truth_dict



get_ground_truth(testing_set[0])

{'NEG': [(395, 398, 'no '),
  (499, 505, 'niega '),
  (1111, 1119, 'negativo'),
  (1141, 1144, 'no '),
  (1163, 1166, 'no '),
  (1194, 1203, 'negativos'),
  (2118, 2122, 'sin ')],
 'UNC': [],
 'NSCO': [(398, 422, 'alergias medicamentosas '),
  (505, 521, 'habitos toxicos '),
  (1107, 1111, 'vih '),
  (1144, 1150, 'inmune'),
  (1166, 1172, 'immune'),
  (1174, 1194, 'lues vih, vhb y vhc '),
  (2122, 2133, 'incidencias')],
 'USCO': []}

In [36]:
# List of dictionaries of GT docuemnts in the test set
ground_truths = [get_ground_truth(document) for document in testing_set]


Calculate Metrics

In [37]:
def calculate_metrics(predictions,ground_truths):
  precision = {"NEG":0,"NSCO":0,"UNC":0,"USCO":0}
  recall = {"NEG":0,"NSCO":0,"UNC":0,"USCO":0}
  f1 = {"NEG":0,"NSCO":0,"UNC":0,"USCO":0}
  tp = {"NEG":0,"NSCO":0,"UNC":0,"USCO":0}
  num_of_predictions = {"NEG":0,"NSCO":0,"UNC":0,"USCO":0}
  num_of_ground_truths = {"NEG":0,"NSCO":0,"UNC":0,"USCO":0}
  for d1,d2 in zip(predictions,ground_truths):

    #print(d1["UNC"])
    #print(d2["UNC"])
    for key in d1:
      #print(key)
      for elem in d1[key]:
        for elem2 in d2[key]:
          if abs(elem[0]-elem2[0]) <= 1 and abs(elem[1]-elem2[1]) <=1:
            tp[key]+=1
            break

      num_of_predictions[key]+=len(d1[key])
      num_of_ground_truths[key]+=len(d2[key])

  for key in precision:
    precision[key] = tp[key]/num_of_predictions[key]
    recall[key] = tp[key]/num_of_ground_truths[key]
    f1[key] = 2*precision[key]*recall[key]/(precision[key]+recall[key])


  return precision, recall, f1


In [56]:
precision, recall, f1 = calculate_metrics(predictions,ground_truths)

Baseline


In [282]:
print(precision)
print(recall)
print(f1)

{'NEG': 0.924812030075188, 'NSCO': 0.5366541353383458, 'UNC': 0.5963855421686747, 'USCO': 0.15060240963855423}
{'NEG': 0.8692579505300353, 'NSCO': 0.5316573556797021, 'UNC': 0.7557251908396947, 'USCO': 0.1937984496124031}
{'NEG': 0.8961748633879781, 'NSCO': 0.5341440598690365, 'UNC': 0.6666666666666667, 'USCO': 0.16949152542372883}


CUTEXT




In [293]:
print(precision)
print(recall)
print(f1)

{'NEG': 0.9217148182665424, 'NSCO': 0.17054986020503263, 'UNC': 0.5857988165680473, 'USCO': 0.08284023668639054}
{'NEG': 0.8736749116607774, 'NSCO': 0.17039106145251395, 'UNC': 0.7557251908396947, 'USCO': 0.10852713178294573}
{'NEG': 0.8970521541950113, 'NSCO': 0.17047042384722869, 'UNC': 0.66, 'USCO': 0.09395973154362416}


CUTEXT + Scope_words REGEX 1

In [301]:
print(precision)
print(recall)
print(f1)

{'NEG': 0.924314096499527, 'NSCO': 0.5799432355723746, 'UNC': 0.5963855421686747, 'USCO': 0.22289156626506024}
{'NEG': 0.8630742049469965, 'NSCO': 0.5707635009310987, 'UNC': 0.7557251908396947, 'USCO': 0.2868217054263566}
{'NEG': 0.8926450433988123, 'NSCO': 0.5753167526982637, 'UNC': 0.6666666666666667, 'USCO': 0.2508474576271187}


CUTEXT + Scope_words REGEX 2

In [309]:
print(precision)
print(recall)
print(f1)

{'NEG': 0.9249578414839797, 'NSCO': 0.5168634064080945, 'UNC': 0.5706214689265536, 'USCO': 0.20903954802259886}
{'NEG': 0.9690812720848057, 'NSCO': 0.5707635009310987, 'UNC': 0.7709923664122137, 'USCO': 0.2868217054263566}
{'NEG': 0.9465056082830027, 'NSCO': 0.5424778761061947, 'UNC': 0.6558441558441559, 'USCO': 0.2418300653594771}


Until the end of the proposition


In [57]:
print(precision)
print(recall)
print(f1)

{'NEG': 0.9134615384615384, 'NSCO': 0.55, 'UNC': 0.6329113924050633, 'USCO': 0.3227848101265823}
{'NEG': 0.8392226148409894, 'NSCO': 0.5325884543761639, 'UNC': 0.7633587786259542, 'USCO': 0.3953488372093023}
{'NEG': 0.874769797421731, 'NSCO': 0.5411542100283822, 'UNC': 0.6920415224913494, 'USCO': 0.3554006968641115}
