In [None]:
import torch

# Choose the device that later cells should run on.
if not torch.cuda.is_available():
    # PyTorch sees no CUDA device: stay on the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # A CUDA device is visible: target it and report what was found.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 7.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 63.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 45.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K  

In [None]:
from transformers import pipeline, set_seed

class llmodel :
  """Common interface for the language-model wrappers defined in this cell."""

  def compute(self, prompt) :
    """Produce a completion for `prompt`; this base version does nothing and yields None."""
    return None

class gpt2model(llmodel):
  """Wrapper around the Hugging Face 'gpt2' text-generation pipeline."""

  def __init__(self, max_new_tokens) :
    """Create the pipeline and seed the generator.

    max_new_tokens: forwarded to the pipeline on every `compute` call.
    """
    self.max_new_tokens = max_new_tokens
    self.generator = pipeline('text-generation', model='gpt2')
    set_seed(42)

  def compute(self, prompt) :
    """Return the text generated after `prompt` ("" when the pipeline returns nothing)."""
    candidates = self.generator(prompt, max_new_tokens = self.max_new_tokens, num_return_sequences=1)
    if not candidates :
      return ""
    text = candidates[0]['generated_text']
    # The generated text may echo the prompt; keep only the continuation in that case.
    return text[len(prompt):] if text.startswith(prompt) else text

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

class tokenizedmodels(llmodel):
  """Wrapper for Hugging Face seq2seq checkpoints driven through a tokenizer/model pair."""

  def __init__(self, modelname) :
    """Load the tokenizer and the seq2seq model registered under `modelname`."""
    self.tokenizer = AutoTokenizer.from_pretrained(modelname)
    self.model = AutoModelForSeq2SeqLM.from_pretrained(modelname)

  def compute(self, prompt) :
    """Generate a completion for `prompt` and return it as plain text.

    Special tokens are dropped while decoding: markers such as `<pad>` or `</s>`
    contain the same `<` / `>` characters the tag parsers in this notebook use as
    delimiters, so leaving them in would corrupt the parsed tags.
    """
    inputs = self.tokenizer.encode(prompt, return_tensors="pt")
    outputs = self.model.generate(inputs)
    return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

class t0model(tokenizedmodels):
  """`tokenizedmodels` preconfigured with the "bigscience/T0pp" checkpoint."""

  def __init__(self) :
    super().__init__(modelname="bigscience/T0pp")

class inboxbartmodel(tokenizedmodels):
  """`tokenizedmodels` preconfigured with the "cogint/in-boxbart" checkpoint."""

  def __init__(self) :
    super().__init__(modelname="cogint/in-boxbart")

import requests

class gpt3model(llmodel):
  """Client for the OpenAI completions REST endpoint.

  The API key is never stored in the notebook: it is taken from the `api_key`
  argument or, when that is omitted, from the OPENAI_API_KEY environment variable.
  (A key that was ever committed in source should be treated as leaked and revoked.)
  """

  def __init__(self, max_tokens = 1024, temperature = 0.7, model_name = 'text-davinci-002', api_key = None, timeout = 120) :
    """Store the request settings.

    max_tokens, temperature, model_name: sent verbatim in every request body.
    api_key: bearer token; defaults to the OPENAI_API_KEY environment variable.
    timeout: seconds to wait for the HTTP response before requests raises.

    Raises ValueError when no API key can be found.
    """
    import os  # local import: keeps this cell independent of other cells' imports

    if api_key is None :
      api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key :
      raise ValueError("No OpenAI API key: pass api_key=... or set the OPENAI_API_KEY environment variable.")
    self.headers = {'Authorization': 'Bearer ' + api_key}
    self.model_name = model_name
    self.temperature = temperature
    self.max_tokens = max_tokens
    self.timeout = timeout

  def compute(self, prompt) :
    """Return the first completion for `prompt`, or "" when the call does not succeed."""
    data = {
      'model': self.model_name,
      'prompt': prompt,
      'temperature': self.temperature,
      'max_tokens': self.max_tokens
    }
    # Without a timeout a stalled connection would block the notebook indefinitely.
    res = requests.post('https://api.openai.com/v1/completions', headers=self.headers, json=data, timeout=self.timeout)
    if res.status_code != 200 :
      return ""
    # Best effort, like the non-200 branch: a body without choices yields "".
    choices = res.json().get("choices") or []
    if not choices :
      return ""
    return choices[0].get("text", "")



In [None]:
from transformers import pipeline, set_seed
# Smoke test of the raw GPT-2 pipeline: five seeded continuations of one prompt.
generator = pipeline(task='text-generation', model='gpt2')
set_seed(42)
generator("Hello, I'm a language model,", num_return_sequences=5, max_length=30)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model, I'm writing a new language for you. But first, I'd like to tell you about the language itself"},
 {'generated_text': "Hello, I'm a language model, and I'm trying to be as expressive as possible. In order to be expressive, it is necessary to know"},
 {'generated_text': "Hello, I'm a language model, so I don't get much of a license anymore, but I'm probably more familiar with other languages on that"},
 {'generated_text': "Hello, I'm a language model, a functional model... It's not me, it's me!\n\nI won't bore you with how"},
 {'generated_text': "Hello, I'm a language model, not an object model.\n\nIn a nutshell, I need to give language model a set of properties that"}]

In [None]:
# Ask each wrapper the same question; uncomment the model you want to try.

# gpt2model(30).compute("What is the right response to the question, 'How are you?' ?")

# t0model().compute("What is the right response to the question, 'How are you?' ?")

# inboxbartmodel().compute("What is the right response to the question, 'How are you?' ?")

gpt3model(max_tokens=128).compute(prompt="What is the right response to the question, 'How are you?' ?")

"\n\nThe right response to the question, 'How are you?' is, 'I'm doing well, thank you. And you?'"

In [None]:
import json

# deid_2006_file = open("/content/drive/MyDrive/CSE576project2/de-identification_2006_text_generation.json", "r")
# deid_2006_data = json.load(deid_2006_file)
# deid_2006_file.close()

# f = open("/content/drive/MyDrive/NLP Project/CRAFT_Dataset/NER_CRAFT_text_generation.json")
# data = json.load(f)

# # print(deid_2006_data)
# print(data)

In [None]:
# Load the CRAFT NER dataset. The variable keeps the name `deid_2006_data` because
# later cells refer to it; the `with` block closes the file once it has been parsed.
with open("/content/drive/MyDrive/NLP Project/CRAFT_Dataset/NER_CRAFT_text_generation.json") as f:
  deid_2006_data = json.load(f)
# deid_2006_data

In [None]:
# Peek at the first element of the first instance's "output" field.
deid_2006_data["instances"][0]["output"][0]

['genes <SO>, chicken <Taxon>, Sox9 <GGP>, Bmpr1b <GGP>, L - Sox5 <GGP>, Sox6 <GGP>, matrix structural components <GO>, Col2a1 <GGP>, Agg <GGP>']

In [None]:
# Show the tagged output string of the first positive example.
deid_2006_data["Positive Examples"][0]["output"]

'beta - catenin <GGP>, Lef1 <GGP>, Dkk1 <GGP>, deletion <SO>, beta - catenin <GGP>, mice <Taxon>, mice <NCBITaxon>'

In [None]:
# NOTE: parse_tags_from_string is defined in a later cell of this notebook;
# on a fresh kernel that cell has to be run before this one.
parse_tags_from_string(deid_2006_data["Positive Examples"][0]["output"])

{'GGP': {'beta - catenin': 2, 'Lef1': 1, 'Dkk1': 1},
 'SO': {'deletion': 1},
 'Taxon': {'mice': 1},
 'NCBITaxon': {'mice': 1}}

In [None]:
'''
Parsed tags are stored as a nested dict:
tags[tag_type][tag_str] = occurrence_count

Example:
tags = {
  'DOCTOR' : {
    'Mihir' : 3,
    'Neeraj' : 1
  },
  'PATIENT' : {
    'Varun' : 2,
    'Abhilash' : 2,
    'Arjun' : 3
  }
}
'''

def get_count_of_tags(tags) :
  """Return the sum of every occurrence count stored in the nested `tags` dict."""
  return sum(
    occurence_count
    for tag_strs_map in tags.values()
    for occurence_count in tag_strs_map.values()
  )

def add_tag(tags, tag_type, tag_str, occurence_count = 1) :
  """Add `occurence_count` to tags[tag_type][tag_str] in place, creating entries as needed.

  A count of 0 is a no-op: no empty entries are created for it.
  """
  if occurence_count == 0 :
    return
  tag_strs_map = tags.setdefault(tag_type, {})
  tag_strs_map[tag_str] = tag_strs_map.get(tag_str, 0) + occurence_count

def parse_tags_from_string(tags_str) :
  """Parse a string such as "entity1 <TYPE1>, entity2 <TYPE2>" into a tags dict.

  Returns {tag_type: {tag_str: occurrence_count}}. Chunks that do not contain
  exactly one "<", or whose entity or type is empty, are ignored.
  """
  res = {}
  for chunk in tags_str.split(">") :
    pieces = chunk.split("<")
    if len(pieces) != 2 :
      continue
    # Drop the separator left over from the previous tag. Stripping whitespace first
    # makes "<A> , b <B>" yield "b" instead of ", b".
    tag_str = pieces[0].strip().lstrip(",").strip()
    tag_type = pieces[1].strip()
    if not tag_str or not tag_type :
      continue
    # Same bookkeeping as add_tag(res, tag_type, tag_str) with a count of 1.
    tag_strs_map = res.setdefault(tag_type, {})
    tag_strs_map[tag_str] = tag_strs_map.get(tag_str, 0) + 1
  return res

def parse_tags_from_tags_list(tags_list) :
  """Build a tags dict from a list of "entity <TYPE>" strings.

  Items that do not contain exactly one "<", or whose entity or type is empty,
  are skipped. A single trailing ">" on the type is removed.
  """
  res = {}
  for raw_tag in tags_list :
    entity, opener, label = raw_tag.strip().partition("<")
    # Exactly one "<" is required: none, or a second one, disqualifies the item.
    if not opener or "<" in label :
      continue
    if label.endswith(">") :
      label = label[:-1]
    entity, label = entity.strip(), label.strip()
    if entity and label :
      add_tag(res, label, entity)
  return res

def union_of_tags(tags_a, tags_b) :
  """Return a new tags dict whose counts are the sums of the counts in both inputs.

  Neither input is modified; entries with a count of 0 are left out.
  """
  merged = {}
  for source in (tags_a, tags_b) :
    for tag_type, tag_strs_map in source.items() :
      for tag_str, occurence_count in tag_strs_map.items() :
        if occurence_count != 0 :
          add_tag(merged, tag_type, tag_str, occurence_count)
  return merged

def intersection_of_tags(tags_a, tags_b) :
  """Return the tags present in both inputs, each with the smaller of its two counts."""
  common = {}
  for tag_type, counts_a in tags_a.items() :
    if tag_type in tags_b :
      counts_b = tags_b[tag_type]
      for tag_str, count_a in counts_a.items() :
        if tag_str in counts_b :
          add_tag(common, tag_type, tag_str, min(count_a, counts_b[tag_str]))
  return common

def check_equals(tags1, tags2) :
  """Return True when both tags dicts have the same tag types and the same counts per type."""
  if set(tags1.keys()) != set(tags2.keys()) :
    return False
  return all(tags1[tag_type] == tags2[tag_type] for tag_type in tags1)

def generate_accuracy_results(predicted_tags, expected_tags) :
  """Compare predicted and expected tags and return the two kinds of mismatches.

  Returns a pair of tags dicts:
    1. false positives: predicted occurrences that are not expected;
    2. expected occurrences that were not predicted. Callers in this notebook call
       this value "true_negatives", although these are missed (false-negative) tags.
  """
  def surplus(source, reference) :
    # Occurrences in `source` beyond what `reference` holds for the same type and string.
    extra = {}
    for tag_type, tag_strs_map in source.items() :
      type_missing = tag_type not in reference
      for tag_str, occurence_count in tag_strs_map.items() :
        if type_missing or tag_str not in reference[tag_type] :
          add_tag(extra, tag_type, tag_str, occurence_count)
        elif occurence_count > reference[tag_type][tag_str] :
          add_tag(extra, tag_type, tag_str, occurence_count - reference[tag_type][tag_str])
    return extra

  false_positives = surplus(predicted_tags, expected_tags)
  missed_expected = surplus(expected_tags, predicted_tags)
  return false_positives, missed_expected

# Test cases for the above functions: parse the first positive example's output
# string and print the resulting {tag_type: {tag_str: count}} dict.

t1 = parse_tags_from_string(deid_2006_data["Positive Examples"][0]["output"])
# t2 = parse_tags_from_tags_list(deid_2006_data["instances"][0]["output"])

print(t1)
# print(t2)

# t3 = union_of_tags({}, t2)

# print(check_equals(t1, t2))
# print(check_equals(t3, t2))

# t4 = union_of_tags(t1, t2)

# t5 = union_of_tags(t2, t1)

# print(check_equals(t4, t5))

# predicted_tags = {"DOCTOR" : {"Roopa" : 1, "Kautilya" : 2}, "HOSPITAL" : {"Mamta" : 2, "Aradhana" : 3}, "AGE" : {"95" : 1} }
# expected_tags = {"DOCTOR" : {"Kautilya" : 2}, "PATIENT" : {"Rohan" : 1}, "HOSPITAL" : {"Visveswara" : 1, "Aradhana" : 2}, "AGE" : {"95" : 3} }

# false_positives, true_negatives = generate_accuracy_results(predicted_tags, expected_tags)
# print(get_count_of_tags(false_positives))
# print(false_positives)
# print(get_count_of_tags(true_negatives))
# print(true_negatives)

# print(intersection_of_tags(predicted_tags, expected_tags))

{'GGP': {'beta - catenin': 2, 'Lef1': 1, 'Dkk1': 1}, 'SO': {'deletion': 1}, 'Taxon': {'mice': 1}, 'NCBITaxon': {'mice': 1}}


In [None]:
def input_to_input_sequences(input, word_length) :
  """Split `input` into consecutive chunks of `word_length` words each.

  The text is stripped first; whitespace between words is kept and stays attached
  to the end of the chunk it follows. Concatenating the chunks gives back the
  stripped text. Returns [] for empty or whitespace-only input; a `word_length`
  of 0 never triggers a split, so the whole text comes back as one chunk.
  """
  text = input.strip()
  if not text :
    return []
  chunks = [[]]
  word_starts = 0
  for position, char in enumerate(text) :
    # A new word begins where a non-space character follows a whitespace character.
    if position > 0 and text[position - 1].isspace() and not char.isspace() :
      word_starts += 1
      if word_starts == word_length :
        word_starts = 0
        chunks.append([])
    chunks[-1].append(char)
  return ["".join(chunk) for chunk in chunks]

# Test case for the above function: split a sentence into 3-word chunks
# (surrounding whitespace is stripped, inner whitespace stays in the chunks).
input_to_input_sequences(" How are you?  Mr. Rohan.   Are  you fine and alright?  Ms. Padukone. This is a  very precious quirk  to possess Mr. Rohan.  ", 3)
    

['How are you?  ',
 'Mr. Rohan.   Are  ',
 'you fine and ',
 'alright?  Ms. Padukone. ',
 'This is a  ',
 'very precious quirk  ',
 'to possess Mr. ',
 'Rohan.']

In [None]:
# these are prompts for identifying individual tag types

# common prompt prefix

# Task description placed at the very start of every prompt.
prompt_definition_prefix = (
    "In this task, you are given a small paragraph, your task is to identify all the "
    "named entities from the given input and also provide one of the following type "
    "for each entities: (1) CHEBI, (2) CL, (3) GGP, (4) GO, (5) NCBITaxon, (6) SO, "
    "and (7) Taxon. Generate the output in this format: "
    "entity1 <type_of_entity1>, entity2 <type_of_entity2>."
)

# building prompt for deid-2006

# tag_definitions_2006 = {
#     'AGE':       "AGE: Patient whose age is above 90.",
#     'DATE' :     "DATE: Include all elements of the date except year.",
#     'DOCTOR' :   "DOCTOR: Refers to medical doctors and other practitioners mentioned in the records, it excludes the titles, such as Dr. and MD.",
#     'HOSPITAL' : "HOSPITAL: Marks the names of medical organizations and of nursing homes where patients are treated and may also reside. It includes room numbers of patients and buildings and floors related to doctors\u2019 affiliations.",
#     'ID' :       "ID: Refers to any combination of numbers, letters, and special characters identifying medical records, patients, doctors, or hospitals.",
#     'LOCATION' : "LOCATION: Includes geographic locations such as cities, states, street names, zip codes, building names, and numbers.",
#     'PATIENT' :  "PATIENT: Includes the first and last names of patients, their health proxies, and family members. It excludes titles, such as Mr. and Mrs.",
#     'PHONE' :    "PHONE: Includes telephone, pager, and fax numbers.",
# }


# One-line gloss per entity type; one prompt is built for every key of this dict.
# The 'SO' and 'Taxon' glosses used to be shifted by one ("SO: Protein Ontology",
# "Taxon: Sequence Ontology"). SO is the Sequence Ontology.
# NOTE(review): the 'Taxon' wording is inferred from the dataset examples in this
# notebook ("mice <Taxon>", "chicken <Taxon>") -- confirm against the dataset docs.
tag_definitions_2006 = {
    'CHEBI':      "CHEBI: Chemical Entities of Biological Interest",
    'CL' :        "CL: Cell Ontology",
    'GGP' :       "GGP: Entrez Gene",
    'GO' :        "GO: Gene Ontology (biological process, cellular component, and molecular function)",
    'NCBITaxon' : "NCBITaxon: NCBI Taxonomy",
    'SO' :        "SO: Sequence Ontology",
    'Taxon' :     "Taxon: Organism or taxonomic group"
}

# Few-shot example shown in every prompt: one record plus, per tag type, the answer
# expected for that record.
example_input_2006 = "Conversely , down - regulation of beta - catenin signaling ( through Lef1 knock - out , ectopic expression of Wnt inhibitor Dkk1 or conditional deletion of beta - catenin in epidermis ) results in loss of vibrissae and some pelage follicles in mice [ 28 - 30 ]."

# The answers follow the "entity <type>" format that the prompt asks for and that
# parse_tags_from_string understands. They mirror the dataset's gold output for this
# record ("beta - catenin" is tagged twice). An empty string means the record has no
# entity of that type.
example_output_2006 = {
    'CHEBI':      "",
    'CL' :        "",
    'GGP' :       "beta - catenin <GGP>, Lef1 <GGP>, Dkk1 <GGP>, beta - catenin <GGP>",
    'GO' :        "",
    'NCBITaxon' : "mice <NCBITaxon>",
    'SO' :        "deletion <SO>",
    'Taxon' :     "mice <Taxon>"
}

# building prompt for deid-2014

# tag_definitions_2014 = {
#     'AGE':         "AGE: Patient whose age is above 90.",
#     'Contact':     "Contact: Contact is the main category for the PHI tags related to contact information which contains PHONE, FAX, EMAIL, and URL tags as PHI tags (sub-category).",
#     'DATE' :       "DATE: Include all elements of the date.",
#     'Id':          "ID: Refers to any combination of numbers, letters, and special characters identifying medical records, patients, doctors, or hospitals. ID contains MEDICALRECORD, SSN, ACCOUNT, LICENSE, DEVICE, IDNUM, BIOID, HEALTHPLAN, and VEHICLE as PHI tags (sub-category).",
#     'Location':    "Location: Includes geographic locations such as cities, states, street names, zip codes, building names, and numbers. The location contains HOSPITAL, COUNTRY, ORGANIZATION, ZIP, STREET, CITY, STATE, LOCATION-OTHER as PHI tags (sub-category).",
#     'Name' :       "Name: Name indicates the named entity for the doctor, patient. It contains PATIENT, DOCTOR, and USERNAME as PHI tags (sub-category). The PATIENT tag includes the first and last names of patients, their health proxies, and family members. It excludes titles, such as Mr. and Mrs. For the DOCTOR tag, it refers to medical doctors and other practitioners mentioned in the records, it excludes the titles, such as Dr. and MD.",
#     'PROFESSION' : "PROFESSION: Refers to the profession of any person, if mentioned in the record.",
# }

# example_input_2014 = "Record date: 2083-08-28 Beaumont Hospital Physician Group 131 Crowley Avenue Rhodes, Yajaira Rutland, FL 94265 97627182 (153) 636-7011 (584) 728-8734 SUBJECTIVE: The patient comes in stating that she has had 5 days of substernal pain like a \"ball pressing in on her chest.\" She has also had abdominal pain and has felt quite poorly. She did call this service this past weekend she states and never received a call back. She called me this morning, and I saw her immediately. The pain has not gone down her arms and it is mostly located in her chest. She feels occasionally like she is going to throw up, and she says that her blood sugars have been quite high. She does have diabetes. PHYSICAL EXAMINATION: Her lungs are clear to P&A. Her blood pressure is 130/80 in the right arm sitting with a pulse of 64. Her neck veins are flat, carotids 2+ and equal without bruit. Heart is not enlarged. There is a regular sinus rhythm, and no significant murmurs. Liver and spleen are not felt. EKG taken shows a possible old anterior myocardial infarction, which is a distinct change from her last EKG with poor progression of the R waves across the precordium. IMPRESSION AND PLAN: Question myocardial infarction. Because of her prolonged pain and changes in the EKG, I think that she should be observed in the BH emergency ward and attend her there. ______________________________ Brett F. Rutherford, M.D. eScription document:7-351769 RFFocus DD: 08/28/83 DT: 08/28/83 DV: 08/28/83"

# example_output_2014 = {
#     'AGE':         "",
#     'Contact':     "(153) 636-7011 <PHONE>, (584) 728-8734 <PHONE>",
#     'DATE' :       "2083-08-28 <DATE>, 08/28/83 <DATE>",
#     'Id':          "7-351769 <MEDICALRECORD>, 97627182 <MEDICALRECORD>",
#     'Location':    "BH <HOSPITAL>, FL <STATE>, Beaumont Hospital <HOSPITAL>, 131 Crowley Avenue <STREET>, Rutland <CITY>",
#     'Name' :       "Brett F. Rutherford <DOCTOR>, Rhodes, Yajaira <PATIENT>",
#     'PROFESSION' : "",
# }

# common prompt suffix

# Instruction repeated after the record to be tagged, at the end of every prompt.
prompt_suffix = (
    "Given a text as input, your task is to identify all the named entities from the "
    "given input and also provide one of the following type for each entities: "
    "(1) CHEBI, (2) CL, (3) GGP, (4) GO, (5) NCBITaxon, (6) SO, and (7) Taxon."
)

# The functions below build one prompt per tag type and return the list of prompts.

def build_prompts_for_de_identification_2006(input) :
  """Build one few-shot prompt per tag type for the record `input`.

  Each prompt is: task description, the tag type's definition, the worked example
  (record and expected output for that type), the record to tag, and the closing
  instruction. Returns the prompts as a list, in the key order of
  `tag_definitions_2006`.
  """
  res = []
  for tag_type, definition in tag_definitions_2006.items() :
    res.append(
      # The blank line keeps the definition from being glued to the last sentence of
      # the task description ("...<type_of_entity2>.CHEBI: ...").
      prompt_definition_prefix + "\n\n" + definition
      + "\n\nRecord:\n" + example_input_2006
      + "\n\nOutput:\n" + example_output_2006[tag_type]
      + "\n\nRecord:\n" + input
      + "\n\n" + prompt_suffix
    )
  return res

# def build_prompts_for_de_identification_2014(input) :
#   res = []
#   for tag_type in tag_definitions_2014.keys() :
#     res.append(prompt_definition_prefix + tag_definitions_2014[tag_type] + "\n\nRecord:\n" + example_input_2014 + "\n\nOutput:\n" + example_output_2014[tag_type] + "\n\nRecord:\n" + input + "\n\n" + prompt_suffix)
#   return res

# Test case for the above function: build the prompts for a toy record and print
# each of them (one prompt per tag type).
ts_2006 = build_prompts_for_de_identification_2006("Dr Jason treated Jason.")
for ts in ts_2006 :
  print(ts)

# ts_2014 = build_prompts_for_de_identification_2014("Dr Jason treated Jason.")
# for ts in ts_2014 :
#   print(ts)

In this task, you are given a small paragraph, your task is to identify all the named entities from the given input and also provide one of the following type for each entities: (1) CHEBI, (2) CL, (3) GGP, (4) GO, (5) NCBITaxon, (6) SO, and (7) Taxon. Generate the output in this format: entity1 <type_of_entity1>, entity2 <type_of_entity2>.CHEBI: Chemical Entities of Biological Interest

Record:
Conversely , down - regulation of beta - catenin signaling ( through Lef1 knock - out , ectopic expression of Wnt inhibitor Dkk1 or conditional deletion of beta - catenin in epidermis ) results in loss of vibrissae and some pelage follicles in mice [ 28 - 30 ].

Output:


Record:
Dr Jason treated Jason.

Given a text as input, your task is to identify all the named entities from the given input and also provide one of the following type for each entities: (1) CHEBI, (2) CL, (3) GGP, (4) GO, (5) NCBITaxon, (6) SO, and (7) Taxon.
In this task, you are given a small paragraph, your task is to ident

In [None]:
def remove_output_prefix_from_result(result) :
  """Strip a leading "Output" / "output" label (and its colon) from a model answer.

  Leading whitespace in front of the label is ignored, so an answer such as
  "\n\nOutput: ..." is recognised too. Everything after the label is returned
  unchanged. Answers without the label are returned exactly as they came in.
  """
  candidate = result.lstrip()
  if not candidate.startswith(("Output", "output")) :
    return result
  # Slice to the end of the string: an end index of -1 would also cut off the last
  # character of the answer (typically the closing ">" of the final tag).
  candidate = candidate[len("Output"):]
  if candidate.startswith(" :") :
    candidate = candidate[2:]
  elif candidate.startswith(":") :
    candidate = candidate[1:]
  return candidate

def compute_de_identification_2006(llmodel, medical_record, word_count = 0):
  """Tag `medical_record` with `llmodel` and return the merged tags dict.

  llmodel: object with a compute(prompt) method returning the model's text.
  medical_record: text to tag.
  word_count: when non-zero, the record is split into chunks of that many words
    and each chunk is prompted separately; 0 sends the record in one piece.

  Every chunk is sent once per tag-type prompt, and the tags parsed from all
  answers are accumulated with union_of_tags.
  """
  if word_count == 0 :
    medical_record_sequences = [medical_record]
  else :
    medical_record_sequences = input_to_input_sequences(medical_record, word_count)

  tags = {}
  for sequence in medical_record_sequences :
    for prompt in build_prompts_for_de_identification_2006(sequence) :
      answer = remove_output_prefix_from_result(llmodel.compute(prompt))
      tags = union_of_tags(tags, parse_tags_from_string(answer))
  return tags

def compute_de_identification_2014(llmodel, medical_record, word_count = 0):
  """Tag `medical_record` with the 2014 prompt set and return the merged tags dict.

  llmodel: object with a compute(prompt) method returning the model's text.
  medical_record: text to tag.
  word_count: when non-zero, the record is split into chunks of that many words
    and each chunk is prompted separately; 0 sends the record in one piece.

  NOTE: build_prompts_for_de_identification_2014 is commented out in this notebook,
  so calling this function raises NameError until that builder is restored.
  """
  medical_record_sequences = []
  if word_count == 0 :
    medical_record_sequences.append(medical_record)
  else :
    # input_to_input_sequences needs the chunk size; omitting it raised a TypeError.
    medical_record_sequences = input_to_input_sequences(medical_record, word_count)
  tags = {}
  for medical_record_sequence in medical_record_sequences :
    medical_record_sequence_all_prompts = build_prompts_for_de_identification_2014(medical_record_sequence)
    for medical_record_sequence_prompt in medical_record_sequence_all_prompts :
      result = llmodel.compute(medical_record_sequence_prompt)
      result = remove_output_prefix_from_result(result)
      tags_parsed = parse_tags_from_string(result)
      tags = union_of_tags(tags, tags_parsed)
  return tags

def get_and_print_accuracy_results(predicted_tags, expected_tags) :
  """Print true positives, false positives and false negatives for a prediction.

  For each category the total occurrence count is printed in parentheses, followed
  by the tags dict itself. Returns None.
  """
  tp = intersection_of_tags(predicted_tags, expected_tags)
  print("true_positives : (" + str(get_count_of_tags(tp)) + ")")
  print(tp)
  print()
  fp, fn = generate_accuracy_results(predicted_tags, expected_tags)
  print("false_positives : (" + str(get_count_of_tags(fp)) + ")")
  print(fp)
  print()
  # Expected tags the model failed to produce are false negatives; the earlier
  # "true_negatives" label misreported them.
  print("false_negatives : (" + str(get_count_of_tags(fn)) + ")")
  print(fn)
  print()
  print()

# test case for the above functions using gpt3
# (first dataset instance, record split into chunks of 1024 words; calls the OpenAI API)
predicted_tags = compute_de_identification_2006(gpt3model(), deid_2006_data["instances"][0]["input"], 1024)
expected_tags = parse_tags_from_tags_list(deid_2006_data["instances"][0]["output"])
get_and_print_accuracy_results(predicted_tags, expected_tags)

# test case for the above functions using gpt2
# (same instance, chunks of 128 words, up to 128 new tokens per answer)
predicted_tags = compute_de_identification_2006(gpt2model(128), deid_2006_data["instances"][0]["input"], 128)
expected_tags = parse_tags_from_tags_list(deid_2006_data["instances"][0]["output"])
get_and_print_accuracy_results(predicted_tags, expected_tags)


true_positives : (8)
{'DATE': {'3/8': 1, '03/08': 1}, 'PATIENT': {'FREIERMLINKENEIGHCAABLINFARST , ANEA': 1}, 'DOCTOR': {'Cane Breutzfreierm': 1}, 'ID': {'075207': 1, '455019890': 1, '4691171': 1, '95802': 1}}

false_positives : (46)
{'Patient medical record number': {'PHI tags: MRN, Age, REGISTRATION DATE, PRELIMINARY REPORT, ED DISCHARGE NOTIFICATION, DISCHARGE STATUS, CONDITION ON DISCHARGE, DIAGNOSIS, DISPOSITION, FOLLOW UP SERVICE, INSTRUCTIONS TO PATIENT.\n\nOutput: MRN': 1}, 'Patient age': {'Age': 1}, 'Patient registration date': {'REGISTRATION DATE': 1}, 'Patient preliminary diagnosis': {'PRELIMINARY REPORT': 1}, 'Emergency Department Discharge Notification': {'ED DISCHARGE NOTIFICATION': 1}, 'Patient discharge status': {'DISCHARGE STATUS': 1}, 'Patient condition on discharge': {'CONDITION ON DISCHARGE': 1}, 'Patient diagnosis': {'DIAGNOSIS': 1}, 'Patient disposition': {'DISPOSITION': 1}, 'Follow up service for the patient': {'FOLLOW UP SERVICE': 1}, 'Instructions to the patien

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

true_positives : (0)
{}

false_positives : (9)
{'type_of_tag1': {'For this task, you are given a medical discharge record of the patient. Generate the output in this format: tag1': 1}, 'type_of_tag2': {'tag2': 1}, 'type_of_pcp1': {'Add them to the patient summary using the following format: pcp1': 1}, 'type_of_pcp2': {'pcp2': 1}, 'type_of_pcp3': {'pcp3': 1}, 'type_of_xxx': {'xxx': 1}, 'type_of_tblen': {'. These are the same tags that you have applied to the PHYSICIAN tags for the following types:\n\nRecord:\n\nCORE NAME :': 1}, 'type_of_tblend': {'CLOUDING :': 1}, 'type_of_tblintending': {'CONVENTING :': 1}}

true_negatives : (12)
{'DOCTOR': {'Breutzfreierm , Viennee': 1, 'Cane Breutzfreierm': 1}, 'ID': {'075207': 1, '9803503': 1, '4691171': 1, '95802': 1, '455019890': 1}, 'DATE': {'03/08': 1, '3/8': 1}, 'HOSPITAL': {'RWH': 1}, 'PHONE': {'254-053-7517': 1}, 'PATIENT': {'FREIERMLINKENEIGHCAABLINFARST , ANEA': 1}}


