In [1]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from utils import PreProcess
import datasets
from torch.utils.data import DataLoader
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
wfumodel = './checkpoint-8290'

In [3]:
tokenizer = AutoTokenizer.from_pretrained(wfumodel)
tokenizer.model_max_length = 128
model = AutoModelForTokenClassification.from_pretrained(wfumodel)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


In [4]:
text = '''Atrium Health Carolinas Medical Center (CMC) is the flagship hospital of Atrium Health, which is distinguished throughout the Southeastern United States for its excellent patient care and medical expertise.
CMC operates at 2 locations: CMC and Atrium Health Mercy. These locations are 1.3 miles apart.
From modest beginnings in 1943, we have evolved into the largest hospital in the region and a world-class facility that offers a full range of services to the Charlotte community and beyond, with more than 1,100 physicians and providers specializing in all areas of medicine.
CMC serves as the region’s only Level 1 trauma center and is an approved transplant center for heart, kidney, pancreas and liver. We also serve as one of North Carolina’s 5 Academic Medical Center Teaching Hospitals, providing residency training for more than 200 physicians in 15 specialties and serve as a regional campus for Wake Forest University School of Medicine, based in Winston-Salem, NC.
Carolinas Medical Center has been named the number 1 Best Hospital in the Charlotte region by U.S. News & World Report for 7 years in a row. Also located at CMC is Levine Cancer Institute's academic and research headquarters, Carolinas Rehabilitation, ranked #1 in the Southeast and top 10 in the nation, and Levine Children's Hospital, consistently ranked as a Best Children's Hospital in multiple specialties by U.S. News & World Report for 16 years in a row.
Carolinas Medical Center and its outpatient clinics, as well as Levine Children’s Hospital, Levine Cancer Institute and Atrium Health Mercy, are Magnet designated by the American Nurses Credentialing Center’s Magnet Recognition Program®.'''

In [14]:
inputs = tokenizer(text, truncation=True, return_overflowing_tokens=True, return_tensors='pt')
inputs.pop('overflow_to_sample_mapping')
inputs

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
tokenizer.model_max_length

In [15]:
from transformers import TokenClassificationPipeline

clf = TokenClassificationPipeline(model=model, tokenizer=tokenizer)
# clf._sanitize_parameters(stride=10)

In [16]:
clf(text, stride=16, ignore_labels=['LABEL_12'], aggregation_strategy='max')

[{'entity_group': 'LABEL_3',
  'score': 0.50738233,
  'word': 'atrium health',
  'start': 73,
  'end': 86},
 {'entity_group': 'LABEL_3',
  'score': 0.9105699,
  'word': 'atrium health mercy',
  'start': 244,
  'end': 263},
 {'entity_group': 'LABEL_1',
  'score': 0.9588765,
  'word': '1943',
  'start': 328,
  'end': 332},
 {'entity_group': 'LABEL_3',
  'score': 0.4075527,
  'word': 'charlotte',
  'start': 461,
  'end': 470},
 {'entity_group': 'LABEL_7',
  'score': 0.7204126,
  'word': 'north carolina',
  'start': 732,
  'end': 746},
 {'entity_group': 'LABEL_3',
  'score': 0.7648039,
  'word': 'academic medical center',
  'start': 751,
  'end': 774},
 {'entity_group': 'LABEL_7',
  'score': 0.53567636,
  'word': 'wake',
  'start': 906,
  'end': 910},
 {'entity_group': 'LABEL_3',
  'score': 0.720823,
  'word': 'forest university school of medicine',
  'start': 911,
  'end': 947},
 {'entity_group': 'LABEL_7',
  'score': 0.9862631,
  'word': 'winston - salem, nc',
  'start': 958,
  'end': 97

In [None]:
wfu_dataset = datasets.load_dataset('wfudata', trust_remote_code=True)

In [None]:
example = wfu_dataset['train'][0]
example

In [None]:
clf(example['text'], stride=16, ignore_labels=['LABEL_12'], aggregation_strategy='max')

In [None]:
examples = wfu_dataset['train'][:128]

In [None]:
results = clf(examples['text'], stride=16, ignore_labels=['LABEL_12'], aggregation_strategy='max')

In [None]:
results[1]

In [None]:
for x in results[1]:
    if x['entity_group'] == 'LABEL_8':
        print(x['word'])

In [None]:
examples['text'][1]

In [None]:
from itertools import cycle

fake_names = ['Boris Hughes',
'Boris Davies',
'Kylie Rees',
'Benjamin Sutherland',
'Stewart Bell',
'Boris Gill',
'Mary Hamilton',
'Jonathan Terry',
'Steven Black',
'Felicity Rees']

fake_names = cycle(fake_names)

In [None]:
next(fake_names)

In [None]:
def replace_name(text, result):
    all_names = [x['word'] for x in result if x['entity_group'] == 'LABEL_8']
    all_names = set(all_names)
    mapping = {}
    for name in all_names:
        mapping[name] = next(fake_names)
    new_text = ''
    prev_end = 0
    for x in result:
        if x['entity_group'] == 'LABEL_8':
            start, end = x['start'], x['end']
            fake_name = mapping[x['word']]
            new_text = new_text + text[prev_end:end] + '[' + fake_name + ']'
            prev_end = end
    return new_text

In [65]:
print(replace_name(examples['text'][1], results[1]))

Referring MD: Eagle Fam Medicine[Boris Gill], Bra*   PCP: Aaron Stanford Morrow[Jonathan Terry], MD   Identification: 1711219  Chief Complaint:   Chief Complaint   Patient presents with    Follow-up        PATIENT SUMMARY:  Susan Jane Isbell[Steven Black] is a 63 y.o. female with the onset of a left hand resting tremor in 2011.     She was first seen here in October of 2012. Her exam showed a left hand resting tremor, bradykinesia and rigidity worse on the left. Her symptoms were consistent with idiopathic parkinson&apos;s disease. After some discussion of her treatment options, she said she was primarily interested in improving her tremor. She started artane, which was titrated up to 2 mg twice daily. At first f/u she reported tremor has improved 85 percent since beginning Artane. Reported most concerning symptoms as cognitive. Has difficulty with word finding, more difficulty doing crossword puzzles and difficulty mult-tasking. She had always functioned at a high level and the change

In [67]:
print(examples['text'][1])

Referring MD: Eagle Fam Medicine, Bra*   PCP: Aaron Stanford Morrow, MD   Identification: 1711219  Chief Complaint:   Chief Complaint   Patient presents with    Follow-up        PATIENT SUMMARY:  Susan Jane Isbell is a 63 y.o. female with the onset of a left hand resting tremor in 2011.     She was first seen here in October of 2012. Her exam showed a left hand resting tremor, bradykinesia and rigidity worse on the left. Her symptoms were consistent with idiopathic parkinson&apos;s disease. After some discussion of her treatment options, she said she was primarily interested in improving her tremor. She started artane, which was titrated up to 2 mg twice daily. At first f/u she reported tremor has improved 85 percent since beginning Artane. Reported most concerning symptoms as cognitive. Has difficulty with word finding, more difficulty doing crossword puzzles and difficulty mult-tasking. She had always functioned at a high level and the change was distressing to her.     She had been 

In [1]:
result

NameError: name 'result' is not defined