In [1]:
import cx_Oracle
from sqlalchemy.engine import create_engine
from dotenv import load_dotenv
import os
from urllib.parse import quote_plus
import pandas as pd
import json
from datetime import datetime


In [2]:
cx_Oracle.init_oracle_client(lib_dir=r"C:\Users\zhma\instantclient_21_14")
load_dotenv(verbose=True, override=True)
engine = create_engine(f"oracle+cx_oracle://{os.getenv('I2B2STAGE_USERNAME')}:{quote_plus(os.getenv('I2B2STAGE_PASSWORD'))}@{os.getenv('I2B2STAGE_HOST')}/?service_name={os.getenv('I2B2STAGE_DATABASE')}")

In [3]:
sql_string = '''
SELECT
    "A1"."NOTE_ID"           "NOTE_ID",
    "A1"."NOTE_CSN_ID"       "NOTE_CSN_ID",
    "A1"."PAT_MRN_ID"        "PAT_MRN_ID",
    "A1"."PAT_ENC_CSN_ID"    "PAT_ENC_CSN_ID",
    "A1"."ENCOUNTER_NUM"     "ENCOUNTER_NUM",
    "A1"."EFFECTIVE_DATE_DT" "EFFECTIVE_DATE_DT",
    "A1"."MIST_PROCESSED"    "MIST_PROCESSED",
    "A1"."ENCOUNTER_TYPE"    "ENCOUNTER_TYPE",
    "A1"."NOTE_TYPE"         "NOTE_TYPE",
    "A1"."VISIT_TYPE"        "VISIT_TYPE",
    "A1"."SPECIALTY"         "SPECIALTY",
    "A1"."IMPORT_DATE"       "IMPORT_DATE",
    "A1"."NOTE_STATUS"       "NOTE_STATUS",
    "A1"."SEX"               "SEX",
    "A1"."ETHNICITY"         "ETHNICITY",
    "A1"."RACE"              "RACE",
    "A1"."UPDATE_DATE"       "UPDATE_DATE"
FROM
    "I2B2STAGE"."TEMP_PROGRESS_MIST_PROCESSED" "A1"
WHERE
    ROWNUM < 10000
'''
print(datetime.now())
dat = pd.read_sql(sql_string, con=engine)
print(datetime.now())

2024-08-20 14:14:40.646970
2024-08-20 14:15:10.462825


In [4]:
dat.columns

Index(['note_id', 'note_csn_id', 'pat_mrn_id', 'pat_enc_csn_id',
       'encounter_num', 'effective_date_dt', 'mist_processed',
       'encounter_type', 'note_type', 'visit_type', 'specialty', 'import_date',
       'note_status', 'sex', 'ethnicity', 'race', 'update_date'],
      dtype='object')

In [5]:
all_texts = [json.loads(x)['signal'] for x in dat['mist_processed']]

In [6]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
# from optimum.onnxruntime import ORTModelForTokenClassification
from transformers import TokenClassificationPipeline

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
wfumodel = './checkpoint-8600'
tokenizer = AutoTokenizer.from_pretrained(wfumodel)
tokenizer.model_max_length = 128
model = AutoModelForTokenClassification.from_pretrained(wfumodel)
# model = ORTModelForTokenClassification.from_pretrained(wfumodel, export=True, use_io_binding=False)
clf = TokenClassificationPipeline(model=model, tokenizer=tokenizer, device=0)

In [8]:
# this is done using onnx
print(datetime.now())
results = clf(all_texts, stride=16, ignore_labels=['NORMAL'], aggregation_strategy='max', batch_size=16)
print(datetime.now())

2024-08-20 13:51:55.431019
2024-08-20 14:02:19.733149


In [8]:
# this is regular huggingface
print(datetime.now())
results = clf(all_texts, stride=16, ignore_labels=['NORMAL'], aggregation_strategy='max', batch_size=16)
print(datetime.now())

2024-08-20 14:15:24.438660


  attn_output = torch.nn.functional.scaled_dot_product_attention(


2024-08-20 14:30:20.310730


In [10]:
with open('actual_note_test.txt','w') as fid:
    fid.write('---'.join(all_texts))

In [None]:
results_unsorted = results[:]

In [None]:
all_texts_sorted = sorted(all_texts, key=lambda x: len(x))
all_texts_sorted[0], all_texts_sorted[-1]

('  ',
 'Hematology/Oncology Progress Note   LOS: 71 days   Synopsis: Mr. Triplett presented to his local ED with an elevated WBC and massive splenomegaly, 03/2010. Prior to his diagnosis,he had been experiencing recurrent URI&apos;s, rapid weight loss and anemia.   Bone marrow aspirate and biopsy revealed 17% blasts. A p210 BCR/ABL was positive at 0.909.   He was initially treated with Hydrea after admission for intractable pain. He was then started on Gleevec 400mg daily on 04/01/2010.   During this admission, it was also noted that he had significant left-sided neck and shoulder pain, the etiology of which was initially unclear. CT of the chest was obtained on 04/05/2010, revealing no evidence of PE; however, there was a concerning soft tissue density near the clavicle. An MRI revealed a lesion suspicious for osteomyelitis on his left clavicle.   CT Surgery was consulted and an FNA biopsy was obtained revealing a leukocytic infiltration with no blasts, but concern for possible absce

In [None]:
print(datetime.now())
results = clf(all_texts_sorted, stride=16, ignore_labels=['NORMAL'], aggregation_strategy='max', batch_size=16)
print(datetime.now())

2024-08-16 12:09:20.260053
2024-08-16 12:24:30.281841


In [None]:
model.eval()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [None]:
print(datetime.now())
with torch.no_grad():
    results = clf(all_texts_sorted, stride=16, ignore_labels=['NORMAL'], aggregation_strategy='max', batch_size=16)
print(datetime.now())

2024-08-16 15:22:47.637041
2024-08-16 15:37:41.575946
