## NOTE: use the default "Python 3" kernel to run scispaCy

In [1]:
# !pip install spacy-transformers

In [2]:
# !pip install scispacy

In [3]:
# ! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz

In [4]:
# ! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_scibert-0.4.0.tar.gz

In [5]:
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz

In [6]:
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_md-0.3.0.tar.gz

In [7]:
# !pip install seaborn

In [17]:
# !python

In [1]:
import os
from collections import Counter
from getpass import getpass
from urllib.parse import quote_plus

import scispacy
import spacy
import pandas as pd
import seaborn as sns

In [2]:
from scispacy.linking import EntityLinker

## Load model

In [3]:
# Name of the scispaCy pipeline to load. Uncomment one of the alternatives
# below to switch to a different model.
MODEL_NAME = "en_core_sci_scibert"
# MODEL_NAME = "en_core_sci_lg"
# MODEL_NAME = "en_core_sci_md"
# MODEL_NAME = "en_core_sci_sm"
nlp = spacy.load(MODEL_NAME)

In [4]:
# Attach the scispaCy entity linker to the pipeline.
# The knowledge base is selected with the `linker_name` option; the `name`
# key is the slot spaCy uses for the pipe's own component name, so passing
# "umls" there does not select a knowledge base.
# NOTE(review): per the scispaCy docs, resolve_abbreviations only has an
# effect when an AbbreviationDetector pipe is present; none is added in the
# cells shown here - confirm whether one is wanted.
nlp.add_pipe(
    "scispacy_linker",
    config={
        "resolve_abbreviations": True,
        "linker_name": "umls",
        # "threshold": 0.85,
    },
)



<scispacy.linking.EntityLinker at 0x7f5b1eb7b2d0>

### Test example

In [5]:
# Replace the sample text below with your own data.
# The note is run through the full pipeline (`nlp`) and the resulting Doc is
# kept in `doc` for the inspection cells that follow.
text = '''
This is a 54 year old woman with PMH including COPD with 2L home 02,
   recent PNA with PO abx regimen, also morbidly obese. Presents to ED
   after family noticed mental status changes. Patient was found to be
   rhoncorous bilaterally with O2 sats in 70's. 102.6 PO, c/o HA, placed
   on NRB and subsequently intubated, with difficulty. S/P intubation,
   patient became hypotensive with SBP in 80's. 5L fluid given, arterial
   line and presep catheter placed. Placed on scant amount of Levophed
   with SBP responding into 120's. ABG signficant for respiratory acidosis
   with signficant hypercarbia. CXR showed RLL and LLL consolidation.
   While in ED, received 1 gm Vancomycin, 1 gm Ceftriaxone, and 750mg
   Levaquin. Transferred to [**Hospital Ward Name 51**] MICU for further management of PNA,
   sepsis, and acidosis.

'''

doc = nlp(text)

In [6]:
# Fetch the pipe registered under the name "scispacy_linker" from the pipeline.
linker = nlp.get_pipe("scispacy_linker")

In [7]:
# For every entity found in `doc`, print its token/character span and text,
# then its first knowledge-base candidate (if any) with the score and the
# matching record from the linker's knowledge base.
for ent in doc.ents:
    print('---------------------------------')
    print("Span: ", ent.start, ent.end, ent.start_char, ent.end_char)
    print("String: ", ent)
    best_candidates = ent._.kb_ents[:1]  # empty when the mention has no candidates
    for candidate in best_candidates:
        cui = candidate[0]
        score = candidate[1]
        print("Top entity per mention: ", cui, " score: ", score)
        print('====================================')
        print(linker.kb.cui_to_entity[cui])

---------------------------------
Span:  5 6 14 18
String:  year
Top entity per mention:  C0439234  score:  1.0
CUI: C0439234, Name: year
Definition: A period of time that it takes for Earth to make a complete revolution around the sun, approximately 365 days; a specific one year period.
TUI(s): T079
Aliases: (total: 9): 
	 a, year, YYYY, YEARS, year (qualifier value), yr, years, Annum, Year
---------------------------------
Span:  7 8 23 28
String:  woman
Top entity per mention:  C0043210  score:  0.9999999403953552
CUI: C0043210, Name: Woman
Definition: Human females as cultural, psychological, sociological, political, and economic entities.
TUI(s): T098
Aliases (abbreviated, total: 14): 
	 female humans, Girl, Human Females, Girls, Woman, women, human female, female, woman, Women
---------------------------------
Span:  9 10 34 37
String:  PMH
Top entity per mention:  C0262926  score:  1.0
CUI: C0262926, Name: Medical History
Definition: A collection of information about a person's 

## Load data

In [15]:
# Load the sample notes from CSV (path is relative to the working directory).
df = pd.read_csv("data/test_sample.csv")

In [16]:
# Preview the first rows of the loaded notes.
df.head()

Unnamed: 0,row_id,subject_id,hadm_id,chartdate,charttime,storetime,category,description,cgid,iserror,text,gender,dob,dod,dod_hosp,dod_ssn,expire_flag,etl_date
0,674863,83982,147681.0,2145-04-02 00:00:00,2145-04-02 02:08:00,2145-04-02 04:31:23,Nursing,Nursing Progress Note,16436,,"TITLE:\n This is a 55 y/o M, with H/O Hep C ...",M,2089-10-23 00:00:00,2145-05-04 00:00:00,2145-05-04 00:00:00,2145-05-04 00:00:00,1,2001-01-15 12:21:13
1,674864,50743,109381.0,2134-05-07 00:00:00,2134-05-07 02:19:00,2134-05-07 04:36:56,Nursing,Nursing Progress Note,20088,,76 y/o M initially admitted to [**Hospital3 33...,M,2058-02-27 00:00:00,2134-06-06 00:00:00,2134-06-06 00:00:00,2134-06-06 00:00:00,1,2001-01-15 12:21:13
2,674865,96699,186382.0,2132-06-13 00:00:00,2132-06-13 04:36:00,2132-06-13 04:37:01,Respiratory,Respiratory Care Shift Note,15498,,Demographics\n Day of intubation:\n Day of...,M,2079-01-01 00:00:00,2132-06-14 00:00:00,2132-06-14 00:00:00,,1,2001-01-15 12:21:13
3,674925,96699,186382.0,2132-06-13 00:00:00,2132-06-13 06:37:00,2132-06-13 06:37:39,Nursing,Nursing Progress Note,16436,,TITLE:\n Impaired Skin Integrity\n Assessm...,M,2079-01-01 00:00:00,2132-06-14 00:00:00,2132-06-14 00:00:00,,1,2001-01-15 12:21:13
4,675045,75395,126239.0,2181-04-27 00:00:00,2181-04-27 08:01:00,2181-04-27 13:08:08,Nursing,Nursing Transfer Note,15659,,"54yr man with hx metastatic renal cell ca, c/o...",M,2126-07-20 00:00:00,2181-05-22 00:00:00,2181-05-22 00:00:00,2181-05-22 00:00:00,1,2001-01-15 12:21:13


## Annotate data

In [None]:
# NOTE(review): this cell has no execution count and does not define `ent`
# itself; it only works if `ent` is still bound from a previously run loop
# over doc.ents. It will raise NameError on a fresh kernel.
print("Span: ", ent.start, ent.end)
print("String: ", ent)

In [17]:
# Build a bag of concepts per note: docs[row_id] -> {CUI: number of mentions}.
# Only the first knowledge-base candidate of each entity is counted; entities
# without candidates contribute nothing.
docs = {}
for name, text in df[['row_id', 'text']].values:
    concepts = []
    doc = nlp(text)
    for ent in doc.ents:
        for umls_ent in ent._.kb_ents[:1]:
            concepts.append(umls_ent[0])
    # Counter tallies the list in a single pass; calling list.count() once per
    # distinct CUI rescans the whole list each time.
    docs[name] = dict(Counter(concepts))

In [13]:
# docs

## Annotation pipeline

Read the notes from the PostgreSQL database on 10.200.106.114 and store the annotations in Elasticsearch on 10.200.112.204.

In [5]:
from sqlalchemy import create_engine

# Database connection settings. The password is never stored in the notebook:
# it is read from MIMIC_DB_PASSWORD or, if that is unset, prompted for.
# The remaining settings can be overridden through the environment as well.
db_user = os.environ.get("MIMIC_DB_USER", "ckg")
db_pass = os.environ.get("MIMIC_DB_PASSWORD") or getpass("PostgreSQL password: ")
db_host = os.environ.get("MIMIC_DB_HOST", "10.200.106.114")
db_port = os.environ.get("MIMIC_DB_PORT", "5432")
db_name = os.environ.get("MIMIC_DB_NAME", "mimic")

# quote_plus escapes characters such as '@' or '/' that would otherwise
# corrupt the connection URL.
sql_engine = create_engine(
    'postgresql://%s:%s@%s:%s/%s'
    % (db_user, quote_plus(db_pass), db_host, db_port, db_name)
)

In [6]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from elastic_util import * 

In [None]:
# Annotate the notes table slice by slice and push the annotations to
# Elasticsearch. Slice i holds the notes whose row_id % batch == i, so the
# same `batch` value must drive both the loop and the SQL modulus.
batch = 10000

# Elasticsearch connection, built once for the whole run. The password is
# read from ES_PASSWORD or prompted for; it is never stored in the notebook.
es_user = os.environ.get("ES_USER", "elastic")
es_pass = os.environ.get("ES_PASSWORD") or getpass("Elasticsearch password: ")
es_ip = os.environ.get("ES_HOST", "10.200.112.204")
es_port = os.environ.get("ES_PORT", "9200")

client = Elasticsearch(['http://%s:%s@%s:%s' % (es_user, es_pass, es_ip, es_port)],
                       request_timeout=6000, retry_on_timeout=True)

# Arguments handed to bulk_insert (imported from elastic_util).
doc_type = '_doc'
doc_index = 'ann_id'
index = 'mimic_scispacy'

for i in range(batch):
    print(i)
    # Only the three columns used below are fetched.
    sql_query = ('select row_id, subject_id, text from noteevents_new_date where row_id%'
                 + str(batch) + '=' + str(i))
    df = pd.read_sql_query(sql_query, con=sql_engine)

    # One record per linked entity, keeping only its first KB candidate.
    dlist = []
    for row_id, subject_id, text in df[['row_id', 'subject_id', 'text']].values:
        doc = nlp(text)
        for ent in doc.ents:
            for umls_ent in ent._.kb_ents[:1]:
                dlist.append([row_id, subject_id, ent.start_char, ent.end_char,
                              ent.text, umls_ent[0], umls_ent[1]])
    data = pd.DataFrame(dlist, columns=['row_id', 'subject_id', 'start_char', 'end_char',
                                        'text', 'cui', 'score'])

    # The separator keeps ids unique: without it row 1/char 15 and
    # row 11/char 5 would both become "115".
    # NOTE(review): documents indexed earlier with ids lacking the separator
    # will not be overwritten by these; rebuild the index if any exist.
    data['ann_id'] = data['row_id'].astype(str) + '_' + data['start_char'].astype(str)

    result = bulk_insert(client, data, index, doc_type, doc_index)
    # presumably bulk_insert returns False on failure - confirm in elastic_util
    if result == False:
        print('Current batch:  ', i)
        break

0




1




2




3




4




5




6




7




8




9




10




11




12




13




14




15




In [83]:
# Elasticsearch connection settings. The password is read from ES_PASSWORD
# or prompted for, so it is never stored in the notebook; the other settings
# can be overridden through the environment.
es_user = os.environ.get("ES_USER", "elastic")
es_pass = os.environ.get("ES_PASSWORD") or getpass("Elasticsearch password: ")
es_ip = os.environ.get("ES_HOST", "10.200.112.204")
es_port = os.environ.get("ES_PORT", "9200")

client = Elasticsearch(['http://%s:%s@%s:%s' % (es_user, es_pass, es_ip, es_port)],
                       request_timeout=6000, retry_on_timeout=True)

In [84]:
# Arguments passed to bulk_insert (imported from elastic_util).
doc_type = '_doc'
doc_index = 'ann_id'  # presumably the DataFrame column used as the document id - TODO confirm in elastic_util
index='mimic_scispacy'  # presumably the target Elasticsearch index - TODO confirm

In [85]:
# Manually index the annotations currently held in `data`.
# bulk_insert comes from elastic_util (star import); its result is displayed as the cell output.
bulk_insert(client, data, index, doc_type, doc_index)



True

In [46]:
# Sanity check: compare the total number of rows with the number of distinct
# row_id values in the notes table. Note that this rebinds `df`.
sql_query = 'select count(*), count(distinct row_id) from noteevents_new_date'

df = pd.read_sql_query(sql_query,con=sql_engine)

In [47]:
# Show the two counts returned by the query.
df.head()

Unnamed: 0,count,count.1
0,2083180,2083180


In [48]:
# sns.displot(df['row_id'])

In [49]:
# Shape of the count result held in `df`.
df.shape

(1, 2)