## NOTE: using the default "Python 3" Kernel to run Scispacy

In [1]:
# !pip install scispacy

In [2]:
# ! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz

In [3]:
# ! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_scibert-0.4.0.tar.gz

In [4]:
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz

In [5]:
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_md-0.3.0.tar.gz

In [1]:
import scispacy
import spacy
import pandas as pd

In [2]:
from scispacy.linking import EntityLinker

## Load model

In [3]:
nlp = spacy.load("en_core_sci_scibert")
# nlp = spacy.load("en_core_sci_lg")
# nlp = spacy.load("en_core_sci_md")
# nlp = spacy.load("en_core_sci_sm")

In [4]:
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "name": "umls"
#                                         "threshold": 0.85
                                       })



<scispacy.linking.EntityLinker at 0x7f7852a3ea00>

### Test example

In [5]:
# Replace text without your own data
text = "Spinal and bulbar muscular atrophy (SBMA) is an \
       inherited motor neuron disease caused by the expansion \
       of a polyglutamine tract within the androgen receptor (AR). \
       SBMA can be caused by this easily."

doc = nlp(text)

In [10]:
linker = nlp.get_pipe("scispacy_linker")

In [14]:
for ent in doc.ents:
    print('---------------------------------')
    print("Span: ", ent.start, ent.end)
    print("String: ", ent)
#     print(ent._.kb_ents)
    for umls_ent in ent._.kb_ents[:1]:
        print("Top entity per mention: ", umls_ent[0], " score: ", umls_ent[1])
#         print(linker.kb.cui_to_entity[umls_ent[0]])

---------------------------------
Span:  0 1
String:  Spinal
Top entity per mention:  C0521329  score:  1.0
---------------------------------
Span:  2 5
String:  bulbar muscular atrophy
Top entity per mention:  C1839259  score:  0.9092331528663635
---------------------------------
Span:  6 7
String:  SBMA
Top entity per mention:  C1705240  score:  0.9999998807907104
---------------------------------
Span:  12 15
String:  motor neuron disease
Top entity per mention:  C0085084  score:  1.0
---------------------------------
Span:  18 19
String:  expansion
Top entity per mention:  C0007595  score:  0.8664658665657043
---------------------------------
Span:  22 24
String:  polyglutamine tract
Top entity per mention:  C0032500  score:  0.7472081780433655
---------------------------------
Span:  26 28
String:  androgen receptor
Top entity per mention:  C0034786  score:  1.0
---------------------------------
Span:  29 30
String:  AR
Top entity per mention:  C0003504  score:  1.0
--------------

## Load data

In [15]:
df = pd.read_csv("data/test_sample.csv")

In [16]:
df.head()

Unnamed: 0,row_id,subject_id,hadm_id,chartdate,charttime,storetime,category,description,cgid,iserror,text,gender,dob,dod,dod_hosp,dod_ssn,expire_flag,etl_date
0,674863,83982,147681.0,2145-04-02 00:00:00,2145-04-02 02:08:00,2145-04-02 04:31:23,Nursing,Nursing Progress Note,16436,,"TITLE:\n This is a 55 y/o M, with H/O Hep C ...",M,2089-10-23 00:00:00,2145-05-04 00:00:00,2145-05-04 00:00:00,2145-05-04 00:00:00,1,2001-01-15 12:21:13
1,674864,50743,109381.0,2134-05-07 00:00:00,2134-05-07 02:19:00,2134-05-07 04:36:56,Nursing,Nursing Progress Note,20088,,76 y/o M initially admitted to [**Hospital3 33...,M,2058-02-27 00:00:00,2134-06-06 00:00:00,2134-06-06 00:00:00,2134-06-06 00:00:00,1,2001-01-15 12:21:13
2,674865,96699,186382.0,2132-06-13 00:00:00,2132-06-13 04:36:00,2132-06-13 04:37:01,Respiratory,Respiratory Care Shift Note,15498,,Demographics\n Day of intubation:\n Day of...,M,2079-01-01 00:00:00,2132-06-14 00:00:00,2132-06-14 00:00:00,,1,2001-01-15 12:21:13
3,674925,96699,186382.0,2132-06-13 00:00:00,2132-06-13 06:37:00,2132-06-13 06:37:39,Nursing,Nursing Progress Note,16436,,TITLE:\n Impaired Skin Integrity\n Assessm...,M,2079-01-01 00:00:00,2132-06-14 00:00:00,2132-06-14 00:00:00,,1,2001-01-15 12:21:13
4,675045,75395,126239.0,2181-04-27 00:00:00,2181-04-27 08:01:00,2181-04-27 13:08:08,Nursing,Nursing Transfer Note,15659,,"54yr man with hx metastatic renal cell ca, c/o...",M,2126-07-20 00:00:00,2181-05-22 00:00:00,2181-05-22 00:00:00,2181-05-22 00:00:00,1,2001-01-15 12:21:13


## Annotate data

In [17]:
docs = {}
for name, text in df[['row_id', 'text']].values:
    concepts = []
    doc = nlp(text)
    for ent in doc.ents:
        for umls_ent in ent._.kb_ents[:1]:
            concepts.append(umls_ent[0])
#             print("Top entity per mention: ", umls_ent[0], " score: ", umls_ent[1])
    #         print(linker.kb.cui_to_entity[umls_ent[0]])
    docs[name] = dict( [ (i, concepts.count(i)) for i in set(concepts) ] )

In [13]:
# docs

In [17]:
# !pip install psycopg2

In [15]:
from sqlalchemy import create_engine
sql_engine = create_engine('postgresql://ckg:Admin123@10.200.106.114:5432/mimic')