In [6]:
# !pip install py2neo

In [1]:
import pandas as pd
import sqlalchemy as sql
from py2neo import Node, Relationship, Graph, Database, Subgraph
import numpy as np

In [2]:
connect_string = 'mysql://root:admin123@localhost/umls'

In [3]:
sql_engine = sql.create_engine(connect_string)

In [4]:
default_db = Database(password="123456")
print(default_db.name)

graph.db


In [5]:
graph = Graph(user="neo4j", password="123456")

## Concepts

In [21]:
query = "SELECT * FROM MRCONSO WHERE SAB ='SNOMEDCT_US' and TTY='FN'"
con = pd.read_sql_query(query, sql_engine)

In [22]:
con.head()

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
0,C0319824,ENG,S,L2931276,PF,S3204998,Y,A10868589,2615359012,420643001,,SNOMEDCT_US,FN,420643001,Caloscypha fulgens (organism),9,N,
1,C0006864,ENG,S,L2932443,PF,S3205841,Y,A10868616,2615311012,421921003,,SNOMEDCT_US,FN,421921003,Cannabinoid (substance),9,N,
2,C0201945,ENG,S,L2949850,PF,S3213573,Y,A10868693,2612635015,77161005,,SNOMEDCT_US,FN,77161005,Cerebrospinal fluid protein electrophoresis (p...,9,N,
3,C0013336,ENG,S,L2973626,PF,S3235999,Y,A10868877,2615646014,422065006,,SNOMEDCT_US,FN,422065006,Constitutional short stature (disorder),9,N,2048.0
4,C0600203,ENG,S,L2792355,PF,S3257200,Y,A10869342,2615319014,421675006,,SNOMEDCT_US,FN,421675006,Dichloroacetic acid (substance),9,N,


In [23]:
con['STT'].value_counts()

PF     346415
VO        456
VC         53
VCW        16
VW         10
Name: STT, dtype: int64

In [24]:
con['STYPE'] = con['STR'].apply(lambda x: x[x.index('(')+1: x.index(')')])

In [25]:
con.shape

(346950, 19)

In [26]:
con.to_csv('data/concepts.csv', index=None)

### Concept nodes

In [22]:
g = con.groupby('CUI')['STR'].apply(lambda x: ' | '.join(list(np.unique(x))))

In [23]:
cui_df = g.reset_index()

In [24]:
cui_df.head()

Unnamed: 0,CUI,STR
0,C0000052,"1,4-alpha-Glucan branching enzyme (substance)"
1,C0000097,Methylphenyltetrahydropyridine (substance)
2,C0000102,1-Naththylamine (substance)
3,C0000163,17-Hydroxycorticosteroid (substance)
4,C0000167,17-Ketosteroid (substance)


In [25]:
%%time
cuis = {}
# batch_size = 200
for index, row in cui_df.iterrows():
    cui_name = str(row['CUI'])
    props = {'STR': row['STR']}
    cui = {'name': cui_name, 'props': props}
    cuis[cui_name] = (cui)

CPU times: user 53.5 s, sys: 152 ms, total: 53.6 s
Wall time: 53.7 s


In [26]:
%%time
graph.schema.create_uniqueness_constraint('CUI', 'name')
statement = '''
UNWIND {batches} as batch
MERGE(a:CUI {name:batch.name})
ON CREATE SET a+=batch.props
'''
result = graph.run(statement, batches=list(cuis.values()))

CPU times: user 8.52 s, sys: 79.7 ms, total: 8.6 s
Wall time: 32.2 s


### Sub-concept nodes

In [27]:
%%time
graph.schema.create_uniqueness_constraint('SCUI', 'name')
scuis = {}

for index, row in con.iterrows():
    scui_name = str(row['SCUI'])
    if scui_name not in scuis:
        props = {'STR': row['STR'], 'STYPE': row['STYPE'], 'SAB': row['SAB']}
        scui = {'name': scui_name, 'props': props}
        scuis[scui_name] = (scui)

CPU times: user 1min 4s, sys: 196 ms, total: 1min 4s
Wall time: 1min 4s


In [28]:
%%time
statement = '''
UNWIND {batches} as batch
MERGE(a:SCUI {name:batch.name})
ON CREATE SET a+=batch.props
'''
result = graph.run(statement, batches=list(scuis.values()))

CPU times: user 11.7 s, sys: 104 ms, total: 11.8 s
Wall time: 41.3 s


### Sub-concept --> concepts (1-->n)

In [29]:
%%time
# SCUI-->CUI is a 1:n relation
rels = []
statement_relations = '''

UNWIND {batches} as batch
MATCH (u:SCUI {name:batch.scui}), (r:CUI {name:batch.cui})
MERGE (u)-[:scui_to_cui]->(r)
'''
# MERGE (r)-[:cui_to_scui]->(u)
batch_size = 2000
for index, row in con.iterrows(): 
    rels.append({'scui': str(row['SCUI']), 'cui': row['CUI']})
    
    if index%batch_size == 0:        
        result = graph.run(statement_relations, batches=rels)
        rels = []
result = graph.run(statement_relations, batches=rels)

CPU times: user 1min 1s, sys: 72.5 ms, total: 1min 1s
Wall time: 1min 36s


## Concept --> Concept (n-->n)

In [6]:
# query = "select CUI1, AUI1, REL, CUI2, AUI2, RELA, RG from umls.MRREL where SAB ='SNOMEDCT_US' AND DIR='Y' AND SUPPRESS='N'"
query = "select CUI1, AUI1, REL, CUI2, AUI2, RELA, RG from umls.MRREL where SAB ='SNOMEDCT_US' AND DIR='Y' "
rel = pd.read_sql_query(query, sql_engine)

In [None]:
rel.shape

In [7]:
rel = rel.drop_duplicates(['CUI1', 'CUI2', 'RELA'])

In [8]:
rel.shape

(2239172, 7)

In [9]:
rel['RELA'].value_counts()

isa                          849843
has_finding_site             209666
has_method                   123673
mapped_to                    113222
has_associated_morphology    109846
                              ...  
temporally_related_to             7
has_dependent                     6
moved_from                        5
relative_to_part_of               2
has_inherent_location             1
Name: RELA, Length: 122, dtype: int64

In [10]:
rel = rel[rel['CUI1'] != rel['CUI2']]

In [12]:
rel.shape

(2167441, 7)

In [27]:
rel.to_csv('data/relations.csv', index=None)

In [13]:
%%time
# CUI-->CUI is a n:n relation
rels = []
statement_relations = '''
UNWIND {batches} as batch
MATCH (u:CUI {name:batch.cui1}), (r:CUI {name:batch.cui2})
MERGE (u)-[rel1:cui_to_cui]->(r)
ON CREATE SET rel1 += batch.rel1_props
'''
# MERGE (r)-[rel2:cui_to_cui]->(u)
# ON CREATE SET rel2 += batch.rel2_props

batch_size = 2000
for index, row in rel.iterrows():    
    rels.append({'cui1': row['CUI1'], 'cui2': row['CUI2'], 
                 'rel1_props': {'RELA': row['RELA']}
#                  'rel2_props': {'RELA': 'invert_'+row['RELA'], 'Invented': 1}
                })
    
    if index%batch_size == 0: 
        result = graph.run(statement_relations, batches=rels)
        rels = []
result = graph.run(statement_relations, batches=rels)

CPU times: user 7min 21s, sys: 242 ms, total: 7min 22s
Wall time: 9min 47s


## Semantic types

In [14]:
query = """
select * from MRSTY where CUI in 
(select distinct CUI from MRCONSO WHERE SAB ='SNOMEDCT_US')
"""
sem = pd.read_sql_query(query, sql_engine)

In [15]:
sem.shape

(414636, 6)

In [28]:
sem.to_csv('data/semantype.csv', index=None)

### TUI nodes

In [16]:
tuis_df = sem.drop_duplicates(['TUI', 'STN', 'STY'])

In [17]:
%%time
graph.schema.create_uniqueness_constraint('TUI', 'name')
tuis = {}

for index, row in tuis_df.iterrows():
    tui_name = row['TUI']
    props = {'STY': row['STY'], 'STN': row['STN']}
    tui = {'name': tui_name, 'props': props}
    tuis[tui_name] = (tui)

CPU times: user 26.4 ms, sys: 9 µs, total: 26.5 ms
Wall time: 305 ms


In [18]:
%%time
# CUI --> TUI is a n:n relation
statement = '''
UNWIND {batches} as batch
MERGE(a:TUI {name:batch.name})
ON CREATE SET a+=batch.props
'''
result = graph.run(statement, batches=list(tuis.values()))

CPU times: user 5.94 ms, sys: 8 µs, total: 5.94 ms
Wall time: 280 ms


### Concept --> Semantic types (n--n)

In [19]:
%%time
rels = []
statement_relations = '''
UNWIND {batches} as batch
MATCH (u:CUI {name:batch.cui}), (r:TUI {name:batch.tui})
MERGE (u)-[:cui_to_tui]->(r)
'''
# MERGE (r)-[:tui_to_cui]->(u)

batch_size = 2000
for index, row in sem.iterrows():    
    rels.append({'cui': row['CUI'], 'tui': row['TUI'], 
                 })
    
    if index%batch_size == 0: 
        result = graph.run(statement_relations, batches=rels)
        rels = []
result = graph.run(statement_relations, batches=rels)

CPU times: user 1min 10s, sys: 44 ms, total: 1min 11s
Wall time: 1min 51s


In [35]:
import operator as op
from functools import reduce

In [36]:
def ncr(n, r):
    r = min(r, n-r)
    numer = reduce(op.mul, range(n, n-r, -1), 1)
    denom = reduce(op.mul, range(1, r+1), 1)
    return numer / denom

In [37]:
ncr(10,3)

120.0

In [50]:
p = 0.15
n = 100
pd = 15
sump = 0
for k in range(n+1):
    sump += ncr(n, k) * pow(p, k) * pow(1-p, n-k) * pd

In [51]:
sump

14.99999999999996