In [1]:
# !pip install py2neo==4.3.0

In [40]:
# !pip install neo4j

In [53]:
import os

import numpy as np
import pandas as pd
import sqlalchemy as sql
from neo4j import GraphDatabase
from py2neo import Node, Relationship, Graph, Subgraph

In [15]:
# !pip show py2neo

In [54]:
# SQLAlchemy URL of the MySQL database holding UMLS.
# It can be supplied through the UMLS_MYSQL_URL environment variable so that
# credentials do not have to be stored in the notebook.
# NOTE(review): the fallback below still embeds a user name and password; it is
# kept only so existing runs behave as before — remove it and rotate the
# password once the environment variable is in place.
connect_string = os.environ.get(
    'UMLS_MYSQL_URL',
    'mysql+pymysql://ckg:Admin123@10.200.106.114/umls',
)

In [55]:
# Build a SQLAlchemy engine from the MySQL URL in `connect_string`.
# NOTE(review): `sql_engine` is not referenced by any later cell visible in this
# notebook (the data is read from CSV files instead) — confirm it is still needed.
sql_engine = sql.create_engine(connect_string)

In [18]:
# default_db = Database(password="123456")
# print(default_db.name)

In [19]:
# authenticate("http://10.200.112.233:7474", "neo4j", "NeO4J")

In [56]:
# Open the py2neo connection to the Neo4j server (HTTP endpoint).
# Every setting can be overridden with an environment variable
# (NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD); the fallbacks are the values that
# used to be hard-coded in this cell, so existing runs behave the same.
# NOTE(review): the fallback password should be removed from the notebook and
# rotated once the environment variables are in place.
graph = Graph(
    os.environ.get('NEO4J_URI', "http://10.200.112.233:7474/db/data/"),
    user=os.environ.get('NEO4J_USER', "neo4j"),
    password=os.environ.get('NEO4J_PASSWORD', "NeO4J"),
)

In [57]:
# uri = 'bolt://10.200.112.233:7687/db/data/'
# user="neo4j" 
# password="NeO4J"
# graph = GraphDatabase.driver(uri, auth=(user, password))

## Concepts

In [58]:
# Load the concept table. Later cells read the columns CUI, STR, SCUI, STYPE
# and SAB from this frame.
con = pd.read_csv('data/concepts.csv')

### Concept nodes

In [59]:
def _join_unique_names(names):
    """Return the distinct values of `names`, sorted by np.unique, joined with ' | '."""
    distinct = list(np.unique(names))
    return ' | '.join(distinct)


# One row per CUI: all distinct STR values of that concept in a single string.
g = con.groupby('CUI')['STR'].apply(_join_unique_names)

In [23]:
# Turn the grouped Series (indexed by CUI) back into a DataFrame with the
# columns CUI and STR.
cui_df = g.reset_index()

In [24]:
# Preview 10 randomly chosen rows. No random_state is passed, so the rows shown
# differ from run to run.
cui_df.sample(10)

Unnamed: 0,CUI,STR
103350,C0399945,Open biopsy of lesion of ileum (procedure)
204337,C1026189,Desulfocella halophila (organism)
14116,C0062313,Hemoglobin G-Philadelphia (substance)
25628,C0188246,Equalization of leg by fibula (procedure)
151601,C0474233,Antiamebic drug prophylaxis (procedure)
26318,C0189403,Bronchoscopy through tracheostomy with biopsy ...
135882,C0443344,Valgus (qualifier value)
127022,C0432651,Blood group antibody Fy^b^ (substance)
211890,C1266299,"2,4-Dinitrophenylhydrazine measurement (proced..."
200791,C0978384,Product containing precisely meloxicam 7.5 mil...


In [34]:
# Write the per-concept table (columns CUI, STR) to disk without the row index.
# `index` is documented as a bool; the explicit False replaces the previous
# `None`, which only suppressed the index because it is falsy.
cui_df.to_csv('data/umls_concepts.csv', index=False)

In [26]:
# def path2url(path):
#     return urlparse.urljoin('file:', urllib.pathname2url(path))

In [49]:
# nodePath = 'data/umls_concepts.csv'
# # nodename = 'Concept'

In [37]:
# graph.cypher.execute("LOAD CSV WITH HEADERS FROM '%s' AS csvLine " % (nodePath) +
#                      " CREATE (p:"+nodename+" { id: csvLine.CUI, name: csvLine.STR })")

In [50]:
// Cypher, not Python: these statements are meant for the Neo4j Browser or
// cypher-shell and cannot be executed by the Python kernel.
// The uniqueness constraint is declared on the label the import below actually
// creates (:ObjectConcept); it used to target :CUI, which this import never creates.
CREATE CONSTRAINT ON (c:ObjectConcept) ASSERT c.CUI IS UNIQUE;
// LOAD CSV expects a URL. With the default server configuration a file:/// URL
// is resolved inside the server's import directory, so umls_concepts.csv has to
// be copied there first (TODO confirm the server's import settings).
USING PERIODIC COMMIT 500
LOAD CSV WITH HEADERS FROM "file:///umls_concepts.csv" AS line
CREATE (:ObjectConcept
        { nodetype:           'concept',
          CUI:                 line.CUI,
          STR:                 line.STR
          } );

In [30]:
%%time
cuis = {}
# batch_size = 200
for index, row in cui_df.iterrows():
    cui_name = str(row['CUI'])
    props = {'STR': row['STR']}
    cui = {'labels': cui_name, 'properties': props}
    cuis[cui_name] = (cui)

CPU times: user 34.2 s, sys: 152 ms, total: 34.3 s
Wall time: 34.3 s


In [60]:
# Show the first five node payloads to check their structure.
list(cuis.values())[:5]

[{'labels': 'C0000052',
  'properties': {'STR': '1,4-alpha-Glucan branching enzyme (substance)'}},
 {'labels': 'C0000097',
  'properties': {'STR': 'Methylphenyltetrahydropyridine (substance)'}},
 {'labels': 'C0000102', 'properties': {'STR': '1-Naththylamine (substance)'}},
 {'labels': 'C0000163',
  'properties': {'STR': '17-Hydroxycorticosteroid (substance)'}},
 {'labels': 'C0000167', 'properties': {'STR': '17-Ketosteroid (substance)'}}]

In [61]:
# Show the first three dictionary keys (the CUI strings).
list(cuis.keys())[:3]

['C0000052', 'C0000097', 'C0000102']

In [None]:
# Create one :CUI node per concept with a single UNWIND query.
#
# The node is keyed on the `name` property. Every later MATCH on :CUI in this
# notebook looks the node up by `name` ({name: batch.cui}, {name: batch.cui1},
# ...), and :SCUI / :TUI nodes are keyed the same way. Keying :CUI on `labels`
# instead left those MATCH clauses with nothing to find, so none of the
# relationships touching a :CUI node could be created.
# Each payload dict still carries the CUI under its 'labels' key, hence
# `batch.labels` on the right-hand side.
#
# Optional uniqueness constraint, left disabled as before:
# graph.schema.create_uniqueness_constraint('CUI', 'name')
statement = '''
UNWIND {batches} as batch
MERGE (a:CUI {name: batch.labels})
ON CREATE SET a += batch.properties
'''
result = graph.run(statement, batches=list(cuis.values()))

### Sub-concept nodes

In [None]:
%%time
# graph.schema.create_uniqueness_constraint('SCUI', 'name')
scuis = {}

for index, row in con.iterrows():
    scui_name = str(row['SCUI'])
    if scui_name not in scuis:
        props = {'STR': row['STR'], 'STYPE': row['STYPE'], 'SAB': row['SAB']}
        scui = {'name': scui_name, 'props': props}
        scuis[scui_name] = (scui)

In [None]:
%%time
# Create the :SCUI nodes. MERGE matches on the `name` property, so an existing
# node is reused; the remaining properties (batch.props) are only written when
# the node is newly created (ON CREATE).
# All payloads in `scuis` are sent as one `batches` parameter in a single query.
statement = '''
UNWIND {batches} as batch
MERGE(a:SCUI {name:batch.name})
ON CREATE SET a+=batch.props
'''
result = graph.run(statement, batches=list(scuis.values()))

### Sub-concept --> concepts (1-->n)

In [None]:
%%time
# SCUI-->CUI is a 1:n relation
rels = []
statement_relations = '''

UNWIND {batches} as batch
MATCH (u:SCUI {name:batch.scui}), (r:CUI {name:batch.cui})
MERGE (u)-[:scui_to_cui]->(r)
'''
# MERGE (r)-[:cui_to_scui]->(u)
batch_size = 2000
for index, row in con.iterrows(): 
    rels.append({'scui': str(row['SCUI']), 'cui': row['CUI']})
    
    if index%batch_size == 0:        
        result = graph.run(statement_relations, batches=rels)
        rels = []
result = graph.run(statement_relations, batches=rels)

## Concept --> Concept (n-->n)

In [None]:
# Load the concept-to-concept relation table. Later cells read the columns
# CUI1, CUI2 and RELA from this frame.
rel = pd.read_csv('data/relations.csv')

In [None]:
# Drop self-relations (rows whose two ends are the same CUI).
# The surviving rows keep their original index labels, so the index is no
# longer a gap-free range after this step.
rel = rel[rel['CUI1'] != rel['CUI2']]

In [None]:
%%time
# CUI-->CUI is a n:n relation
rels = []
statement_relations = '''
UNWIND {batches} as batch
MATCH (u:CUI {name:batch.cui1}), (r:CUI {name:batch.cui2})
MERGE (u)-[rel1:cui_to_cui]->(r)
ON CREATE SET rel1 += batch.rel1_props
'''
# MERGE (r)-[rel2:cui_to_cui]->(u)
# ON CREATE SET rel2 += batch.rel2_props

batch_size = 2000
for index, row in rel.iterrows():    
    rels.append({'cui1': row['CUI1'], 'cui2': row['CUI2'], 
                 'rel1_props': {'RELA': row['RELA']}
#                  'rel2_props': {'RELA': 'invert_'+row['RELA'], 'Invented': 1}
                })
    
    if index%batch_size == 0: 
        result = graph.run(statement_relations, batches=rels)
        rels = []
result = graph.run(statement_relations, batches=rels)

## Semantic types

In [None]:
# Load the semantic-type table. Later cells read the columns CUI, TUI, STN and
# STY from this frame.
sem = pd.read_csv('data/semantype.csv')

### TUI nodes

In [None]:
# Keep one row per distinct (TUI, STN, STY) combination; the other columns of
# the retained rows come from the first occurrence of each combination.
tuis_df = sem.drop_duplicates(['TUI', 'STN', 'STY'])

In [None]:
%%time
# graph.schema.create_uniqueness_constraint('TUI', 'name')
tuis = {}

for index, row in tuis_df.iterrows():
    tui_name = row['TUI']
    props = {'STY': row['STY'], 'STN': row['STN']}
    tui = {'name': tui_name, 'props': props}
    tuis[tui_name] = (tui)

In [None]:
%%time
# Create the :TUI (semantic type) nodes. MERGE matches on the `name` property,
# so an existing node is reused; STY / STN (batch.props) are only written when
# the node is newly created (ON CREATE).
# This cell creates nodes only; the CUI --> TUI relationships are created in a
# separate query.
statement = '''
UNWIND {batches} as batch
MERGE(a:TUI {name:batch.name})
ON CREATE SET a+=batch.props
'''
result = graph.run(statement, batches=list(tuis.values()))

### Concept --> Semantic types (n-->n)

In [None]:
%%time
rels = []
statement_relations = '''
UNWIND {batches} as batch
MATCH (u:CUI {name:batch.cui}), (r:TUI {name:batch.tui})
MERGE (u)-[:cui_to_tui]->(r)
'''
# MERGE (r)-[:tui_to_cui]->(u)

batch_size = 2000
for index, row in sem.iterrows():    
    rels.append({'cui': row['CUI'], 'tui': row['TUI'], 
                 })
    
    if index%batch_size == 0: 
        result = graph.run(statement_relations, batches=rels)
        rels = []
result = graph.run(statement_relations, batches=rels)

In [35]:
import operator as op
from functools import reduce

In [36]:
def ncr(n, r):
    """Return the binomial coefficient "n choose r" as a float.

    The smaller of r and n - r is used as the number of factors, so the
    products stay as short as possible. The result comes from a true division
    and is therefore a float (ncr(10, 3) gives 120.0). The arguments are not
    validated.
    """
    k = min(r, n - r)

    # n * (n - 1) * ... * (n - k + 1)
    numerator = 1
    for factor in range(n, n - k, -1):
        numerator *= factor

    # k!
    denominator = 1
    for factor in range(1, k + 1):
        denominator *= factor

    return numerator / denominator

In [37]:
# Quick check of ncr: 10 choose 3.
ncr(10,3)

120.0

In [50]:
# Accumulate scale * C(n, k) * p**k * (1 - p)**(n - k) for k = 0..n.
#
# The constant factor used to be stored in a variable called `pd`, which
# replaced the pandas alias for the rest of the kernel session and broke every
# later `pd.read_csv(...)` until pandas was imported again. It is now `scale`.
p = 0.15
n = 100
scale = 15
sump = 0
for k in range(n + 1):
    sump += ncr(n, k) * pow(p, k) * pow(1 - p, n - k) * scale

In [51]:
# Display the accumulated sum.
sump

14.99999999999996