In [19]:
# One-time environment setup, kept for reference and intentionally left disabled:
# !conda install sqlalchemy --y

In [1]:
import os

import pandas as pd
import sqlalchemy as sql

In [2]:
# Database URL for the local UMLS MySQL instance.
# The URL is read from the UMLS_DB_URL environment variable so that no
# username/password is stored in the notebook. Set it before starting Jupyter:
#   export UMLS_DB_URL='mysql://<user>:<password>@localhost/umls'
# The fallback contains no credentials; it only works when the MySQL client
# can authenticate on its own (e.g. through its option files).
connect_string = os.environ.get('UMLS_DB_URL', 'mysql://localhost/umls')

In [3]:
# Build the SQLAlchemy engine that the pd.read_sql_query calls below use as
# their connection.
sql_engine = sql.create_engine(connect_string)

## Concept

In [70]:
# Earlier drafts of the concept query, kept for reference:
#
#   SELECT CUI, TTY, SUI, STR, AUI, SCUI FROM MRCONSO
#   WHERE SUPPRESS='N' AND LAT = 'ENG' AND TS = 'P' AND ISPREF = 'Y' AND SAB ='SNOMEDCT_US'
#
#   SELECT MRCONSO.CUI, TTY, SUI, STR, AUI, TUI, STN, STY FROM MRCONSO
#   left join MRSTY on MRCONSO.CUI = MRSTY.CUI
#   WHERE SUPPRESS='N' AND LAT = 'ENG' AND TS = 'P' AND ISPREF = 'Y' AND SAB ='SNOMEDCT_US'
#
# Column list noted alongside those drafts: CUI, TTY, SUI, STR, AUI, SCUI, SAB
#
# Current extract: every MRCONSO column for rows whose SAB is 'SNOMEDCT_US'
# and whose TTY is 'FN'.
query = "SELECT * FROM MRCONSO WHERE SAB ='SNOMEDCT_US' and TTY='FN'"

# Run the query through the shared engine and keep the result as `df`.
df = pd.read_sql_query(sql=query, con=sql_engine)

In [73]:
# Spot-check three random rows (no seed is set, so the rows differ per run).
df.sample(n=3)

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
220020,C4551686,ENG,S,L3022358,PF,S3404127,Y,A3550159,662333013,269469005,,SNOMEDCT_US,FN,269469005,Malignant neoplasm of soft tissue (disorder),9,N,
161290,C1288862,ENG,S,L2856795,PF,S3279500,Y,A3417639,831970015,368355004,,SNOMEDCT_US,FN,368355004,"Entire articular surface, interphalangeal, of ...",9,N,
39009,C0273610,ENG,S,L2781134,PF,S14014361,Y,A22998021,2959549016,84278006,,SNOMEDCT_US,FN,84278006,Abrasion and/or friction burn of neck with inf...,9,N,


In [74]:
# Distribution of the SUPPRESS flag across the extracted concept rows.
df['SUPPRESS'].value_counts()

N    346950
Name: SUPPRESS, dtype: int64

In [75]:
# (rows, columns) of the concept extract.
df.shape

(346950, 18)

In [76]:
# Persist the concept extract as CSV.
# The output folder is created first because to_csv does not create missing
# directories, so a fresh checkout would otherwise fail here.
# index=False is the documented boolean for leaving the row index out of the
# file (the previous index=None only worked because None is falsy).
os.makedirs('data', exist_ok=True)
df.to_csv('data/concepts.csv', index=False)

In [77]:
# Read the concept extract back from the CSV file. read_csv infers the column
# dtypes again, so they may differ from those of the SQL result.
con = pd.read_csv('data/concepts.csv')

In [78]:
# Move CUI out of the columns and use it as the row index.
# Not re-runnable on its own: once CUI is the index, the column no longer exists.
con = con.set_index(keys='CUI')

In [79]:
# Order the rows by their CUI index label.
con = con.sort_index()

In [80]:
# (rows, columns) now that CUI is the index rather than a column.
con.shape

(346950, 17)

In [81]:
# Frequency of each STT value among the reloaded concept rows.
con['STT'].value_counts()

PF     346415
VO        456
VC         53
VCW        16
VW         10
Name: STT, dtype: int64

## Some concepts have multiple strings

In [82]:
# Look up one CUI by index label. .loc yields a Series when the label matches
# a single row and a DataFrame when several rows share that label.
con.loc['C1635169']

LAT                                             ENG
TS                                                S
LUI                                        L5854420
STT                                              PF
SUI                                        S6690479
ISPREF                                            Y
AUI                                        A9410183
SAUI                                     2574432014
SCUI                                      418342002
SDUI                                            NaN
SAB                                     SNOMEDCT_US
TTY                                              FN
CODE                                      418342002
STR         Microliter/milliliter (qualifier value)
SRL                                               9
SUPPRESS                                          N
CVF                                             NaN
Name: C1635169, dtype: object

In [83]:
# Inspect the sorted CUI index.
con.index

Index(['C0000052', 'C0000097', 'C0000102', 'C0000163', 'C0000167', 'C0000172',
       'C0000215', 'C0000220', 'C0000246', 'C0000248',
       ...
       'C4721329', 'C4721330', 'C4721331', 'C4721332', 'C4721333', 'C4721334',
       'C4721335', 'C4721336', 'C4721337', 'C4721338'],
      dtype='object', name='CUI', length=346950)

In [84]:
# Disabled check for repeated rows. Note that DataFrame.duplicated compares
# column values only; the CUI index is not part of the comparison.
# con.duplicated()

## Relation 

### All relationships

In [None]:
# Earlier draft, which filtered suppressed rows in SQL:
#   select CUI1, AUI1, REL, CUI2, AUI2, RELA, RG from umls.MRREL
#   where SAB ='SNOMEDCT_US' AND DIR='Y' AND SUPPRESS='N'
#
# Current query: no SUPPRESS filter; the flag is returned as a column instead.
query = "select CUI1, AUI1, REL, CUI2, AUI2, RELA, RG, SUPPRESS from umls.MRREL where SAB ='SNOMEDCT_US' AND DIR='Y' "

# Note: this rebinds `df`, replacing the concept extract loaded earlier.
df = pd.read_sql_query(sql=query, con=sql_engine)

In [None]:
# Disabled: would load the SRDEF rows with rt = 'RL' into rel_type
# (presumably the relation-type definitions -- confirm before re-enabling).
# query = "SELECT * FROM umls.SRDEF WHERE rt = 'RL'"
# rel_type = pd.read_sql_query(query, sql_engine)

In [None]:
# (rows, columns) of the raw relation extract.
df.shape

In [None]:
# Peek at the first rows of the relation extract.
df.head()

In [None]:
# The relation query does not filter on SUPPRESS, so check which flag values
# came back and how often.
df['SUPPRESS'].value_counts()

## Remove duplicated relations

In [67]:
# Order the relations by source concept, then by target concept.
df = df.sort_values(['CUI1', 'CUI2'])

In [68]:
# First rows after sorting by (CUI1, CUI2).
df.head()

Unnamed: 0,CUI1,AUI1,REL,CUI2,AUI2,RELA,RG,SUPPRESS
353112,C0000039,A22817493,RO,C0523614,A3120482,has_component,0,O
670860,C0000039,A22817493,RO,C0523614,A3120482,has_measured_component,0,O
1110815,C0000052,A27769867,RO,C0523417,A3087961,has_measured_component,0,O
1004877,C0000052,A27769867,RO,C0523417,A3087961,has_component,0,N
329867,C0000097,A3230610,RO,C0270730,A2972013,has_causative_agent,0,N


In [None]:
# Rows that repeat an earlier (CUI1, CUI2, RELA) combination.
df.loc[df.duplicated(subset=['CUI1', 'CUI2', 'RELA'])]

In [None]:
# Keep one row per (CUI1, CUI2, RELA): the first one in the current row order.
# NOTE(review): AUI1/AUI2/RG/SUPPRESS of the dropped rows are discarded --
# confirm that the first row is the one that should survive.
df = df.drop_duplicates(subset=['CUI1', 'CUI2', 'RELA'], keep='first')

In [None]:
# (rows, columns) remaining after de-duplication.
df.shape

In [None]:
# Persist the de-duplicated relations as CSV.
# The output folder is created first because to_csv does not create missing
# directories, so a fresh checkout would otherwise fail here.
# index=False is the documented boolean for leaving the row index out of the
# file (the previous index=None only worked because None is falsy).
os.makedirs('data', exist_ok=True)
df.to_csv('data/relations.csv', index=False)

In [42]:
# Read the relation table back from its CSV file; dtypes are re-inferred by
# read_csv.
rel = pd.read_csv('data/relations.csv')

In [43]:
# (rows, columns) of the reloaded relation table.
rel.shape

(2239172, 7)

In [None]:
# Use the (CUI1, CUI2) pair as a two-level row index.
# Not re-runnable on its own: once set, the two columns no longer exist.
rel = rel.set_index(keys=['CUI1', 'CUI2'])

In [None]:
# Sort by the (CUI1, CUI2) index so that pair lookups operate on an ordered
# MultiIndex.
rel = rel.sort_index()

In [None]:
# All relation rows stored for one (CUI1, CUI2) pair.
rel.loc[('C0348025', 'C3697449')]

In [None]:
# Rows whose column values repeat an earlier row. duplicated() already returns
# a boolean mask, and the (CUI1, CUI2) index is not part of the comparison.
rel.loc[rel.duplicated()]

In [None]:
# Frequency of each RELA label in the relation table.
rel['RELA'].value_counts()

## Semantic Type

In [None]:
# Earlier draft, restricted to the narrower concept filter:
#   select * from MRSTY where CUI in
#   (select distinct CUI from MRCONSO WHERE SUPPRESS='N' AND LAT = 'ENG'
#    AND TS = 'P' AND ISPREF = 'Y' AND SAB ='SNOMEDCT_US')
#
# Current query: MRSTY rows for every CUI that has a MRCONSO row with
# SAB 'SNOMEDCT_US'.
query = """
select * from MRSTY where CUI in 
(select distinct CUI from MRCONSO WHERE SAB ='SNOMEDCT_US')
"""

# Note: this rebinds `df` once more, now to the semantic-type rows.
df = pd.read_sql_query(sql=query, con=sql_engine)

In [None]:
# Peek at the first semantic-type rows as returned by the query.
df.head()

In [None]:
# Narrow the frame to the four columns that are written to disk.
df = df.loc[:, ['CUI', 'TUI', 'STN', 'STY']]

In [None]:
# First rows after the column selection.
df.head()

In [None]:
# Persist the semantic-type table as CSV.
# The output folder is created first because to_csv does not create missing
# directories, so a fresh checkout would otherwise fail here.
# index=False is the documented boolean for leaving the row index out of the
# file (the previous index=None only worked because None is falsy).
os.makedirs('data', exist_ok=True)
df.to_csv('data/semantype.csv', index=False)