In [None]:
!pip install owlready2==0.44

In [None]:
from owlready2 import *
from owlready2.pymedtermino2 import *
from owlready2.pymedtermino2.umls import *

In [None]:
# Back the default owlready2 world with the SQLite file 'umls.sqlite3'.
default_world.set_backend(filename='umls.sqlite3')
# Import only the listed terminologies from the UMLS Metathesaurus archive.
# NOTE(review): this presumably repeats the whole import each time the cell is run — consider
# skipping it when 'umls.sqlite3' has already been populated.
import_umls('<UMLS METATHESAURUS>.zip', terminologies=['CUI', 'SNOMEDCT_US', 'LNC',
    'ICD10PCS', 'RXNORM', 'MTHSPL', 'ATC', 'HCPCS'])
# Persist the imported terminologies.
default_world.save()

In [None]:
# Root PyMedTermino2 ontology; terminologies are looked up from it by name, then concepts by code
# (PYM[terminology][code]).
PYM = get_ontology("http://PYM/").load()

#### SIGTAP-OMOP mapping

In [None]:
import pandas as pd

In [None]:
sigtap_omop_df = pd.read_csv('<SIGTAP OMOP ANNOTATIONS>.csv',
                 dtype={'sourceCode': str, 'statusSetOn': str, 'conceptId': str, 'createdOn': str})

In [None]:
sigtap_omop_df

#### SIGTAP procedure names and descriptions

In [None]:
import re

In [None]:
data = []
with open('<SIGTAP Table Names>.txt', 'r', encoding='latin-1') as f:
    for line in f:
        line = line.split('                    ')[0]
        m = re.match(r'^(\d+)(.+)', line, re.UNICODE)
        if m:
            id_part, name_part = m.groups()
            data.append((id_part, name_part.strip()))

In [None]:
sigtap_procs_df = pd.DataFrame(data, columns=['ID', 'Name'])
sigtap_procs_df

In [None]:
data = []
with open('<SIGTAP Table Descriptions>.txt', 'r', encoding='latin-1') as f:
    for line in f:
        line = line.split('                              ')[0]
        m = re.match(r'^(\d+)(.+)', line, re.UNICODE)
        if m:
            id_part, name_part = m.groups()
            data.append((id_part, name_part.strip()))

In [None]:
sigtap_desc_df = pd.DataFrame(data, columns=['ID', 'Name'])
sigtap_desc_df

In [None]:
merged_df = sigtap_procs_df.merge(sigtap_desc_df, how='left', on='ID')

In [None]:
merged_df

In [None]:
merged_df.to_csv('sigtap_procedimentos_descricoes.csv', sep='\t', index=False)

#### OMOP vocabularies

In [None]:
omop_df = pd.read_csv('<OMOP Athena vocabularies concepts>.csv', sep='\t', dtype={'concept_id': str, 'concept_code': str})
omop_df

In [None]:
sigtap_merged = sigtap_omop_df.merge(omop_df, how='left', left_on='conceptId', right_on='concept_id')

In [None]:
annotated_grouped = sigtap_merged[sigtap_merged.mappingStatus == 'APPROVED'].groupby('sourceCode')['vocabulary_id'].agg(list).reset_index()

In [None]:
annotated_grouped[annotated_grouped['vocabulary_id'].apply(lambda x: not any(element in x for element in ['SNOMED', 'RxNorm', 'RxNorm Extension']))]

In [None]:
sigtap_merged.vocabulary_id.unique()

In [None]:
omop_df.vocabulary_id.unique()

In [None]:
# OMOP vocabulary_id -> name of the matching terminology imported from UMLS.
OMOP_UMLS_MAPPING = {
    'CUI': 'CUI',
    'SNOMED': 'SNOMEDCT_US',
    'LOINC': 'LNC',
    'ICD10PCS': 'ICD10PCS',
    'RxNorm': 'RXNORM',
    'SPL': 'MTHSPL',
    'ATC': 'ATC',
    'HCPCS': 'HCPCS',
}

In [None]:
def query_cui_code(row):
    """Add a 'CUI' entry to an OMOP concept row and return the row.

    The row's ``vocabulary_id`` is translated to a UMLS terminology name via
    ``OMOP_UMLS_MAPPING``; the concept is then fetched from ``PYM`` by the
    row's ``concept_code``. 'CUI' is set to the name of the first entry of the
    concept's ``unifieds`` list, or to ``None`` when the vocabulary is not
    mapped, the concept is not found, or it has no unified concepts.
    """
    terminology_name = OMOP_UMLS_MAPPING.get(row.vocabulary_id)
    concept = PYM[terminology_name][row.concept_code] if terminology_name else None
    unified_concepts = concept.unifieds if concept else None
    # Only the first unified concept is kept, even when several exist.
    row['CUI'] = unified_concepts[0].name if unified_concepts else None
    return row

In [None]:
from tqdm.notebook import tqdm
# Registers the progress_apply method on pandas objects (used below for the CUI lookup).
tqdm.pandas()

In [None]:
# Boolean mask of the OMOP concepts whose vocabulary is one of those looked up in UMLS.
omop_filter = omop_df.vocabulary_id.isin([
    'SNOMED',
    'LOINC',
    'ICD10PCS',
    'RxNorm',
    'SPL',
    'ATC',
    'HCPCS',
])

In [None]:
# query_cui_code returns the whole row, so applying it directly yields a full DataFrame. Extract
# only the 'CUI' value: the right-hand side is then a Series aligned on the filtered index, and no
# copy of every row is materialised just to fill one column. Rows outside the filter stay empty.
omop_df.loc[omop_filter, 'CUI'] = omop_df.loc[omop_filter].progress_apply(
    lambda row: query_cui_code(row)['CUI'], axis=1)

In [None]:
# Checkpoint the concepts with their resolved CUI as a tab-separated file (no index column);
# it is read back in the "OMOP concepts with definitions" section.
omop_df.to_csv('omop_with_cui_code.csv', sep='\t', index=False)

#### Definition extraction

In [None]:
import csv

# MRDEF.RRF is pipe-delimited and unquoted, so a '"' in a definition is ordinary text. With
# read_csv's default quoting, a field starting with '"' is parsed as a quoted field, which strips
# the quotes and can swallow the following separators or lines; QUOTE_NONE reads it literally.
# The ninth name ('Other') presumably absorbs the empty field after each line's trailing '|'.
umls_defs = pd.read_csv('<UMLS MRDEF.RRF>', sep='|', quoting=csv.QUOTE_NONE,
                        names=['CUI', 'AUI', 'ATUI', 'SATUI', 'SAB', 'DEF', 'SUPPRESS', 'CVF', 'Other'])
umls_defs

In [None]:
# Source abbreviations (SAB) whose definitions are excluded as non-English.
# NOTE(review): hand-maintained list — verify it against the SAB values present in the MRDEF release in use.
non_english = [
    'MSHPOR', 'MSHSPA', 'MSHCZE', 'MSHSWE', 'SCTSPA',
    'MSHNOR', 'MDRKOR', 'MDRSPA', 'MDRDUT', 'MDRRUS',
    'MDRSWE', 'MDRBPO', 'MDRLAV', 'MDRJPN', 'MDRGRE',
    'MDRPOL', 'MDRHUN', 'MDRGER', 'MDRCZE', 'MDRPOR',
    'MDRITA', 'MDRFRE', 'MDRARA', 'MSHFRE', 'MSHSCR',
]

In [None]:
# One row per CUI: every definition from the retained sources, concatenated with ' | '.
definitions_umls = (
    umls_defs
    .loc[~umls_defs['SAB'].isin(non_english)]
    .groupby('CUI')['DEF']
    .apply(' | '.join)
    .reset_index()
)
definitions_umls

#### OMOP concepts with definitions

In [None]:
omop_cui = pd.read_csv('omop_with_cui_code.csv', sep='\t', dtype={'concept_id': str, 'concept_code': str})

In [None]:
omop_cui_definitions = omop_cui.merge(definitions_umls, how='left', left_on='CUI', right_on='CUI')
omop_cui_definitions

In [None]:
omop_cui_definitions[~omop_cui_definitions.DEF.isna()]

In [None]:
omop_cui_definitions.to_csv('omop_cui_definitions.csv', sep='\t', index=False)

#### OMOP concepts - CUI, definitions and synonyms

In [None]:
# Load, filter and clean in one chained expression, so no column is assigned on a
# boolean-filtered slice (which raises SettingWithCopyWarning on pandas before 3.0).
omop_synonym_df = (
    pd.read_csv('<OMOP Athena concepts synonyms>.csv', sep='\t',
                dtype={'concept_id': str, 'concept_synonym_name': str, 'language_concept_id': str})
    # Language code EN = 4180186
    .loc[lambda df: df['language_concept_id'] == '4180186']
    # Missing names become '' so the synonyms can be joined as strings later.
    # NOTE(review): confirm the missing values are not literal names such as 'NA' that
    # read_csv parsed as missing.
    .assign(concept_synonym_name=lambda df: df['concept_synonym_name'].fillna(''))
)
omop_synonym_df

In [None]:
# Testing for suitable separator
omop_synonym_df[omop_synonym_df['concept_synonym_name'].str.contains('£', regex=False)]

In [None]:
# Grouping synonyms by concept ID
omop_synonyms = omop_synonym_df.groupby('concept_id')['concept_synonym_name'].apply(lambda x: ' £ '.join(x)).reset_index()
omop_synonyms

In [None]:
omop_cui_definitions = pd.read_csv('omop_cui_definitions.csv', sep='\t', dtype={'concept_id': str, 'concept_code': str})
omop_cui_definitions[~omop_cui_definitions.DEF.isna()]

In [None]:
omop_cui_defs_syns = omop_cui_definitions.merge(omop_synonyms, how='left', left_on='concept_id', right_on='concept_id')
omop_cui_defs_syns

In [None]:
omop_cui_defs_syns.to_csv('omop_cui_defs_syns.csv', sep='\t', index=False)