# Exploring a SNOMED-CT Release

In [94]:
import pandas as pd
import numpy as np
import pickle

In [95]:
snomed_dir = '/Users/shek/Desktop/medcat/SNOMED UK'

In [96]:
# Use Snapshot, instead of Full, here, as Full contains all historical concepts since 2014. Delta only contains differences from last version.
# https://confluence.ihtsdotools.org/display/DOCGLOSS/Snapshot+release

In [97]:
base_term = f'{snomed_dir}/uk_sct2cl_28.0.0_20191001000001/'
int_terminology = base_term + 'SnomedCT_InternationalRF2_PRODUCTION_20180731T120000Z/Snapshot/Terminology'
uk_ext_terminology = base_term + 'SnomedCT_UKClinicalRF2_PRODUCTION_20191001T000001Z/Snapshot/Terminology'

In [98]:
def parse_file(filename, first_row_header=True, columns=None):
    with open(filename, encoding='utf-8') as f:
        entities = [[n.strip() for n in line.split('\t')] for line in f]
        return pd.DataFrame(entities[1:], columns=entities[0] if first_row_header else columns)

In [99]:
# SNOMED-CT (UK Ed.) is an extension to the Int Ed. Both sets of files (Int. and the UK Ext.) are released as part of one 'UK Release'.

In [100]:
int_terms = parse_file(f'{int_terminology}/sct2_Concept_Snapshot_INT_20180731.txt')
uk_terms = parse_file(f'{uk_ext_terminology}/sct2_Concept_Snapshot_GB1000000_20191001.txt')
terms = pd.concat([int_terms, uk_terms])
active_terms = terms[terms.active == '1']

In [109]:
active_terms[active_terms['id'] == '199451000000106']

Unnamed: 0,id,effectiveTime,active,moduleId,definitionStatusId
16798,199451000000106,20060401,1,999000011000000103,900000000000074008


In [93]:
len(active_terms)

369742

In [79]:
int_desc = parse_file(f'{int_terminology}/sct2_Description_Snapshot-en_INT_20180731.txt')
uk_desc = parse_file(f'{uk_ext_terminology}/sct2_Description_Snapshot-en_GB1000000_20191001.txt')
descs = pd.concat([int_desc, uk_desc])
active_descs = descs[descs.active == '1']

In [86]:
active_descs.head()

Unnamed: 0,id,effectiveTime,active,moduleId,conceptId,languageCode,typeId,term,caseSignificanceId
0,101013,20170731,1,900000000000207008,126813005,en,900000000000013009,Neoplasm of anterior aspect of epiglottis,900000000000448009
1,102018,20170731,1,900000000000207008,126814004,en,900000000000013009,Neoplasm of junctional region of epiglottis,900000000000448009
2,103011,20170731,1,900000000000207008,126815003,en,900000000000013009,Neoplasm of lateral wall of oropharynx,900000000000448009
3,104017,20170731,1,900000000000207008,126816002,en,900000000000013009,Neoplasm of posterior wall of oropharynx,900000000000448009
4,105016,20170731,1,900000000000207008,126817006,en,900000000000013009,Neoplasm of esophagus,900000000000448009


In [92]:
len(active_descs['typeId']== '900000000000003001')

1385796

In [106]:
active_descs[active_descs['term'].str.match('Simple partial epileptic seizure')]

Unnamed: 0,id,effectiveTime,active,moduleId,conceptId,languageCode,typeId,term,caseSignificanceId
150502,301531000000113,20060401,1,999000011000000103,199451000000106,en,900000000000003001,Simple partial epileptic seizure (disorder),900000000000020002
150503,301541000000116,20060401,1,999000011000000103,199451000000106,en,900000000000013009,Simple partial epileptic seizure,900000000000020002
153705,330381000000118,20060401,1,999000011000000103,215071000000106,en,900000000000003001,Simple partial epileptic seizure,900000000000020002
155232,344111000000116,20060401,1,999000011000000103,215071000000106,en,900000000000013009,Simple partial epileptic seizure,900000000000020002


In [25]:
concepts_df = pd.concat(list(rows for cui, rows in active_descs[active_descs.conceptId.isin(active_terms.id)].groupby('conceptId')))

In [26]:
concepts_df[concepts_df.conceptId == '10061007']

Unnamed: 0,id,effectiveTime,active,moduleId,conceptId,languageCode,typeId,term,caseSignificanceId
16825,17560015,20020131,1,900000000000207008,10061007,en,900000000000013009,Brain stem contusion without open intracranial...,900000000000020002
503876,520861019,20020131,1,900000000000207008,10061007,en,900000000000003001,Brain stem contusion without open intracranial...,900000000000020002


In [81]:
concepts_df['tty'] = concepts_df['typeId'].apply(lambda r: 'PN' if r == '900000000000003001' else np.nan)

In [82]:
snomed_cdb_csv = concepts_df.loc[:, ['conceptId', 'term', 'tty']]
snomed_cdb_csv['conceptId'] = snomed_cdb_csv.conceptId.apply(lambda code: f'S-{code}')
snomed_cdb_csv.columns = ['cui', 'str', 'tty']
snomed_cdb_csv['onto'] = 'SNOMED-CT'

In [90]:
snomed_cdb_csv[snomed_cdb_csv['cui']=='S-64572001']

Unnamed: 0,cui,str,tty,onto
169358,S-64572001,Disease,,SNOMED-CT
179647,S-64572001,Clinical disease AND/OR syndrome,,SNOMED-CT
179648,S-64572001,Disorder,,SNOMED-CT
179649,S-64572001,Syndrome,,SNOMED-CT
179652,S-64572001,Disease AND/OR syndrome present,,SNOMED-CT
179653,S-64572001,Clinical disease AND/OR syndrome present,,SNOMED-CT
483023,S-64572001,Disorders,,SNOMED-CT
483024,S-64572001,Diseases,,SNOMED-CT
784534,S-64572001,Disease (disorder),PN,SNOMED-CT


In [30]:
snomed_cdb_csv.to_csv('snomed_cdb_csv_SNOMED-CT-UK_Release_20191001.csv')

In [354]:
snomed_cdb_csv.shape

(963529, 4)

In [350]:
# Example concept that is 'sourced' from the Int. version - Should match with NHS - SNOMED-CT Browser.
snomed_cdb_csv[snomed_cdb_csv.cui == 'S-10061007']

Unnamed: 0,cui,str,tty,onto
16825,S-10061007,Brain stem contusion without open intracranial...,,SNOMED-CT
503876,S-10061007,Brain stem contusion without open intracranial...,PN,SNOMED-CT


In [38]:
# Exkey_to_delle concept that is 'sourced' from the UK Ed. - Should match with NHS - SNOMED-CT Browser.
snomed_cdb_csv[snomed_cdb_csv.cui == 'S-865371000000104']

Unnamed: 0,cui,str,tty,onto
94365,S-865371000000104,No history of epilepsy (situation),PN,SNOMED-CT
94366,S-865371000000104,No history of epilepsy,,SNOMED-CT
94837,S-865371000000104,No H/O epilepsy,,SNOMED-CT


### Links of SNOMED-CT (UK Edition with the International version), then link back to UMLS for the synonyms

In [None]:
# Should be able to use previous approach to link SNOMED (Int only) to UMLS terms for the series of synonyms.

### ICD-10 / OPCS-4 linkages with SNOMED-CT

In [205]:
refset_terminology = f'{base_term}/SnomedCT_UKClinicalRF2_PRODUCTION_20191001T000001Z/Snapshot/Refset/Map'

In [206]:
mappings = parse_file(f'{refset_terminology}/der2_iisssciRefset_ExtendedMapSnapshot_GB1000000_20191001.txt')
mappings = mappings[mappings.active == '1']
mappings.referencedComponentId = mappings.referencedComponentId.apply(lambda s: f'S-{s}')

In [207]:
mappings.mapPriority = mappings.mapPriority.astype(int)

In [212]:
icd10_refset_id = '999002271000000101'
opcs4_refset_id = '999002741000000101'

In [215]:
%%time
cui2mappings = dict()
for cui in snomed_cdb_csv.cui.unique():
    cui_map = mappings[mappings.referencedComponentId == cui].loc[:, ['mapPriority', 'mapAdvice', 'mapTarget', 'refsetId']]
    if cui_map.shape[0] > 0:
        cui2mappings[cui] = cui_map.sort_values('mapPriority')

CPU times: user 5h 7min 53s, sys: 1min 42s, total: 5h 9min 35s
Wall time: 5h 10min 30s


In [None]:
opcs_mappings = {}
icd10_mappings= {}
for cui, mappings in cui2mappings.items():
    icd10_codes = mappings[mappings.refsetId == icd10_refset_id]
    if icd10_codes.shape[0] > 0:
        icd10_mappings[cui] = icd10_codes
    opcs_codes = mappings[mappings.refsetId == opcs4_refset_id]
    if opcs_codes.shape[0] > 0:
        opcs_mappings[cui] = opcs_codes

In [260]:
pickle.dump(opcs_mappings, open('opcs_mappings_full.pickle', 'wb'))
pickle.dump(icd10_mappings, open('icd10_mappings_full.pickle', 'wb'))

In [259]:
# condense mappings to a simple dict representation

In [272]:
def condense_mapping(cui2mappings):
    mapping_condensed = {}
    for cui, mappings in cui2mappings.items():
        mapping_condensed[cui] = mappings.mapTarget.replace('(\w\d\d)(\d*)', r'\1.\2', regex=True).tolist()
    return mapping_condensed

In [314]:
icd10_mapping_condensed = condense_mapping(icd10_mappings)

In [315]:
opcs_mapping_condensed = condense_mapping(opcs_mappings)

In [319]:
pickle.dump(icd10_mapping_condensed, open('icd10_mapping_condensed.pickle', 'wb'))
pickle.dump(opcs_mapping_condensed, open('opcs_mapping_condensed.pickle', 'wb'))

### Exploring ICD-10 Release
From NHS Digital: https://isd.digital.nhs.uk/trud3/user/authenticated/group/0/pack/1/subpack/258/releases

In [248]:
icd10_mapping_filename = '/Users/tom/phd/ICD10_Edition5_20160401/Content/ICD10_Edition5_CodesAndTitlesAndMetadata_GB_20160401.txt'
icd10_mapping_detail = parse_file(icd10_mapping_filename)
icd10_mapping_detail['full_description'] = icd10_mapping_detail.DESCRIPTION +  icd10_mapping_detail.MODIFIER_4 + icd10_mapping_detail.MODIFIER_5

In [246]:
icd10_mapping_detail.USAGE_UK.value_counts()

3    12604
0     4404
2      816
1       73
4       37
Name: USAGE_UK, dtype: int64

In [247]:
icd10_mapping_detail.USAGE.value_counts()

DEFAULT     17008
ASTERISK      816
DAGGER        110
Name: USAGE, dtype: int64

- 1: absolute dagger, with asterisk(s) in title
- 2: absolute asterisk
- 3: other (non dagger/asterisk, inclusion dagger or inclusion asterisk)
- 4: (absolute) dagger, without asterisk(s) in title See 15.1.2
- 0: not applicable (previously expressed as space in ICD-10 Metadata Specification v3.1)

In [254]:
icd10_uk_codes = {c: desc for c, desc in zip(icd10_mapping_detail.ALT_CODE, icd10_mapping_detail.full_description)}

In [321]:
pickle.dump(icd10_uk_codes, open('icd10_uk_codes.pickle', 'wb'))

### Exploring OPCS-4 Mapping Release

In [334]:
opcs_filename = '/Users/tom/phd/nhs_opcs4df_8.0.0_20161101000001/OPCS48 CodesAndTitles Nov 2016 V1.0.txt'
opcs_desc_df = parse_file(opcs_filename, first_row_header=False, columns=['code', 'desc'])

In [346]:
opcs_desc_df = {code: desc for code, desc in zip(opcs_desc_df.code, opcs_desc_df.desc)}

In [347]:
pickle.dump(opcs_desc_df, open('opcs_codes_desc.pickle', 'wb'))