In [1]:
import pandas as pd
import numpy as np
import json
import re
import os

### Choosing file type

| Edition | Total names | NL names | Description |
| - | - | - | - |
| Edition | 2263140 | 736619 | Include international SNOMED |
| Extension | 770016 | 736619 | Some terms are only in English |
| Patient Friendly | 1437 | 1437 | Small but potentially useful list of synonyms |

The "Edition" contains also English names, so for our purpose we will focus on the "Extention" and "Patient Friendly" terms.

In [2]:
def parse_file(filename):
    with open(filename, encoding='utf-8') as f:
        entities = [[n.strip() for n in line.split('\t')] for line in f]
        return pd.DataFrame(entities[1:], columns=entities[0])

In [3]:
snomed_dir = '/Users/stan3/Data/snomed'
nl_ed_term = os.path.join(snomed_dir, 'SnomedCT_Netherlands_EditionRelease_PRODUCTION_20200930T120000Z/Snapshot/Terminology/sct2_Description_Snapshot_NL_20200930.txt')
nl_ex_term = os.path.join(snomed_dir, 'SnomedCT_Netherlands_ExtensionRelease_PRODUCTION_20200930T120000Z/Snapshot/Terminology/sct2_Description_Snapshot_NL_20200930.txt')
nl_pf_term = os.path.join(snomed_dir, 'SnomedCT_Netherlands_PatientFriendlyExtensionRelease_PRODUCTION_20200930T120000Z/Snapshot/Terminology/sct2_Description_Snapshot_NL-PatientFriendly_20200930.txt')

### Description type

|Type id|Term|
|-|-|
|900000000000003001|Fully specified name|
|900000000000013009|Synonym|
|900000000000550004|Definition|

We will use the FSN and Synonyms, as well as only dutch terms and terms that are active.

In [4]:
df_ex = parse_file(nl_ex_term)
df_ex = df_ex[(df_ex.languageCode == 'nl') & (df_ex.active == '1')]
# Inactive: 34481
# Active: 702138

df_pf = parse_file(nl_pf_term)
df_pf = df_pf[(df_pf.typeId == '900000000000013009') & (df_pf.active == '1')]
# Inactive: 149
# Active: 1284

# Extract fully specified names
df_fsn = df_ex[(df_ex.typeId == '900000000000003001')]
df_fsn.shape
# Inactive: 15211, 9
# Active: 231646, 9

df_ex_synonyms = df_ex[(df_ex.typeId == '900000000000013009')]

# Extract names and synonyms
df_synonyms = pd.concat([df_ex[(df_ex.typeId == '900000000000013009')],
                         df_pf])

### Primary concepts

In [5]:
df_primary_concepts = df_fsn.copy()
df_primary_concepts['tui'] = df_primary_concepts['term'].str.extract(r"\(([^)]*)\)[^(]*$")
df_primary_concepts['str'] = df_primary_concepts['term'].str.extract(r"(^[^\(]+)")
df_primary_concepts['str'] = df_primary_concepts['str'].str.strip()
df_primary_concepts = df_primary_concepts[['conceptId', 'typeId', 'tui', 'str']]

# Create CUI-TUI mapping
cui_tui_mapping = dict(zip(df_primary_concepts.conceptId, df_primary_concepts.tui))
df_primary_concepts.head()

Unnamed: 0,conceptId,typeId,tui,str
4,286705009,900000000000003001,bevinding,weinig neiging tot schrikken
5,364985008,900000000000003001,bevinding,bevinding betreffende geleidingspatroon van se...
9,299304009,900000000000003001,bevinding,verminderde 'range of motion' bij passieve exo...
15,278238002,900000000000003001,bevinding,focale vertraging van gemengde zenuwgeleiding
17,76369008,900000000000003001,bevinding,onaangepaste persoonlijkheidstrek


### Synonyms

In [6]:
# Clean synonym table
df_synonym_concepts = df_synonyms.copy()
df_synonym_concepts = df_synonym_concepts[['conceptId', 'term', 'typeId']]
df_synonym_concepts.rename({'term': 'str'}, inplace=True, axis=1)

# Add TUI to synonyms
df_synonym_concepts['tui'] = df_synonym_concepts.conceptId.map(cui_tui_mapping)
# df_synonym_concepts[df_synonym_concepts['tui'].isna()].shape
# 67 synonyms without type

df_synonym_concepts

Unnamed: 0,conceptId,str,typeId,tui
0,702115003,Luer-connector voor intraveneuze lijn en dosee...,900000000000013009,fysiek object
1,21426000,aandoening van membrana tympani,900000000000013009,aandoening
2,702115003,Luer-koppelstuk voor intraveneuze lijn en inje...,900000000000013009,fysiek object
3,286705009,weinig neiging tot schrikken,900000000000013009,bevinding
6,364985008,bevinding betreffende geleidingspatroon van se...,900000000000013009,bevinding
...,...,...,...,...
1432,424489006,mantouxtest,900000000000013009,verrichting
1433,239987006,haperende vinger,900000000000013009,aandoening
1434,434912009,bloedsuikerspiegel,900000000000013009,waarneembare entiteit
1435,395142003,allergietesten,900000000000013009,verrichting


### Combined

In [7]:
# Create combined 
df_all = pd.concat([df_primary_concepts, df_synonym_concepts]).reset_index(drop=True)
df_all.rename({'typeId': 'tty', 'conceptId': 'cui'}, inplace=True, axis=1)
df_all.tty.replace({'900000000000003001': 'PN',
                             '900000000000013009': 'SY'}, inplace=True)
df_all['sab'] = 'SNOMED-CT-NL'
df_all = df_all[['cui', 'str', 'tty', 'tui', 'sab']]

# Drop synonyms that are the same as primary name
print(df_all.shape)
df_all_unique = df_all.drop(df_all[(df_all.duplicated(subset=['cui', 'str', 'tui'], keep=False)) & (df_all.tty=='SY')].index)

# Sort column on cui and tty
df_all_unique['cui'] = df_all_unique['cui'].astype(int)
df_all_unique.sort_values(['cui', 'tty'], inplace=True)

df_all_unique.head()

(703422, 5)


Unnamed: 0,cui,str,tty,tui,sab
143253,103007,eekhoorn-fibroomvirus,PN,organisme,SNOMED-CT-NL
63097,104001,excisie van afwijkend weefsel van patella,PN,verrichting,SNOMED-CT-NL
361851,104001,excisie van laesie van knieschijf,SY,verrichting,SNOMED-CT-NL
18099,106004,structuur van posterieure carpale regio,PN,lichaamsstructuur,SNOMED-CT-NL
270415,106004,posterieur gebied van handwortel,SY,lichaamsstructuur,SNOMED-CT-NL


### Examples

In [8]:
df_all_unique[df_all_unique.str == 'ALS']

Unnamed: 0,cui,str,tty,tui,sab
492902,86044005,ALS,SY,aandoening,SNOMED-CT-NL


In [9]:
df_all_unique[df_all_unique.cui == '86044005']

Unnamed: 0,cui,str,tty,tui,sab


In [10]:
df_all_unique[df_all_unique.str == 'longkanker']

Unnamed: 0,cui,str,tty,tui,sab
702967,93880001,longkanker,SY,aandoening,SNOMED-CT-NL


In [11]:
df_all_unique[df_all_unique.cui == '93880001']

Unnamed: 0,cui,str,tty,tui,sab


In [12]:
df_all_unique[df_all_unique.cui == '22298006']

Unnamed: 0,cui,str,tty,tui,sab


In [13]:
df_all_unique[df_all_unique.str == 'methotrexaat']

Unnamed: 0,cui,str,tty,tui,sab
113111,387381009,methotrexaat,PN,substantie,SNOMED-CT-NL


In [14]:
df_all_unique[df_all_unique.cui == 387381009]

Unnamed: 0,cui,str,tty,tui,sab
113111,387381009,methotrexaat,PN,substantie,SNOMED-CT-NL
475744,387381009,MTX,SY,substantie,SNOMED-CT-NL


## Evaluation of SNOMED types

Find which types are present, and then manually check the performance of the types by looking at a set of cardiology letters.

In [25]:
df_all_unique.tui.value_counts()

aandoening                     152084
verrichting                    113231
lichaamsstructuur               88652
bevinding                       38894
organisme                       23197
fysiek object                   13162
substantie                      10435
afwijkende morfologie            9640
situatie                         7017
regime/therapie                  4980
gebeurtenis                      3522
monster                          3451
beroep                           3265
kwalificatiewaarde               2595
cel                              1607
omgeving                         1583
celstructuur                     1107
persoon                           679
waarneembare entiteit             572
eigenschap                        553
gegevensobject                    456
metadata                          388
fysische kracht                   285
religie/filosofie                 260
attribuut                         140
foundation metadata concept       117
product     

In [31]:
# Total number of concepts
df_all_unique.shape[0]

482163

In [33]:
# Number of primary concepts
df_all_unique[df_all_unique.tty == 'PN'].shape

(231646, 5)

In [36]:
# Number of synonyms
df_all_unique[df_all_unique.tty == 'SY'].shape

(250517, 5)

| tui | summary | good examples | bad examples |
| :- | :- | :- | :-|
|aandoening |good|hypertensie, boezemfibrilleren, av-blok| |
|monster|good|trombocyten, leukocyten,basofiele granulocyten||
|regime/therapie|good|fysiotherapie, hartrevalidatie, therapie||
|waarneembare entities|good|leeftijd, bloeddruk, hartas, LVEF||
|bevinding|good|koorts, zwelling, tachycardie||
|attribuut|good|bij, na||
|kwalificatiewaarde|good|ontslag, beloop, gestaakt, geen||
|cel|good, but rare|erythrocytes||
|gegevensobject|good, but rare| Echocardiogram, operatieverslag||
|sociaal concept|good, but rare|familie||
|situatie|good, but includes negation|geen pijn, geen dispneu, geen hoesten||
|verrichting |decent|lokale anesthesie, lichamelijk onderzoek, palpatie|erg (elektroretinografie), weken, post (peritoneale transfer van eicel en sperma), beleid (management)|
|substantie|decent|nebivolol, amlodipine, ceftriaxon|wortel, PM (fijnstof)|
|omgeving|decent|ziekenhuis, polikliniek, huis, afdeling cardiologie, afdeling fysiologie|meer, stroop, plaats|
|lichaamsstructuur|decent|pols, aortaklep, AV-knoop|mid (mesioincisodistale vlakken van gebitselement)|
|persoon|decent|patient, dochter, vader|bekende (kennis)|
|fysische kracht|decent, but rare|druk||
|fysiek object|discutable, but rare|pacemaker|verband|
|afwijkende morfologie|bad|||
|beroep|bad||rechter, belang (behanger), herkende (werkende)|
|metadata|bad||beeld|
|gebeurtenis|bad, only 1 term matched||het (hoog energetisch trauma)|

Terms that were never found: omgeving/locatie, organisme, physical object: fout, religie/filosofie, product, disorder, navigatieconcept, lifestyle, procedure, gradering, tumorgradering, beoordelingsschaal, inactief concept, speciaal concept, ras, foundation metadata concept, physical object, eigenschap, celstructuur

In [19]:
# Define list of relevant terms for general use
relevant_tuis = ['aandoening',
                'monster',
                'regime/therapie',
                'bevinding',
                'cel',
                'gegevensobject',
                'verrichting',
                'substantie',
                'lichaamsstructuur']

In [27]:
df_all_unique[df_all_unique.tui.isin(relevant_tuis)].tui.value_counts()

aandoening           152084
verrichting          113231
lichaamsstructuur     88652
bevinding             38894
substantie            10435
regime/therapie        4980
monster                3451
cel                    1607
gegevensobject          456
Name: tui, dtype: int64

In [28]:
df_all_unique[df_all_unique.tui.isin(relevant_tuis)].shape[0]

413790

## Output

In [20]:
# Save to file
df_all_unique.to_csv('snomedct-dutch_v1.0-complete.csv', index=False)
df_all_unique[df_all_unique.tui.isin(relevant_tuis)].to_csv('snomedct-dutch_v1.0.csv', index=False)