In [1]:
import pandas as pd
import numpy as np
import json
import re
import os


## Sources files

To count total number of records in the SNOMED files, use `wc -l` and substract 1 for header.
```bash
wc -l SnomedCT_Netherlands_*_PRODUCTION_20210930T120000Z/Snapshot/Terminology/sct2_Description_*.txt
```

To count number of NL records in the SNOMED files, use `grep` and `wc -l`.
```bash
grep "\tnl\t" SnomedCT_Netherlands_EditionRelease_PRODUCTION_20210930T120000Z/Snapshot/Terminology/sct2_Description_Snapshot_NL_20210930.txt | wc -l
```

### SNOMED September 2020 release
| Edition | Total names | NL names | Description |
| - | - | - | - |
| Edition | 2263140 | 736619 | Include international SNOMED |
| Extension | 770016 | 736619 | Some terms are only in English |
| Patient Friendly | 1437 | 1437 | Small but potentially useful list of synonyms |

### SNOMED March 2021 release

| Edition | Total names | NL names | Description |
| - | - | - | - |
| Edition | 2422738 | 880806 | Include international SNOMED |
| Extension | 916553 | 880806 | Some terms are only in English |
| Patient Friendly | 2004 | 2004 | Small but potentially useful list of synonyms |

### SNOMED September 2021 release

| Edition | Total names | NL names | Description |
| - | - | - | - |
| Edition | 2469845 | 910228 | Include international SNOMED |
| Extension | 948571 | 910228 | Some terms are only in English |
| Patient Friendly | 2385 | 2385 | Small but potentially useful list of synonyms |

The "Edition" contains many English names, so for our Dutch concept table we will use the "Extention" and "Patient Friendly" terms.

In [2]:
def parse_file(filename):
    with open(filename, encoding='utf-8') as f:
        entities = [[n.strip() for n in line.split('\t')] for line in f]
        return pd.DataFrame(entities[1:], columns=entities[0])

In [3]:
snomed_dir = '/Users/stan3/Data/snomed'
release_date = '20210930'
nl_ed_term = os.path.join(snomed_dir, f'SnomedCT_Netherlands_EditionRelease_PRODUCTION_{release_date}T120000Z/Snapshot/Terminology/sct2_Description_Snapshot_NL_{release_date}.txt')
nl_ex_term = os.path.join(snomed_dir, f'SnomedCT_Netherlands_ExtensionRelease_PRODUCTION_{release_date}T120000Z/Snapshot/Terminology/sct2_Description_Snapshot_NL_{release_date}.txt')
nl_pf_term = os.path.join(snomed_dir, f'SnomedCT_Netherlands_PatientFriendlyExtensionRelease_PRODUCTION_{release_date}T120000Z/Snapshot/Terminology/sct2_Description_Snapshot_NL-PatientFriendly_{release_date}.txt')

### Description type
The SNOMED description table contains 3 types:

|Type id|Term|
|-|-|
|900000000000003001|Fully specified name (FSN)|
|900000000000013009|Synonym|
|900000000000550004|Definition|

The the purpose of creating a list of names for entity recognation, terms must be:
- FSN or Synonym
- Active
- Dutch

In [4]:
# Select active terms from Extention release
df_ex = parse_file(nl_ex_term)
df_ex_active = df_ex[(df_ex.languageCode == 'nl') & (df_ex.active == '1')].copy()
df_ex_inactive = df_ex[(df_ex.languageCode == 'nl') & (df_ex.active != '1')]
print(f'Extention Release Active: {df_ex_active.shape[0]}')
print(f'Extention Release Inactive: {df_ex_inactive.shape[0]}\n')

# Extract fully specified names
df_fsn = df_ex_active[(df_ex_active.typeId == '900000000000003001')]
print(f'Extention Release Active FSN: {df_fsn.shape[0]}')

# Extract synonyms
df_ex_synonyms = df_ex_active[(df_ex_active.typeId == '900000000000013009')].copy()
print(f'Extention Release Active Synonyms: {df_ex_synonyms.shape[0]}\n')

# Select active terms from patient friendly extention release
df_pf = parse_file(nl_pf_term)
df_pf_active = df_pf[(df_pf.typeId == '900000000000013009') & (df_pf.active == '1')].copy()
df_pf_inactive = df_pf[(df_pf.typeId == '900000000000013009') & (df_pf.active == '0')]
print(f'Patient Friendly Extention release Active: {df_pf_active.shape[0]}')
print(f'Patient Friendly Extention release Inactive: {df_pf_inactive.shape[0]}\n')

# Merge synonyms and patient friendly terms as synonyms in our Concept table
df_synonyms = pd.concat([df_ex_synonyms,
                         df_pf_active])
print(f'Total synonyms: (Extention Release synonyms & Patient Friendly): {df_synonyms.shape[0]}')

Extention Release Active: 848195
Extention Release Inactive: 62033

Extention Release Active FSN: 274887
Extention Release Active Synonyms: 573308

Patient Friendly Extention release Active: 2168
Patient Friendly Extention release Inactive: 210

Total synonyms: (Extention Release synonyms & Patient Friendly): 575476


### Primary concepts

In [5]:
df_primary_concepts = df_fsn.copy()
df_primary_concepts['tui'] = df_primary_concepts['term'].str.extract(r"\(([^)]*)\)[^(]*$")
df_primary_concepts['str'] = df_primary_concepts['term'].str.extract(r"(^[^\(]+)")
df_primary_concepts['str'] = df_primary_concepts['str'].str.strip()
df_primary_concepts = df_primary_concepts[['conceptId', 'typeId', 'tui', 'str']]

# Create CUI-TUI mapping
cui_tui_mapping = dict(zip(df_primary_concepts.conceptId, df_primary_concepts.tui))
df_primary_concepts.head()

Unnamed: 0,conceptId,typeId,tui,str
4,697920006,900000000000003001,aandoening,pulmonale hypertensie bij systeemziekte
7,286705009,900000000000003001,bevinding,weinig neiging tot schrikken
8,364985008,900000000000003001,bevinding,bevinding betreffende geleidingspatroon van se...
12,299304009,900000000000003001,bevinding,verminderde 'range of motion' bij passieve exo...
18,278238002,900000000000003001,bevinding,focale vertraging van gemengde zenuwgeleiding


### Synonyms

In [6]:
# Clean synonym table
df_synonym_concepts = df_synonyms.copy()
df_synonym_concepts = df_synonym_concepts[['conceptId', 'term', 'typeId']]
df_synonym_concepts.rename({'term': 'str'}, inplace=True, axis=1)

# Add TUI to synonyms
df_synonym_concepts['tui'] = df_synonym_concepts.conceptId.map(cui_tui_mapping)
# df_synonym_concepts[df_synonym_concepts['tui'].isna()].shape
# 67 synonyms without type

df_synonym_concepts

Unnamed: 0,conceptId,str,typeId,tui
0,702115003,Luer-connector voor intraveneuze lijn en dosee...,900000000000013009,fysiek object
1,21426000,aandoening van membrana tympani,900000000000013009,aandoening
2,702115003,Luer-koppelstuk voor intraveneuze lijn en inje...,900000000000013009,fysiek object
3,697920006,pulmonale hypertensie bij systeemziekte,900000000000013009,aandoening
5,697920006,pulmonale hypertensie bij systemische ziekte,900000000000013009,aandoening
...,...,...,...,...
2380,424489006,mantouxtest,900000000000013009,verrichting
2381,239987006,haperende vinger,900000000000013009,aandoening
2382,434912009,bloedsuikerspiegel,900000000000013009,waarneembare entiteit
2383,395142003,allergietesten,900000000000013009,verrichting


### Combined

In [7]:
# Create combined 
df_all = pd.concat([df_primary_concepts, df_synonym_concepts]).reset_index(drop=True)
df_all.rename({'typeId': 'tty', 'conceptId': 'cui'}, inplace=True, axis=1)

# Map to MedCAT's P (Preferred term) & A values
# See https://github.com/CogStack/MedCAT/blob/master/examples/README.md
df_all.tty.replace({'900000000000003001': 'P',
                    '900000000000013009': 'A'}, inplace=True)

# Use convention in UMLS where default English SNOMED is called SNOMEDCT_US
df_all['sab'] = 'SNOMEDCT_NL'
df_all = df_all[['cui', 'str', 'tty', 'tui', 'sab']]

# Drop synonyms that are the same as primary name
print(f'Records before dropping duplicates: {df_all.shape[0]}')
df_all_unique = df_all.drop(df_all[(df_all.duplicated(subset=['cui', 'str', 'tui'], keep=False)) & (df_all.tty=='A')].index)
print(f'Records after dropping duplicates: {df_all_unique.shape[0]}')

# Sort column on cui and tty
df_all_unique['cui'] = df_all_unique['cui'].astype(int)
df_all_unique.sort_values(['cui', 'tty'], inplace=True)

df_all_unique.head()

Records before dropping duplicates: 850363
Records after dropping duplicates: 586523


Unnamed: 0,cui,str,tty,tui,sab
186978,103007,eekhoorn-fibroomvirus,P,organisme,SNOMEDCT_NL
510540,104001,excisie van laesie van knieschijf,A,verrichting,SNOMEDCT_NL
110508,104001,excisie van afwijkend weefsel van patella,P,verrichting,SNOMEDCT_NL
390784,106004,posterieur gebied van handwortel,A,lichaamsstructuur,SNOMEDCT_NL
390786,106004,posterieur carpaal gebied,A,lichaamsstructuur,SNOMEDCT_NL


### Examples

In [8]:
df_all_unique[df_all_unique.str == 'ALS']

Unnamed: 0,cui,str,tty,tui,sab
637835,86044005,ALS,A,aandoening,SNOMEDCT_NL


In [9]:
df_all_unique[df_all_unique.cui == 86044005]

Unnamed: 0,cui,str,tty,tui,sab
637835,86044005,ALS,A,aandoening,SNOMEDCT_NL
261252,86044005,amyotrofische laterale sclerose,P,aandoening,SNOMEDCT_NL


In [10]:
df_all_unique[df_all_unique.str == 'longkanker']

Unnamed: 0,cui,str,tty,tui,sab
849907,93880001,longkanker,A,aandoening,SNOMEDCT_NL


In [11]:
df_all_unique[df_all_unique.cui == 93880001]

Unnamed: 0,cui,str,tty,tui,sab
849907,93880001,longkanker,A,aandoening,SNOMEDCT_NL
76992,93880001,primair maligne neoplasma van long,P,aandoening,SNOMEDCT_NL


In [12]:
df_all_unique[df_all_unique.cui == 22298006]

Unnamed: 0,cui,str,tty,tui,sab
651747,22298006,hartinfarct,A,aandoening,SNOMEDCT_NL
651757,22298006,MI,A,aandoening,SNOMEDCT_NL
651763,22298006,hartaanval,A,aandoening,SNOMEDCT_NL
172002,22298006,myocardinfarct,P,aandoening,SNOMEDCT_NL


In [13]:
df_all_unique[df_all_unique.str == 'methotrexaat']

Unnamed: 0,cui,str,tty,tui,sab
157795,387381009,methotrexaat,P,substantie,SNOMEDCT_NL


In [14]:
df_all_unique[df_all_unique.cui == 387381009]

Unnamed: 0,cui,str,tty,tui,sab
621163,387381009,MTX,A,substantie,SNOMEDCT_NL
157795,387381009,methotrexaat,P,substantie,SNOMEDCT_NL


## Evaluation of SNOMED types

Find which types are present, and then manually check the performance of the types by looking at a set of cardiology letters.

In [15]:
df_all_unique.tui.value_counts()

aandoening                       185873
verrichting                      120781
lichaamsstructuur                100172
bevinding                         67630
organisme                         24325
fysiek object                     20664
situatie                          12096
substantie                        10734
afwijkende morfologie             10231
waarneembare entiteit              6126
regime/therapie                    5298
monster                            3917
gebeurtenis                        3603
beroep                             3454
kwalificatiewaarde                 3015
cel                                1624
omgeving                           1622
celstructuur                       1118
farmaceutisch product               956
persoon                             686
eigenschap                          558
gegevensobject                      473
metadata                            402
fysische kracht                     287
religie/filosofie                   260


In [16]:
# Total number of concepts
df_all_unique.shape[0]

586523

In [17]:
# Number of primary concepts
df_all_unique[df_all_unique.tty == 'PN'].shape

(0, 5)

In [18]:
# Number of synonyms
df_all_unique[df_all_unique.tty == 'SY'].shape

(0, 5)

| tui | summary | good examples | bad examples |
| :- | :- | :- | :-|
|aandoening |good|hypertensie, boezemfibrilleren, av-blok| |
|monster|good|trombocyten, leukocyten,basofiele granulocyten||
|regime/therapie|good|fysiotherapie, hartrevalidatie, therapie||
|waarneembare entities|good|leeftijd, bloeddruk, hartas, LVEF||
|bevinding|good|koorts, zwelling, tachycardie||
|attribuut|good|bij, na||
|kwalificatiewaarde|good|ontslag, beloop, gestaakt, geen||
|afwijkende morfologie|good|thermisch letsel, blaar, luxatie||
|cel|good, but rare|erythrocytes||
|gegevensobject|good, but rare| Echocardiogram, operatieverslag||
|sociaal concept|good, but rare|familie||
|situatie|good, but includes negation|geen pijn, geen dispneu, geen hoesten||
|verrichting |decent|lokale anesthesie, lichamelijk onderzoek, palpatie|erg (elektroretinografie), weken, post (peritoneale transfer van eicel en sperma), beleid (management)|
|substantie|decent|nebivolol, amlodipine, ceftriaxon|wortel, PM (fijnstof)|
|omgeving|decent|ziekenhuis, polikliniek, huis, afdeling cardiologie, afdeling fysiologie|meer, stroop, plaats|
|lichaamsstructuur|decent|pols, aortaklep, AV-knoop|mid (mesioincisodistale vlakken van gebitselement)|
|persoon|decent|patient, dochter, vader|bekende (kennis)|
|fysische kracht|decent, but rare|druk||
|fysiek object|discutable, but rare|pacemaker|verband|
|beroep|bad||rechter, belang (behanger), herkende (werkende)|
|metadata|bad||beeld|
|gebeurtenis|bad, only 1 term matched||het (hoog energetisch trauma)|

Terms that were never found: omgeving/locatie, organisme, physical object: fout, religie/filosofie, product, disorder, navigatieconcept, lifestyle, procedure, gradering, tumorgradering, beoordelingsschaal, inactief concept, speciaal concept, ras, foundation metadata concept, physical object, eigenschap, celstructuur

In [19]:
# Define list of relevant terms for general use
relevant_tuis = ['aandoening',
                'monster',
                'regime/therapie',
                'bevinding',
                'afwijkende morfologie',
                'cel',
                'gegevensobject',
                'verrichting',
                'substantie',
                'lichaamsstructuur']

In [20]:
df_all_unique[df_all_unique.tui.isin(relevant_tuis)].tui.value_counts()

aandoening               185873
verrichting              120781
lichaamsstructuur        100172
bevinding                 67630
substantie                10734
afwijkende morfologie     10231
regime/therapie            5298
monster                    3917
cel                        1624
gegevensobject              473
Name: tui, dtype: int64

In [21]:
df_all_unique[df_all_unique.tui.isin(relevant_tuis)].shape[0]

506733

## Output

In [22]:
# Save to file
df_all_unique.to_csv('04_ConceptDB/snomedct-dutch_v1.2-unfiltered.csv', index=False)
df_all_unique[df_all_unique.tui.isin(relevant_tuis)].to_csv('04_ConceptDB/snomedct-dutch_v1.2.csv', index=False)