In [None]:
import pandas as pd
import numpy as np
import json
import re
import os

### Choosing file type

| Edition | Total names | NL names | Description |
| - | - | - | - |
| Edition | 2263140 | 736619 | Includes international SNOMED |
| Extension | 770016 | 736619 | Some terms are only in English |
| Patient Friendly | 1437 | 1437 | Small but potentially useful list of synonyms |

The "Edition" also contains English names, so for our purpose we will focus on the "Extension" and "Patient Friendly" terms.

In [None]:
def parse_file(filename):
    """Read a tab-separated text file into a DataFrame of strings.

    The first non-blank line supplies the column names; each following
    non-blank line becomes one row. Every field is whitespace-stripped and
    kept as a string (no type inference is performed).

    Parameters
    ----------
    filename : str or path-like
        Path to a UTF-8 encoded, tab-separated file with a header line.

    Returns
    -------
    pandas.DataFrame
        One column per header field. An empty file yields an empty
        DataFrame instead of raising ``IndexError``.
    """
    with open(filename, encoding='utf-8') as f:
        # Skip blank lines (e.g. extra newlines at the end of the file); they
        # would otherwise become rows holding one empty string padded with None.
        entities = [[n.strip() for n in line.split('\t')]
                    for line in f if line.strip()]

    if not entities:
        # Nothing to take the column names from.
        return pd.DataFrame()
    return pd.DataFrame(entities[1:], columns=entities[0])

In [None]:
# Root directory holding the unpacked SNOMED CT NL releases. Set the
# SNOMED_DIR environment variable to point at another copy; without it the
# original hard-coded location is used.
snomed_dir = os.environ.get('SNOMED_DIR', '/Users/stan3/Data/snomed')

# Description snapshot of the Edition release
nl_ed_term = os.path.join(snomed_dir, 'SnomedCT_Netherlands_EditionRelease_PRODUCTION_20200930T120000Z/Snapshot/Terminology/sct2_Description_Snapshot_NL_20200930.txt')
# Description snapshot of the Extension release
nl_ex_term = os.path.join(snomed_dir, 'SnomedCT_Netherlands_ExtensionRelease_PRODUCTION_20200930T120000Z/Snapshot/Terminology/sct2_Description_Snapshot_NL_20200930.txt')
# Description snapshot of the Patient Friendly extension release
nl_pf_term = os.path.join(snomed_dir, 'SnomedCT_Netherlands_PatientFriendlyExtensionRelease_PRODUCTION_20200930T120000Z/Snapshot/Terminology/sct2_Description_Snapshot_NL-PatientFriendly_20200930.txt')

### Description type

|Type id|Term|
|-|-|
|900000000000003001|Fully specified name|
|900000000000013009|Synonym|
|900000000000550004|Definition|

We will use the FSNs and synonyms, restricted to Dutch terms that are active.

In [None]:
# Extension release: keep the active Dutch descriptions only
# Inactive: 34481
# Active: 702138
df_ex = parse_file(nl_ex_term)
df_ex = df_ex.loc[df_ex['languageCode'].eq('nl') & df_ex['active'].eq('1')]

# Patient Friendly release: keep the active synonyms only
# Inactive: 149
# Active: 1284
df_pf = parse_file(nl_pf_term)
df_pf = df_pf.loc[df_pf['typeId'].eq('900000000000013009') & df_pf['active'].eq('1')]

# Fully specified names from the extension
# Inactive: 15211, 9
# Active: 231646, 9
df_fsn = df_ex.loc[df_ex['typeId'].eq('900000000000003001')]

# Synonyms from the extension
df_ex_synonyms = df_ex.loc[df_ex['typeId'].eq('900000000000013009')]

# All synonyms: extension synonyms followed by the patient-friendly ones
df_synonyms = pd.concat([df_ex_synonyms, df_pf])

### Primary concepts

In [None]:
# Split every fully specified name of the form "<name> (<tag>)" into the
# name ('str') and the tag in the trailing parentheses ('tui').
df_primary_concepts = df_fsn.copy()
fsn_terms = df_primary_concepts['term']

# The tag is the content of the last parenthesised group in the term.
df_primary_concepts['tui'] = fsn_terms.str.extract(r"\(([^)]*)\)[^(]*$", expand=False)

# The name is everything before that same last group. Cutting at the FIRST
# "(" instead would truncate names such as "a (b) c (tag)" to "a" and turn
# names that start with "(" into NaN.
df_primary_concepts['str'] = (
    fsn_terms
    .str.replace(r"\s*\([^)]*\)[^(]*$", "", regex=True)
    .str.strip()
)
df_primary_concepts = df_primary_concepts[['conceptId', 'typeId', 'tui', 'str']]

# Create CUI-TUI mapping
cui_tui_mapping = dict(zip(df_primary_concepts['conceptId'], df_primary_concepts['tui']))
df_primary_concepts.head()

### Synonyms

In [None]:
# Reduce the synonym table to the columns used downstream and call the
# term column 'str'
df_synonym_concepts = (
    df_synonyms[['conceptId', 'term', 'typeId']]
    .rename(columns={'term': 'str'})
)

# Look up the TUI of each synonym through its concept id
df_synonym_concepts['tui'] = df_synonym_concepts['conceptId'].map(cui_tui_mapping)
# 67 synonyms end up without a type
# (check with: df_synonym_concepts[df_synonym_concepts['tui'].isna()].shape)

df_synonym_concepts

### Combined

In [None]:
# Create combined table: primary names first, synonyms after them
df_all = (
    pd.concat([df_primary_concepts, df_synonym_concepts])
    .reset_index(drop=True)
    .rename(columns={'typeId': 'tty', 'conceptId': 'cui'})
)
# Assign the result back to the column: an in-place replace through attribute
# access (df_all.tty.replace(..., inplace=True)) works on a temporary object
# and does not update the frame under pandas Copy-on-Write.
df_all['tty'] = df_all['tty'].replace({'900000000000003001': 'PN',
                                       '900000000000013009': 'SY'})
df_all['sab'] = 'SNOMED-CT-NL'
df_all = df_all[['cui', 'str', 'tty', 'tui', 'sab']]

# Drop synonyms that are the same as primary name
print(f'Records before dropping duplicates: {df_all.shape[0]}')
# keep='first' marks only the later occurrences. Because primary names precede
# synonyms in df_all, a synonym equal to its primary name is removed, while a
# synonym that is merely listed twice keeps one copy instead of vanishing.
is_repeat = df_all.duplicated(subset=['cui', 'str', 'tui'], keep='first')
df_all_unique = df_all[~(is_repeat & (df_all['tty'] == 'SY'))].copy()
print(f'Records after dropping duplicates: {df_all_unique.shape[0]}')

# Sort rows on cui and tty
# Explicit int64: plain `int` can be 32-bit on some platforms, which is too
# small for SNOMED identifiers.
df_all_unique['cui'] = df_all_unique['cui'].astype('int64')
df_all_unique = df_all_unique.sort_values(['cui', 'tty'])

df_all_unique.head()

### Examples

In [None]:
# All rows whose name is exactly 'ALS'
df_all_unique.loc[df_all_unique['str'] == 'ALS']

In [None]:
# All names recorded for concept 86044005
df_all_unique.loc[df_all_unique['cui'] == 86044005]

In [None]:
# All rows whose name is exactly 'longkanker'
df_all_unique.loc[df_all_unique['str'] == 'longkanker']

In [None]:
# All names recorded for concept 93880001
df_all_unique.loc[df_all_unique['cui'] == 93880001]

In [None]:
# All names recorded for concept 22298006
df_all_unique.loc[df_all_unique['cui'] == 22298006]

In [None]:
# All rows whose name is exactly 'methotrexaat'
df_all_unique.loc[df_all_unique['str'] == 'methotrexaat']

In [None]:
# All names recorded for concept 387381009
df_all_unique.loc[df_all_unique['cui'] == 387381009]

## Evaluation of SNOMED types

Find which types are present, and then manually check the performance of the types by looking at a set of cardiology letters.

In [None]:
# Number of rows per type
df_all_unique['tui'].value_counts()

In [None]:
# Total number of rows in the combined table
len(df_all_unique)

In [None]:
# Shape of the primary-name (PN) part of the table
df_all_unique.loc[df_all_unique['tty'].eq('PN')].shape

In [None]:
# Shape of the synonym (SY) part of the table
df_all_unique.loc[df_all_unique['tty'].eq('SY')].shape

| tui | summary | good examples | bad examples |
| :- | :- | :- | :-|
|aandoening |good|hypertensie, boezemfibrilleren, av-blok| |
|monster|good|trombocyten, leukocyten,basofiele granulocyten||
|regime/therapie|good|fysiotherapie, hartrevalidatie, therapie||
|waarneembare entities|good|leeftijd, bloeddruk, hartas, LVEF||
|bevinding|good|koorts, zwelling, tachycardie||
|attribuut|good|bij, na||
|kwalificatiewaarde|good|ontslag, beloop, gestaakt, geen||
|cel|good, but rare|erythrocytes||
|gegevensobject|good, but rare| Echocardiogram, operatieverslag||
|sociaal concept|good, but rare|familie||
|situatie|good, but includes negation|geen pijn, geen dispneu, geen hoesten||
|verrichting |decent|lokale anesthesie, lichamelijk onderzoek, palpatie|erg (elektroretinografie), weken, post (peritoneale transfer van eicel en sperma), beleid (management)|
|substantie|decent|nebivolol, amlodipine, ceftriaxon|wortel, PM (fijnstof)|
|omgeving|decent|ziekenhuis, polikliniek, huis, afdeling cardiologie, afdeling fysiologie|meer, stroop, plaats|
|lichaamsstructuur|decent|pols, aortaklep, AV-knoop|mid (mesioincisodistale vlakken van gebitselement)|
|persoon|decent|patient, dochter, vader|bekende (kennis)|
|fysische kracht|decent, but rare|druk||
|fysiek object|debatable, but rare|pacemaker|verband|
|afwijkende morfologie|bad|||
|beroep|bad||rechter, belang (behanger), herkende (werkende)|
|metadata|bad||beeld|
|gebeurtenis|bad, only 1 term matched||het (hoog energetisch trauma)|

Types that were never found: omgeving/locatie, organisme, physical object: fout, religie/filosofie, product, disorder, navigatieconcept, lifestyle, procedure, gradering, tumorgradering, beoordelingsschaal, inactief concept, speciaal concept, ras, foundation metadata concept, physical object, eigenschap, celstructuur

In [None]:
# Types kept in the general-purpose export
relevant_tuis = [
    'aandoening',
    'monster',
    'regime/therapie',
    'bevinding',
    'cel',
    'gegevensobject',
    'verrichting',
    'substantie',
    'lichaamsstructuur',
]

In [None]:
# Number of rows per type, restricted to the relevant types
df_all_unique.loc[df_all_unique['tui'].isin(relevant_tuis), 'tui'].value_counts()

In [None]:
# Number of rows that belong to one of the relevant types
len(df_all_unique.loc[df_all_unique['tui'].isin(relevant_tuis)])

## Output

In [None]:
# Save to file
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('04_ConceptDB', exist_ok=True)

# Complete table with all types
df_all_unique.to_csv('04_ConceptDB/snomedct-dutch_v1.0-complete.csv', index=False)
# Table restricted to the types listed in relevant_tuis
df_all_unique[df_all_unique['tui'].isin(relevant_tuis)].to_csv('04_ConceptDB/snomedct-dutch_v1.0.csv', index=False)