In [1]:
import collections
import json
import re
import sys

import pandas as pd

sys.path.insert(0, '../')
import map_modifiers

In [2]:
# Every column's dtype is pinned explicitly instead of relying on inference.
parents_dtypes = {
    'parent_concept_id': int,
    'parent_concept_name': str,
    'parent_concept_code': int,
    'concept_synonym_name': str,
}

# One row per (parent concept, synonym) pair, read from the tab-separated file.
parents_df = pd.read_csv(
    '../data/computed/parents_synonyms.tsv',
    sep='\t',
    dtype=parents_dtypes,
)

parents_df.head(2)

Unnamed: 0,parent_concept_id,parent_concept_name,parent_concept_code,concept_synonym_name
0,4039266,Dry skin,16386004,Anhydrotic skin
1,4039266,Dry skin,16386004,Dry skin (finding)


In [3]:
# Every column's dtype is pinned explicitly instead of relying on inference.
par_to_syn_dtypes = {
    'parent_concept_code': int,
    'descendant_concept_code': int,
    'descendant_synonym_name': str,
}

# One row per (parent code, descendant code, descendant synonym) triple.
par_to_syn_df = pd.read_csv(
    '../data/computed/parent_to_descendant_synonyms_codes.tsv',
    sep='\t',
    dtype=par_to_syn_dtypes,
)

par_to_syn_df.head(2)

Unnamed: 0,parent_concept_code,descendant_concept_code,descendant_synonym_name
0,16386004,68637004,Xeroderma pigmentosum group D
1,16386004,68637004,"Xeroderma pigmentosum, group D"


## Map from parent code to all candidate strings

For each parent concept, the candidates are its own synonyms together with the synonyms of its descendant concepts.

In [4]:
def prune_disorder_finding(synonym_set):
    """Drop '(disorder)' / '(finding)' variants whose untagged form is present.

    A synonym such as 'dry skin (finding)' is removed only when the same text
    with every ' (disorder)' / ' (finding)' tag stripped ('dry skin') is also
    in the collection. Tagged synonyms without an untagged counterpart are
    kept.

    Parameters
    ----------
    synonym_set : set or list of str
        Collection to prune. Any container offering ``copy()``, ``remove()``
        and membership tests works; the input itself is never modified.

    Returns
    -------
    Same type as ``synonym_set``
        A shallow copy with the redundant tagged synonyms removed.
    """
    pruned = synonym_set.copy()
    # Iterate over a snapshot of the input so `pruned` can shrink safely.
    for child in list(synonym_set):
        if child not in pruned:
            continue
        # Raw strings: '\(' is not a valid escape in a normal string literal.
        if not re.search(r'\(disorder\)|\(finding\)', child):
            continue
        untagged = re.sub(r' \(disorder\)| \(finding\)', '', child)
        # The tag is only stripped when a space precedes it, so a match such
        # as 'x(finding)' comes back unchanged. Without the inequality check
        # it would be "found" in the collection as itself and wrongly dropped.
        if untagged != child and untagged in pruned:
            pruned.remove(child)
    return pruned

In [5]:
# Get parent code to its own synonyms
parent_to_raw_synonyms = (
    parents_df
    .groupby('parent_concept_code')
    ['concept_synonym_name']
    .apply(set)
    .to_dict()
)

# Normalize into a *set* before pruning: the pruner does a membership test
# (and possibly a removal) per synonym, which is constant-time on a set but a
# linear scan on a list. The result was converted to a set afterwards anyway.
parent_to_its_synonyms = {
    parent_code: set(
        prune_disorder_finding(
            {
                map_modifiers.utils.normalize_text(synonym)
                for synonym in raw_synonyms
            }
        )
    )
    for parent_code, raw_synonyms in parent_to_raw_synonyms.items()
}

In [6]:
# Get parent code to child synonyms
parent_to_raw_children_syn = (
    par_to_syn_df
    .groupby('parent_concept_code')
    ['descendant_synonym_name']
    .apply(set)
    .to_dict()
)

# Normalize into a *set* before pruning: the pruner does a membership test
# (and possibly a removal) per synonym, which is constant-time on a set but a
# linear scan on a list. The result was converted to a set afterwards anyway.
parent_to_children_syn = {
    parent_code: set(
        prune_disorder_finding(
            {
                map_modifiers.utils.normalize_text(synonym)
                for synonym in raw_synonyms
            }
        )
    )
    for parent_code, raw_synonyms in parent_to_raw_children_syn.items()
}

In [7]:
# Combine to get parent code -> all candidates
parent_to_candidates = collections.defaultdict(set)
for dictionary in [parent_to_its_synonyms, parent_to_children_syn]:
    for parent_code, candidates_set in dictionary.items():
        # update() grows the defaultdict's own set; the input sets are untouched.
        parent_to_candidates[parent_code].update(candidates_set)

# JSON needs lists. Sort them so the file is byte-identical between runs:
# iteration order of a set of strings changes with the interpreter's hash seed,
# which would otherwise reshuffle every list on each re-run.
# NOTE(review): assumes every candidate is a str (output of normalize_text) so
# the items are mutually comparable - confirm.
parent_to_candidates = {k: sorted(v) for k, v in parent_to_candidates.items()}

with open('../map_modifiers/parent_to_candidates.json', 'w') as f:
    json.dump(parent_to_candidates, f, indent=2, sort_keys=True)

## Map from each candidate to its concept code

In [8]:
def _codes_by_normalized_name(frame, name_column, code_column):
    """Map each normalized synonym in `name_column` to its set of codes."""
    normalized_names = frame[name_column].apply(map_modifiers.utils.normalize_text)
    with_normalized = frame.assign(**{name_column: normalized_names})
    codes_per_name = with_normalized.groupby(name_column)[code_column]
    return codes_per_name.apply(set).to_dict()


# Normalized descendant synonym -> set of descendant concept codes
child_candidate_to_code = _codes_by_normalized_name(
    par_to_syn_df,
    name_column='descendant_synonym_name',
    code_column='descendant_concept_code',
)

# Normalized parent synonym -> set of parent concept codes
parent_candidate_to_code = _codes_by_normalized_name(
    parents_df,
    name_column='concept_synonym_name',
    code_column='parent_concept_code',
)

In [9]:
# Combine to get candidate -> SNOMED code for parents and children
candidate_to_code = collections.defaultdict(set)
for dictionary in [child_candidate_to_code, parent_candidate_to_code]:
    for candidate, codes in dictionary.items():
        # update() grows the defaultdict's own set; the input sets are untouched.
        candidate_to_code[candidate].update(codes)

# JSON needs lists. Sort the codes so each list has a stable order that does
# not depend on how the underlying sets happened to be built, keeping diffs of
# the generated file minimal.
candidate_to_code = {k: sorted(v) for k, v in candidate_to_code.items()}

with open('../map_modifiers/candidate_to_code.json', 'w') as f:
    json.dump(candidate_to_code, f, indent=2, sort_keys=True)