In [1]:
import csv
from collections import defaultdict
from pydantic import BaseModel
from typing import List
from cistem import  stem
import json
from collections import Counter
from models import IntGenerator
from pydantic import Field


In [2]:
wumls_file = '/home/tim/MedicalLay/WUMLS/MRCONSO_WUMLS_GER.RRF'


## multi valued search fields in solr

In [3]:
class MultiValuedEntry(BaseModel):
    cui: str
    source: str
    language: str
    names: List[str]
    index_terms: List[str] = []

In [13]:
entries = {}

i = 0
with open(wumls_file, newline='\n') as csvfile:
    spamreader = csv.reader(csvfile, delimiter='|')
    for row in spamreader:
        cui = row[0]
        language = row[1]
        source = row[11]
        name = row[14]
        if cui in entries:
            entries[cui].names.append(name)
        else:
            entry = MultiValuedEntry(cui=cui, language=language, names=[name], source=source)
            entries[cui] = entry
        
        # if i == 10:
        #     break
        # i += 1

In [5]:
for cui, entry in entries.items():
    index_terms = []
    for name in entry.names:
            index_terms.append(" ".join([stem(word) for word in name.split(" ")]))
    entry.index_terms.extend(set(index_terms))
    entry.index_terms.extend(set([name.lower() for name in entry.names])) 

In [7]:
with open("/home/tim/MedicalLay/WUMLS/wumls_index_terms_multi_valued.json", "w") as fp:
    json.dump([entry.dict() for entry in entries.values()], fp)

## single valued search fields in solr

In [4]:
class Entry(BaseModel):
    # id: int = Field(default_factory=IntGenerator())
    cui: str
    source: str
    language: str
    name: str
    index_term: str

In [103]:
entries = []

i = 0
with open(wumls_file, newline='\n') as csvfile:
    spamreader = csv.reader(csvfile, delimiter='|')
    for row in spamreader:
        cui = row[0]
        language = row[1]
        source = row[11]
        name = row[14]
        
        index_term = " ".join([stem(word) for word in name.split(" ")])
        entry = Entry(cui=cui, language=language, name=name, source=source, index_term=index_term)
        entries.append(entry)
        # if i == 10:
        #     break
        # i += 1

In [104]:
entries[0]

Entry(cui='C0018563', source='WIKTIONARY', language='GER', name='hand', index_term='hand')

In [105]:
with open("/home/tim/MedicalLay/WUMLS/wumls_index_terms_single_valued.json", "w") as fp:
    json.dump([entry.dict() for entry in entries], fp)


In [106]:
uniques = {}
duplicates = defaultdict(list)
for entry in entries:
    if entry.index_term in uniques:
        if len(duplicates[entry.index_term]) == 0:
            duplicates[entry.index_term].append(uniques[entry.index_term])
        duplicates[entry.index_term].append(entry)
    else:
        uniques[entry.index_term] = entry

In [107]:
deduplicated = []
for name, single_duplicates in duplicates.items():
    cuis = [entry.cui for entry in single_duplicates]
    cui_counts = Counter(cuis)
    # print(cui_counts)
    most_common_cui = list(cui_counts)[0]
    common_entry = next(entry for entry in single_duplicates if entry.cui == most_common_cui)
    deduplicated.append(common_entry)
    

In [108]:
for entry in deduplicated:
    uniques[entry.index_term] = entry

In [109]:
print(len(uniques), len(deduplicated))
mean_dup_len = sum([len(x) for x in duplicates.items()]) / len(duplicates)
print(mean_dup_len)

178719 30726
2.0


In [110]:
# for term, entry in uniques.items():
#     dup = [d_entry for d_entry in uniques.values() if d_entry.name == term and entry != d_entry]
#     if dup:
#         print(entry, dup)

In [111]:
with open("/home/tim/MedicalLay/WUMLS/wumls_index_terms_single_valued_deduplicated.json", "w") as fp:
    json.dump([entry.dict() for entry in uniques.values()], fp)

In [112]:
entry.dict()

{'cui': 'C4087539',
 'source': 'MDRGER',
 'language': 'GER',
 'name': 'Schlafstoerung durch Schichtarbeit',
 'index_term': 'schlafstoerung durch schichtarbeit'}

## document style valued search fields

In [5]:
entries = {}

i = 0
with open(wumls_file, newline='\n') as csvfile:
    spamreader = csv.reader(csvfile, delimiter='|')
    for row in spamreader:
        cui = row[0]
        language = row[1]
        source = row[11]
        name = row[14]
        if cui in entries:
            entries[cui].names.append(name)
        else:
            entry = MultiValuedEntry(cui=cui, language=language, names=[name], source=source)
            entries[cui] = entry

In [6]:
for cui, entry in entries.items():
    index_terms = []
    for name in entry.names:
            index_terms.append(" ".join([stem(word) for word in name.split(" ")]))
    entry.index_terms.extend(set(index_terms))
    entry.index_terms.extend(set([name.lower() for name in entry.names])) 

In [10]:
doc_entries = []
for cui, entry in entries.items():
    joined_names = ", ".join(entry.names)
    joined_index_terms = ", ".join(entry.index_terms)
    entry = Entry(cui=cui, language=entry.language, source=entry.source, name=joined_names, index_term=joined_index_terms)
    doc_entries.append(entry)

In [11]:
entry.index_term

'transferbericht:ergebnis:zeitpunkt:{setting}:dokument:hals-nasen-ohrenheilku, transferbericht:ergebnis:zeitpunkt:{setting}:dokument:hals-nasen-ohrenheilkunde'

In [12]:
with open("/home/tim/MedicalLay/WUMLS/wumls_index_terms_single_doc_valued.json", "w") as fp:
    json.dump([entry.dict() for entry in doc_entries], fp)