# Notebook for preprocessing and cleaning CUI data

In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
import numpy as np
from tqdm import tqdm
import json
import matplotlib
import matplotlib.pyplot as plt
import re
from collections import Counter

In [4]:
# Clean CUI records from parsing mistakes.
#
# Loads the per-document records (each holding the parallel lists 'cuis',
# 'presence' and 'semtype') and keeps only the positions whose CUI is exactly
# "C" followed by seven digits. The result, `cleaned_data`, maps each document
# key to the filtered parallel lists.
with open('sids_to_clamp_cuis.json', 'r') as infile:
    cuidata = json.load(infile)

# fullmatch (rather than match) also rejects strings that merely *start* with a
# valid CUI, e.g. over-long or fused tokens.
CUI_PATTERN = re.compile(r"C\d{7}")

cleaned_data = dict()
for k, v in cuidata.items():
    cuis = []
    semtype = []
    presence = []
    # Unpack in the same order as the zipped lists so that presence values go
    # to `presence` and semantic types go to `semtype`.
    for cui, p, s in zip(v['cuis'], v['presence'], v['semtype']):
        if CUI_PATTERN.fullmatch(cui):
            cuis.append(cui)
            semtype.append(s)
            presence.append(p)

    cleaned_data[k] = {'cuis': cuis, 'semtype': semtype, 'presence': presence}

print(len(cleaned_data))

29865


In [5]:
# with open("sids_to_cuis_cleaned.json",'w') as outfile:
#     json.dump(cleaned_data, outfile)

# Find the dominant presence (present vs. absent) for each CUI

In [7]:
# Tally, per CUI, how many mentions are labelled "present" and how many carry
# any other label (counted as absent). Every CUI seen is also registered in
# `dominant_presences` with a placeholder value that a later cell overwrites.
# NOTE(review): mentions are read from the raw `cuidata`, not `cleaned_data`,
# so CUIs rejected by the cleaning step are still counted here - confirm intent.
present = Counter()
absent = Counter()
dominant_presences = dict()
for doc in tqdm(cleaned_data, total=len(cleaned_data)):
    record = cuidata[doc]
    for cui, presence in zip(record['cuis'], record['presence']):
        tally = present if presence == "present" else absent
        tally[cui] += 1
        dominant_presences[cui] = "temp"

100%|██████████████████████████████████████████████████████████████████████████| 29865/29865 [00:13<00:00, 2191.65it/s]


In [9]:
# Number of distinct CUIs in the "present" tally, then in the "absent" tally.
for tally in (present, absent):
    print(len(tally))

101991
40063


In [10]:
# Replace each entry with a boolean: True when the CUI was seen as "present"
# at least as often as absent (ties resolve to True), False otherwise.
dominant_presences.update(
    {cui: present[cui] >= absent[cui] for cui in dominant_presences}
)

In [12]:
# Count the CUIs whose dominant presence flag is truthy, then preview the
# first ten (cui, flag) pairs.
dominant_present_total = sum(1 for flag in dominant_presences.values() if flag)
print(dominant_present_total)
print(list(dominant_presences.items())[:10])

98976
[('C0008031', True), ('C1507320', False), ('C0015031', True), ('C4718442', True), ('C2707412', True), ('C0281822', False), ('C0043250', True), ('C0398266', True), ('C3244243', True), ('C0018802', True)]


In [13]:
# with open("dominant_cui_presence.json",'w') as outfile:
#     json.dump(dominant_presences, outfile)

# Filter each document's CUIs down to the mentions that match the CUI's dominant presence

In [20]:
# For every document, keep only the CUI mentions whose presence label agrees
# with that CUI's dominant presence flag in `dominant_presences`.
#
# Mentions are read from the raw `cuidata` records, so malformed CUIs are
# rejected here as well (exactly "C" followed by seven digits); otherwise the
# parsing mistakes removed by the cleaning step would reach the saved output.
valid_cui = re.compile(r"C\d{7}")

filtered_docs = dict()
for doc in tqdm(cleaned_data, total=len(cleaned_data)):
    record = cuidata[doc]
    filtered_docs[doc] = [
        cui
        for cui, presence in zip(record['cuis'], record['presence'])
        # Validity is checked first so the dominant-presence lookup only runs
        # for well-formed CUIs.
        if valid_cui.fullmatch(cui)
        and dominant_presences[cui] == (presence == "present")
    ]

100%|██████████████████████████████████████████████████████████████████████████| 29865/29865 [00:03<00:00, 8575.39it/s]


In [21]:
# Persist the document -> filtered CUI list mapping as JSON.
filtered_docs_path = "filtered_sids_to_cuis.json"
with open(filtered_docs_path, 'w') as handle:
    json.dump(filtered_docs, handle)

In [22]:
# Number of documents in the filtered mapping.
filtered_doc_total = len(filtered_docs)
print(filtered_doc_total)

29865
