In [1]:
import pandas as pd

from data_loading import load_wumls_entries

In [2]:
# Fetch the WUMLS entries through the project-local `data_loading` helper.
entries = load_wumls_entries()

In [3]:
# Turn every entry into a plain dict first, then build one frame with a row per entry.
entry_records = [wumls_entry.dict() for wumls_entry in entries]
df = pd.DataFrame(entry_records)

In [4]:
# Show which fields of an entry ended up as columns of the frame.
df.columns

Index(['cui', 'source', 'language', 'name', 'index_term'], dtype='object')

In [5]:
# Number of rows (names) per source ontology, most frequent first.
df["source"].value_counts()


MDRGER         99061
MSHGER         80864
DMDICD10       11864
LNC-DE-DE      11059
LNC-DE-CH       4941
DMDUMD          3373
WHOGER          3332
WIKTIONARY      3082
ICPCGER          716
LNC-DE-AT        188
OMIM               1
SNOMEDCT_US        1
SPN                1
Name: source, dtype: int64

In [6]:
# Build one summary row per source ontology (number of names and number of
# distinct CUIs), followed by a "Total" row, and render it as a LaTeX table.
# The per-source figures are computed once up front instead of re-scanning
# `df` with a boolean mask for every ontology inside the loop.
source_name_counts = df["source"].value_counts()
source_concept_counts = df.groupby("source")["cui"].nunique()

# Row order follows value_counts(), i.e. descending name count.
wumls_table_data = [
    dict(ontology=ontology, name_count=name_count,
         concept_count=int(source_concept_counts[ontology]))
    for ontology, name_count in source_name_counts.items()
]
# The total concept count is NOT the column sum: a CUI may occur in several ontologies.
wumls_table_data.append(dict(ontology="Total", name_count=int(source_name_counts.sum()),
                             concept_count=df["cui"].nunique()))
table = pd.DataFrame.from_records(wumls_table_data).to_latex(position='h', index=False,
                                                             header=["Ontology", "Name Count",
                                                                     "Concept Count"],
                                                             label="tab:wumls_ontologies",
                                                             column_format='|lrr|')
print(table)

\begin{table}[h]
\centering
\label{tab:wumls_ontologies}
\begin{tabular}{|lrr|}
\toprule
  Ontology & Name Count & Concept Count \\
\midrule
    MDRGER &      99061 &         52249 \\
    MSHGER &      80864 &         39852 \\
  DMDICD10 &      11864 &         11208 \\
 LNC-DE-DE &      11059 &         11043 \\
 LNC-DE-CH &       4941 &          4941 \\
    DMDUMD &       3373 &          3296 \\
    WHOGER &       3332 &          2733 \\
WIKTIONARY &       3082 &           768 \\
   ICPCGER &        716 &           715 \\
 LNC-DE-AT &        188 &           186 \\
     Total &     218483 &        110121 \\
\bottomrule
\end{tabular}
\end{table}



  table = pd.DataFrame.from_records(wumls_table_data).to_latex(position='h', index=False, header=["Ontology", "Name Count", "Concept Count"], label="tab:wumls_ontologies", column_format='|lrr|')


In [8]:
# Compare how many names come from Wiktionary versus every other source.
not_from_wiktionary = df["source"] != "WIKTIONARY"
from_wiktionary = df["source"] == "WIKTIONARY"
print("names without wiktionary:", df.loc[not_from_wiktionary, "name"].size)
print("names with wiktionary:", df.loc[from_wiktionary, "name"].size)
print("names total:", df["name"].size)


names without wiktionary: 215401
names with wiktionary: 3082
names total: 218483


In [9]:
# Compare how many distinct CUIs are covered with and without the Wiktionary source.
not_from_wiktionary = df["source"] != "WIKTIONARY"
from_wiktionary = df["source"] == "WIKTIONARY"
print("unique concepts without wiktionary:", df.loc[not_from_wiktionary, "cui"].nunique())
print("unique concepts with wiktionary:", df.loc[from_wiktionary, "cui"].nunique())
print("unique concepts total:", df["cui"].nunique())


unique concepts without wiktionary: 110086
unique concepts with wiktionary: 768
unique concepts total: 110121


Analyze the proportion of matched mentions contributed by each ontology

In [10]:
import json
from config import TLCPaths
from collections import defaultdict
from tqdm import tqdm
import pandas as pd

In [12]:
# Load the two Solr result dumps (matched / not matched) from the project data directory.
# The file names are plain literals (no interpolation needed), and the files are read
# explicitly as UTF-8 so the result does not depend on the platform's default encoding.
with open(TLCPaths.project_data_path.joinpath('matched_solr_20230602-111758.json'),
          encoding='utf-8') as fp:
    solr_matches = json.load(fp)

with open(TLCPaths.project_data_path.joinpath('not_matched_solr_20230602-111759.json'),
          encoding='utf-8') as fp:
    solr_not_matches = json.load(fp)

In [13]:
def remove_list_values(d: dict):
    """Collapse every list-valued entry of ``d`` to its first element.

    The dictionary is modified in place and also returned, so the function can
    be used inside a comprehension. Non-list values are left untouched. Only the
    top level is inspected; a nested list that becomes the new value is kept as is.

    An empty list has no first element and is replaced by ``None`` instead of
    raising ``IndexError``.

    :param d: dictionary whose list values should be flattened
    :return: the same dictionary object, after modification
    """
    # Re-assigning existing keys does not change the dict's size, so mutating
    # while iterating over items() is safe here.
    for k, v in d.items():
        if isinstance(v, list):
            d[k] = v[0] if v else None
    return d


# Flatten list-valued fields in every Solr result record (records are modified in place).
solr_matches = list(map(remove_list_values, solr_matches))
solr_not_matches = list(map(remove_list_values, solr_not_matches))

In [14]:
# One frame per result set, built from the flattened record dicts.
solr_matches_df, solr_not_matches_df = (
    pd.DataFrame(records) for records in (solr_matches, solr_not_matches)
)

In [15]:
# Number of Solr match records per source ontology, most frequent first.
solr_matches_df['source'].value_counts()

MSHGER        328
MDRGER        318
WIKTIONARY    225
WHOGER         52
DMDICD10       38
DMDUMD         20
ICPCGER         9
LNC-DE-CH       4
LNC-DE-DE       3
Name: source, dtype: int64

In [22]:
# Print, for every source ontology, its percentage of all Solr match records.
# The counts are computed once and reused for both the total and the loop.
source_match_counts = solr_matches_df['source'].value_counts()
total_matches = source_match_counts.sum()
for source, match_count in source_match_counts.items():
    print(source, round(match_count / total_matches * 100, 2))

MSHGER 32.9
MDRGER 31.9
WIKTIONARY 22.57
WHOGER 5.22
DMDICD10 3.81
DMDUMD 2.01
ICPCGER 0.9
LNC-DE-CH 0.4
LNC-DE-DE 0.3


In [17]:
    # Load the search-term mapping; it is indexed by a match's 'stem' further below.
    # Read explicitly as UTF-8 so the result does not depend on the platform default.
    with open(TLCPaths.project_data_path.joinpath(
            'annotate_tlc/search_terms_single_and_ids_strict_uniqueness.json'),
            'r', encoding='utf-8') as fp:
        terms_and_ids = json.load(fp)

In [18]:
# For every Solr match, record under its source ontology how many entries
# `terms_and_ids` holds for the matched stem.
match_counts = defaultdict(list)
for match in tqdm(solr_matches):
    stem = match['stem']
    # Fall back to the whitespace-stripped stem when the raw stem is not a key.
    lookup_key = stem if stem in terms_and_ids else stem.strip()
    counts = len(terms_and_ids[lookup_key])
    assert counts > 0
    match_counts[match['source']].append(counts)

100%|██████████| 997/997 [00:00<00:00, 1074716.29it/s]


In [19]:
# Compute each ontology's share (in %) of the summed per-match counts and attach it
# to the corresponding summary row in `wumls_table_data`.
#
# The 'Total' entry written below is excluded from the sum, so re-running this cell
# does not count the previous total a second time and halve every share.
total_counts = sum(sum(counts) for ontology, counts in match_counts.items()
                   if ontology != 'Total')
match_counts['Total'] = [total_counts]
for wumls_table_row in wumls_table_data:
    ontology = wumls_table_row['ontology']
    # Ontologies without any match yield an empty list from the defaultdict, i.e. 0 %.
    counts = match_counts[ontology]
    # Guard against an empty match set instead of failing with ZeroDivisionError.
    share_of_matches = round(sum(counts) / total_counts * 100, 2) if total_counts else 0.0
    print(ontology, share_of_matches)

    wumls_table_row['share_of_matches'] = share_of_matches
print("total", total_counts)

MDRGER 24.23
MSHGER 22.29
DMDICD10 2.87
LNC-DE-DE 0.04
LNC-DE-CH 0.46
DMDUMD 1.11
WHOGER 4.77
WIKTIONARY 43.68
ICPCGER 0.55
LNC-DE-AT 0.0
Total 100.0
total 7321


In [20]:
# Render the summary rows, now extended by the share-of-matches column, as a LaTeX table.
wumls_table_frame = pd.DataFrame.from_records(wumls_table_data)
latex_options = dict(
    position='h',
    index=False,
    header=["Ontology", "Name Count", "Concept Count", "Share of matched concepts (in %)"],
    label="tab:wumls_ontologies",
    column_format='|lrrr|',
)
table = wumls_table_frame.to_latex(**latex_options)
print(table)

\begin{table}[h]
\centering
\label{tab:wumls_ontologies}
\begin{tabular}{|lrrr|}
\toprule
  Ontology & Name Count & Concept Count & Share of matched concepts (in \%) \\
\midrule
    MDRGER &      99061 &         52249 &                            24.23 \\
    MSHGER &      80864 &         39852 &                            22.29 \\
  DMDICD10 &      11864 &         11208 &                             2.87 \\
 LNC-DE-DE &      11059 &         11043 &                             0.04 \\
 LNC-DE-CH &       4941 &          4941 &                             0.46 \\
    DMDUMD &       3373 &          3296 &                             1.11 \\
    WHOGER &       3332 &          2733 &                             4.77 \\
WIKTIONARY &       3082 &           768 &                            43.68 \\
   ICPCGER &        716 &           715 &                             0.55 \\
 LNC-DE-AT &        188 &           186 &                             0.00 \\
     Total &     218483 &        110121 & 

  table = pd.DataFrame.from_records(wumls_table_data).to_latex(position='h', index=False, header=["Ontology", "Name Count", "Concept Count", "Share of matched concepts (in %)"], label="tab:wumls_ontologies", column_format='|lrrr|')
