In [1]:
import json
import time
from collections import Counter

import pysolr
from tqdm import tqdm
from config import TLCPaths
from models import Match
from data_loading import load_search_terms

In [2]:
# Client for the local Solr core "wumls-single-valued-deduplicated"; every
# query in this notebook goes through this `solr` object.
solr = pysolr.Solr('http://localhost:8983/solr/wumls-single-valued-deduplicated',
                   always_commit=False)
# Alternative core, kept for switching by hand:
# solr = pysolr.Solr('http://localhost:8983/solr/wumls-multi-valued', always_commit=False)
# Last expression of the cell, so the ping response is displayed as a
# connectivity check.
solr.ping()

'{\n  "responseHeader":{\n    "zkConnected":null,\n    "status":0,\n    "QTime":1,\n    "params":{\n      "q":"{!lucene}*:*",\n      "distrib":"false",\n      "df":"_text_",\n      "rows":"10",\n      "echoParams":"all",\n      "rid":"localhost-990"}},\n  "status":"OK"}\n'

In [3]:
def create_query_with_edit_distance(term):
    """Build a Solr function query scoring documents by string distance.

    The query calls Solr's ``strdist`` function with the ``edit`` measure,
    comparing ``term`` against the ``index_term`` field.

    Backslashes and single quotes in ``term`` are backslash-escaped so the
    term cannot end the single-quoted literal early and break the query.

    Args:
        term: Search term to embed as the quoted string literal.

    Returns:
        The query string, e.g. ``{!func}strdist('flatulenz',index_term,edit)``.
    """
    # Escape backslashes first so the backslashes added for quotes are not doubled.
    escaped_term = term.replace("\\", "\\\\").replace("'", "\\'")
    return "{!func}strdist('" + escaped_term + "',index_term,edit)"


def filter_results_by_common_cuis(results):
    """Return the first result whose primary CUI is the most frequent one.

    A result's primary CUI is the first entry of its ``cui`` sequence. When
    several CUIs are equally frequent, the one encountered first wins.

    Args:
        results: Iterable of result mappings, each with a non-empty ``cui``
            sequence.

    Returns:
        The first result carrying the most common primary CUI, or ``None``
        when ``results`` is empty.
    """
    # Materialise once: the input is traversed twice and may be a one-shot
    # iterator.
    docs = list(results)
    if not docs:
        return None
    cui_counts = Counter(doc['cui'][0] for doc in docs)
    # most_common(1) avoids sorting every CUI just to read the top entry.
    top_cui = cui_counts.most_common(1)[0][0]
    return next(doc for doc in docs if doc['cui'][0] == top_cui)


def filter_results_by_highest_score(results):
    """Return the result with the highest ``score``.

    No score sentinel is used, so a non-empty input always yields a result,
    even when every score is zero or negative. Among results sharing the top
    score, the first one encountered is returned.

    Args:
        results: Iterable of result mappings, each with a ``score`` entry.

    Returns:
        The top-scoring result, or ``None`` when ``results`` is empty.
    """
    return max(results, key=lambda result: result["score"], default=None)


In [4]:
# Sanity check: plain field query on `index_term` for one term (presumably a
# stemmed form - TODO confirm), requesting every stored field plus the
# relevance score and at most two documents.
search_term = "pyelonephriti"
results = solr.search(q=f"index_term:{search_term}", fl="*, score", rows=2)
# Print each returned document on its own line for inspection.
for res in results:
    print(res)

{'cui': ['C0034186'], 'source': ['MDRGER'], 'language': ['GER'], 'name': ['Pyelonephritis'], 'index_term': ['pyelonephriti'], 'id': 'c8b1ada1-826d-46fa-a6fb-186add6196a1', '_version_': 1760194257981800490, 'score': 5.497966}
{'cui': ['C0022667'], 'source': ['MSHGER'], 'language': ['GER'], 'name': ['Nekrotisierende Pyelonephritis'], 'index_term': ['nekrotisier pyelonephriti'], 'id': '0fd0839d-370c-4761-8ea2-676e0c654d9f', '_version_': 1760194257740627992, 'score': 4.649174}


In [6]:
# Load the search terms (with their ids) that are matched against Solr below.
# JSON text is UTF-8; decode it explicitly so non-ASCII terms load the same
# way regardless of the platform's default locale encoding.
with open('search_terms_single_and_ids.json', 'r', encoding='utf-8') as fp:
    terms_and_ids = json.load(fp)

In [9]:
# Query Solr once per search term and keep the single best-scoring document
# under the edit-distance function query. Each kept document is tagged with
# the term that produced it under the 'stem' key.
all_results = []
skipped_stems = []  # terms dropped because the query failed or returned nothing
for stem in tqdm(terms_and_ids):
    try:
        res = solr.search(q=create_query_with_edit_distance(term=stem), fl="*,score", rows=1)
    except pysolr.SolrError:
        # Best effort: keep the long run going, but remember the term instead
        # of dropping it silently.
        skipped_stems.append(stem)
        continue
    docs = list(res)
    if not docs:
        # An empty result set would otherwise raise IndexError below.
        skipped_stems.append(stem)
        continue
    top_result = docs[0]
    top_result['stem'] = stem
    all_results.append(top_result)

# Surface skipped terms so a shorter result list does not go unnoticed.
if skipped_stems:
    print(f"Skipped {len(skipped_stems)} term(s) without a usable result: {skipped_stems}")

100%|██████████| 997/997 [09:12<00:00,  1.81it/s]


In [10]:
# Display the first collected hit to inspect its fields, including the
# 'stem' key added by the collection loop.
all_results[0]

{'cui': ['C0016204'],
 'source': ['MDRGER'],
 'language': ['GER'],
 'name': ['Flatulenz'],
 'index_term': ['flatulenz'],
 'id': '724154c9-63d4-4658-ba0d-d6b1708574bf',
 '_version_': 1760194257559224321,
 'score': 1.0,
 'stem': 'flatulenz'}

In [11]:
# Number of collected hits that lack a 'cui' field.
sum(1 for hit in all_results if 'cui' not in hit)

0

In [12]:
# Split the hits by truthiness. Every collected hit is a non-empty dict (it
# carries at least the 'stem' key), so everything lands in `matched` - Solr
# always returns a best match for the function query.
matched, not_matched = [], []
for result in tqdm(all_results):
    (matched if result else not_matched).append(result)

100%|██████████| 997/997 [00:00<00:00, 1121104.85it/s]


In [13]:
# Report the size of both buckets, one per line.
print(f"{len(matched)=}", f"{len(not_matched)=}", sep="\n")

len(matched)=997
len(not_matched)=0


In [14]:
# Write the matched hits to a timestamped JSON file under
# TLCPaths.project_data_path.
path = TLCPaths.project_data_path.joinpath(
    f'matched_solr_{time.strftime("%Y%m%d-%H%M%S")}.json'
)
with open(path, 'w') as fp:
    json.dump(matched, fp)

In [15]:
# Write the unmatched hits to a timestamped JSON file under
# TLCPaths.project_data_path.
path = TLCPaths.project_data_path.joinpath(
    f'not_matched_solr_{time.strftime("%Y%m%d-%H%M%S")}.json'
)
with open(path, 'w') as fp:
    json.dump(not_matched, fp)