In [2]:
import json
import time
from collections import Counter
from pathlib import Path

import pysolr
from tqdm import tqdm

from cistem import stem
from data_loading import load_search_terms
from models import Match
from config import TLCPaths

In [2]:
# Tokens that carry special meaning in Solr's query syntax (kept as a list of strings).
solr_special_chars = [
    '+', '-', '&&', '||', '!',
    '(', ')', '{', '}', '[', ']',
    '^', '"', '~', '*', '?', ':', '/',
]

In [3]:
# Client for the single-valued, deduplicated core on the local Solr instance.
# always_commit=False: nothing is committed implicitly by this client.
# Alternative core used during experiments: 'http://localhost:8983/solr/wumls-multi-valued'
solr = pysolr.Solr(
    'http://localhost:8983/solr/wumls-single-valued-deduplicated',
    always_commit=False,
)
# Last expression of the cell: shows the ping response so a dead server is noticed early.
solr.ping()

'{\n  "responseHeader":{\n    "zkConnected":null,\n    "status":0,\n    "QTime":1,\n    "params":{\n      "q":"{!lucene}*:*",\n      "distrib":"false",\n      "df":"_text_",\n      "rows":"10",\n      "echoParams":"all",\n      "rid":"localhost-90297"}},\n  "status":"OK"}\n'

In [5]:
# Fuzzy edit distance to request per token, keyed by "is the token shorter than 4 characters?".
distances = {True: 1, False: 2}


def add_tilde_to_search_terms(term):
    """Build an AND-ed fuzzy query string from a whitespace-separated term.

    Every token is suffixed with ``~N`` where N is 1 for tokens shorter than
    four characters and 2 otherwise (see ``distances``).

    The term is split on runs of whitespace, so repeated, leading or trailing
    spaces do not produce empty tokens (which previously turned into bare
    ``~1`` clauses). An empty or whitespace-only term yields ``""``.

    Tokens are inserted verbatim; Solr special characters are not escaped here.

    :param term: the search term, tokens separated by whitespace
    :return: the tokens with fuzzy suffixes, joined by ``" AND "``
    """
    return " AND ".join(
        f"{token}~{distances[len(token) < 4]}" for token in term.split()
    )


def create_query_with_edit_distance(term):
    """Build a Solr function query scoring documents by edit-distance similarity.

    The query calls ``strdist`` with the ``edit`` measure between ``term`` and
    the ``index_term`` field, so the document score is the string similarity.

    The term is embedded as a single-quoted string literal. Backslashes and
    single quotes inside it are backslash-escaped so that a term such as
    ``crohn's`` cannot terminate the literal early and make the query fail.

    :param term: the (stemmed) string to compare against ``index_term``
    :return: the complete ``{!func}`` query string
    """
    # Escape the backslash first, otherwise the escapes added for quotes get doubled.
    escaped_term = term.replace("\\", "\\\\").replace("'", "\\'")
    return "{!func}strdist('" + escaped_term + "',index_term,edit)"

In [46]:
# Probe: rank index entries by edit-distance similarity to one stemmed term
# and print every returned document together with its score.
edit_distance_query = create_query_with_edit_distance(term="pyelonephriti")
results = solr.search(q=edit_distance_query, fl="*,score")
for res in results:
    print(res)

{'cui': ['C0034186'], 'source': ['MDRGER'], 'language': ['GER'], 'name': ['Pyelonephritis'], 'index_term': ['pyelonephriti'], 'id': 'c8b1ada1-826d-46fa-a6fb-186add6196a1', '_version_': 1760194257981800490, 'score': 1.0}
{'cui': ['C0034188'], 'source': ['MDRGER'], 'language': ['GER'], 'name': ['Xanthogranulomatoese Pyelonephritis'], 'index_term': ['xanthogranulomato pyelonephriti'], 'id': 'c01e43d7-a3e5-482d-800a-73b49306f2b7', '_version_': 1760194257982849026, 'score': 1.0}
{'cui': ['C1697444'], 'source': ['MDRGER'], 'language': ['GER'], 'name': ['virale Pyelonephritis'], 'index_term': ['viral pyelonephriti'], 'id': 'c1a048cc-3de9-4231-9820-688e08951389', '_version_': 1760194263649353728, 'score': 1.0}
{'cui': ['C0034188'], 'source': ['MSHGER'], 'language': ['GER'], 'name': ['Pyelonephritis, xanthogranulomatöse'], 'index_term': ['pyelonephritis, xanthogranulomato'], 'id': '1729c369-87a4-4b8a-b12c-1953689789be', '_version_': 1760194257982849025, 'score': 0.9285714}
{'cui': ['C1328529'],

In [43]:
# Probe: plain field query on index_term for a single stemmed term (first 10 rows).
# Fuzzy variant kept for reference:
#   results = solr.search(q=f"index_term_str:{add_tilde_to_search_terms(search_term)}", fl="*", rows=10)
search_term = "pyelonephriti"
field_query = "index_term:" + search_term
results = solr.search(q=field_query, fl="*", rows=10)
for res in results:
    print(res)

{'cui': ['C0034186'], 'source': ['MDRGER'], 'language': ['GER'], 'name': ['Pyelonephritis'], 'index_term': ['pyelonephriti'], 'id': 'c8b1ada1-826d-46fa-a6fb-186add6196a1', '_version_': 1760194257981800490}
{'cui': ['C0022667'], 'source': ['MSHGER'], 'language': ['GER'], 'name': ['Nekrotisierende Pyelonephritis'], 'index_term': ['nekrotisier pyelonephriti'], 'id': '0fd0839d-370c-4761-8ea2-676e0c654d9f', '_version_': 1760194257740627992}
{'cui': ['C0034186'], 'source': ['MDRGER'], 'language': ['GER'], 'name': ['Pyelonephritis NNB'], 'index_term': ['pyelonephriti nnb'], 'id': 'e4a173ad-9b0d-419f-9fd2-66c9a1329c99', '_version_': 1760194257982849024}
{'cui': ['C0034188'], 'source': ['MDRGER'], 'language': ['GER'], 'name': ['Xanthogranulomatoese Pyelonephritis'], 'index_term': ['xanthogranulomato pyelonephriti'], 'id': 'c01e43d7-a3e5-482d-800a-73b49306f2b7', '_version_': 1760194257982849026}
{'cui': ['C0085697'], 'source': ['MDRGER'], 'language': ['GER'], 'name': ['chronische Pyelonephritis'

In [8]:
# Load the search terms through the project's data_loading helper.
# Later cells read `search_terms.terms`, and per entry `.annotation` and `.stems`.
search_terms = load_search_terms()

In [35]:
# Peek at the first search term to see what a single entry looks like.
search_terms.terms[0]

SearchTerm(annotation=Annotation(tech_term=None, lay_term='Blähungen', type=<TermType.LAY: 'LAY'>, span_start=86, span_end=95, synonyms=['Flatulenzen'], id=6992), stems={'blaehung', 'blah', 'luftansammlung im darm', 'meteorismu', 'flatulenz', 'blahung', 'wind', 'meteorisnu'})

In [39]:
# Number of distinct stems the annotated mentions collapse to.
len(set(stem(entry.annotation.get_mention()) for entry in search_terms.terms))

1468

In [47]:
def filter_results_by_common_cuis(results):
    """Pick a result by unweighted majority vote over the results' CUIs.

    Only the first entry of each result's ``'cui'`` list takes part in the
    vote. The returned document is the first one in ``results`` carrying the
    winning CUI; when several CUIs are equally frequent, the one encountered
    first wins.

    :param results: re-iterable collection of result dicts with a ``'cui'`` list
    :return: the chosen result dict, or ``None`` when ``results`` is empty
    """
    cui_counts = Counter(doc['cui'][0] for doc in results)
    if not cui_counts:
        # No candidates at all: report "no match" instead of raising IndexError.
        return None
    winning_cui = cui_counts.most_common(1)[0][0]
    return next(doc for doc in results if doc['cui'][0] == winning_cui)


def filter_results_by_highest_score(results):
    """Return the result with the highest ``'score'``.

    When several results share the top score, the first of them is returned.
    A result is returned even if its score is 0.0; the previous
    implementation compared against an initial threshold of 0 with ``>`` and
    therefore returned ``None`` whenever every score was zero.

    :param results: iterable of result dicts that each carry a ``'score'``
    :return: the best-scoring result dict, or ``None`` when ``results`` is empty
    """
    return max(results, key=lambda doc: doc["score"], default=None)


# Query Solr once per stem of every search term and keep the top hit per stem.
# all_results[i] holds the hits for search_terms.terms[i] (possibly an empty list);
# each hit is tagged with the stem that produced it under the 'stem' key.
all_results = []
failed_stems = []  # (stem, error) pairs for queries that raised SolrError
for term in tqdm(search_terms.terms):
    term_results = []
    # The loop variable is deliberately not called `stem`: that name is the stemming
    # function imported from cistem, and rebinding it here breaks cells that call it.
    for stem_text in term.stems:
        try:
            res = solr.search(q=create_query_with_edit_distance(term=stem_text), fl="*,score", rows=1)
        except pysolr.SolrError as err:
            # Best effort: skip this stem, but record the failure so it is not lost silently.
            failed_stems.append((stem_text, err))
            continue
        if not res:
            continue
        top_result = list(res)[0]
        top_result['stem'] = stem_text
        term_results.append(top_result)
    all_results.append(term_results)

if failed_stems:
    print(f"{len(failed_stems)} stem queries failed with SolrError")

100%|██████████| 7390/7390 [1:58:41<00:00,  1.04it/s]  


In [None]:
# Split the search terms into those with a Solr hit and those without.
# The per-term hit list from all_results is what gets filtered; the earlier version
# tested and passed the unrelated global `results` left over from a probe cell,
# so every term received the same match.
matched, not_matched = [], []
for term, term_results in zip(search_terms.terms, all_results):
    # Alternative selection strategy: filter_results_by_common_cuis(results=term_results)
    top_match = filter_results_by_highest_score(results=term_results) if term_results else None
    if top_match is not None:
        matched.append(Match(mention=term, match=top_match, matched_string=top_match['stem']))
    else:
        # Either no stem produced a hit, or the filter found nothing to return.
        not_matched.append(Match(mention=term))

In [None]:
# Report how many search terms ended up in each bucket.
print(f"{len(matched)=}")
print(f"{len(not_matched)=}")

In [10]:
# Write the matched terms to a timestamped JSON file in the project data directory.
# NOTE(review): if Match.json() returns a string, the file holds a list of JSON
# strings rather than objects -- confirm this is intended.
timestamp = time.strftime("%Y%m%d-%H%M%S")
path = TLCPaths.project_data_path.joinpath(f'matched_solr_{timestamp}.json')
with open(path, mode='w') as fp:
    json.dump([entry.json() for entry in matched], fp)

In [11]:
# Write the unmatched terms to a timestamped JSON file in the project data directory.
# NOTE(review): if Match.json() returns a string, the file holds a list of JSON
# strings rather than objects -- confirm this is intended.
timestamp = time.strftime("%Y%m%d-%H%M%S")
path = TLCPaths.project_data_path.joinpath(f'not_matched_solr_{timestamp}.json')
with open(path, mode='w') as fp:
    json.dump([entry.json() for entry in not_matched], fp)