In [42]:
from huggingface_hub import snapshot_download
from txtai.workflow import Workflow
from txtai.workflow import Task
from txtai.pipeline import Tabular
from txtai.pipeline import Similarity
from txtai.embeddings import Embeddings
import os
import json 

from pprint import pprint
from geopy.distance import great_circle
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

def get_geocode(place_name, max_retries=3, retry_delay=1.0):
    """Geocode a free-text place name with Nominatim.

    Args:
        place_name: Place to look up (e.g. ``"San Francisco"``).
        max_retries: How many times to retry after a ``GeocoderTimedOut``
            before giving up. ``0`` means a single attempt.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The object returned by ``geolocator.geocode`` (exposes ``.latitude``
        and ``.longitude``), or ``None`` when the place is not found.

    Raises:
        GeocoderTimedOut: If every attempt timed out. The previous
            implementation retried through unbounded recursion, which ended
            in ``RecursionError`` during a persistent outage.
    """
    # Local import: keeps this block self-contained without touching the
    # notebook's import cell.
    import time

    geolocator = Nominatim(user_agent="my_geocoder")
    attempts = max(0, int(max_retries)) + 1

    for attempt in range(1, attempts + 1):
        try:
            location = geolocator.geocode(place_name)
        except GeocoderTimedOut:
            if attempt == attempts:
                # Out of retries: surface the timeout instead of pretending
                # the place does not exist.
                raise
            print("GeocoderTimedOut: Retrying...")
            if retry_delay > 0:
                time.sleep(retry_delay)
            continue

        if location:
            return location

        print(f"Location not found: {place_name}")
        return None
    
# Smoke-test the geocoder on a well-known city. get_geocode returns None when
# the place is not found, so guard before reading the coordinates.
example = get_geocode("San Francisco")
if example is None:
    print("No geocoding result for San Francisco")
else:
    print(example.latitude)
    print(example.longitude)

37.7790262
-122.419906


In [47]:
class SemanticSearch(object):
    """Semantic search over a tabular CSV, post-filtered by geographic distance.

    On construction, an embeddings index is loaded from disk if it exists
    (and ``rerun`` is falsy); otherwise it is built from ``<filename>.csv``
    and saved. ``search`` then runs a txtai SQL similarity query and keeps
    only the hits whose ``city`` lies within ``max_distance`` of the patient.
    """

    # Immutable defaults; copied into per-instance lists in __init__ so that
    # instances never share (or mutate) a default list object.
    DEFAULT_COLUMNS = (
        "brief_title",
        "official_title",
        "brief_summaries",
        "detailed_descriptions",
        "criteria",
        "city",
        "state",
        "zip",
        "country",
    )
    DEFAULT_CKPTLIST = (
        "sentence-transformers/multi-qa-mpnet-base-dot-v1",
    )

    # Supported ``unit`` values -> attribute read from the great_circle result.
    _DISTANCE_ATTRS = {
        "kilometer": "kilometers",
        "mile": "miles",
    }

    def __init__(
        self,
        filename="ctgov_34983_20230417",
        columns=None,
        ckptlist=None,
        rerun=False,
    ):
        """Load or build the embeddings index.

        Args:
            filename: Base name (no extension) of the input CSV; also the
                prefix of the saved index file.
            columns: CSV columns used as text columns for indexing and
                selected by ``search``. Defaults to ``DEFAULT_COLUMNS``.
                Must include ``"city"`` for distance filtering to work.
            ckptlist: Model checkpoints to index with. Defaults to
                ``DEFAULT_CKPTLIST``.
            rerun: When truthy, rebuild the index even if a saved one exists.

        Raises:
            ValueError: If ``ckptlist`` is empty (no index could be created,
                so ``search`` could never work).
        """
        self.filename = filename
        self.columns = list(self.DEFAULT_COLUMNS if columns is None else columns)
        self.ckptlist = list(self.DEFAULT_CKPTLIST if ckptlist is None else ckptlist)
        self._geocode_cache = {}

        if not self.ckptlist:
            raise ValueError("ckptlist must contain at least one model checkpoint")

        # NOTE: self.embeddings is reassigned on every iteration, so when
        # several checkpoints are given only the LAST one is used by search().
        for ckptpath in self.ckptlist:
            # NOTE(review): this downloads into ./cache, but Embeddings below is
            # given the hub id rather than the returned local path - confirm
            # that this pre-download is actually what the model load uses.
            snapshot_download(repo_id=ckptpath,
                              repo_type="model",
                              cache_dir="cache")
            self.embeddings = Embeddings({
                "method": "transformers",
                "path": ckptpath,
                "content": True,
                "object": True
            })
            indexfile = f'{filename}_{ckptpath.replace("/", "-")}.index'
            pprint(indexfile)
            if os.path.exists(indexfile) and not rerun:
                print("Indexed and Cached!")
                self.embeddings.load(indexfile)
            else:
                print("Need to rerun or Indices and Caches don't exist, run them!")

                # Map the CSV rows: nct_id is the row id, self.columns the text.
                tabular = Tabular(idcolumn="nct_id", textcolumns=self.columns, content=True)

                # Single-task workflow that parses the CSV.
                workflow = Workflow([Task(tabular)])

                # Index the rows of <filename>.csv and persist the index.
                data = list(workflow([f'{filename}.csv']))
                self.embeddings.index(data)
                self.embeddings.save(indexfile)
                print("Indexing and Caching finished for the 1st time!")

    def search(self, prompt, patient_location, max_distance, limit=None):
        """Run a similarity query and keep hits within ``max_distance``.

        Args:
            prompt: Free-text query.
            patient_location: Place name of the patient (geocoded).
            max_distance: Maximum distance, in kilometers, between
                ``patient_location`` and a hit's ``city``.
            limit: Optional maximum number of hits requested from the index
                BEFORE distance filtering. ``None`` keeps the library default.

        Returns:
            List of result dicts (one key per entry of ``self.columns``) with
            an added ``"distance"`` key, in the order returned by the index.
        """
        # Quote the prompt so spaces, commas and parentheses stay inside the
        # similar() argument; single quotes are doubled (standard SQL escaping).
        # NOTE(review): confirm the installed txtai parser accepts doubled quotes.
        escaped_prompt = str(prompt).replace("'", "''")
        query = (
            f'select {", ".join(self.columns)} from txtai '
            f"where similar('{escaped_prompt}')"
        )

        if limit is None:
            results = self.embeddings.search(query)
        else:
            results = self.embeddings.search(query, limit)
        print(results)

        # Filter results based on distance
        filtered_results = []
        for result in results:
            trial_location = result["city"]
            if trial_location:
                distance = self.haversine_distance(patient_location, trial_location)
            else:
                # No city recorded: treat it like a place that cannot be geocoded.
                distance = float("inf")
            print(distance)
            if distance <= max_distance:
                result["distance"] = distance
                filtered_results.append(result)

        return filtered_results

    def _geocode(self, place):
        """Geocode ``place`` once per instance and reuse the answer afterwards."""
        # setdefault on __dict__ keeps this working on instances created
        # without running __init__.
        cache = self.__dict__.setdefault("_geocode_cache", {})
        if place not in cache:
            cache[place] = get_geocode(place)
        return cache[place]

    def haversine_distance(self, loc1, loc2, unit="kilometer"):
        """Distance between two place names, computed with ``great_circle``.

        Args:
            loc1: First place name.
            loc2: Second place name.
            unit: ``"kilometer"`` (default) or ``"mile"``.

        Returns:
            The distance as a float, or ``float("inf")`` when either place
            cannot be geocoded.

        Raises:
            ValueError: If ``unit`` is not supported. Previously an
                unsupported unit silently returned ``None``.
        """
        distance_attr = self._DISTANCE_ATTRS.get(unit)
        if distance_attr is None:
            supported = ", ".join(sorted(self._DISTANCE_ATTRS))
            raise ValueError(f"Unsupported unit {unit!r}; expected one of: {supported}")

        loc1_latlong = self._geocode(loc1)
        loc2_latlong = self._geocode(loc2)
        if loc1_latlong is None or loc2_latlong is None:
            return float("inf")

        distance = great_circle((loc1_latlong.latitude, loc1_latlong.longitude),
                                (loc2_latlong.latitude, loc2_latlong.longitude))
        return getattr(distance, distance_attr)


In [52]:
# Build (or load from disk) the semantic index with the default settings.
search_instance = SemanticSearch()

# Query parameters for this demo run.
prompt = "obesity"
patient_location = "Paris"
max_distance = 2000  # In kilometers

# Run the search and show the hits that survive the distance filter.
results = search_instance.search(
    prompt,
    patient_location=patient_location,
    max_distance=max_distance,
)
pprint(results)
# Alternative JSON-style output, one hit at a time:
# for result in results:
#     print(json.dumps(result, default=str, indent=2))


Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

'ctgov_34983_20230417_sentence-transformers-multi-qa-mpnet-base-dot-v1.index'
Indexed and Cached!
[{'brief_title': 'The Effects of Exercise on Dietary Intake', 'official_title': 'The Effects of Exercise on Energy Balance and Macronutrient Intake in College-aged Males', 'brief_summaries': 'The purpose of this study is to determine whether or not energy and macronutrient intake changes when comparing habitually active and habitually sedentary college-aged males in two different sessions of one resting and one exercise.', 'detailed_descriptions': 'Obesity is a consequence of positive energy balance, in which more energy is consumed than expended. It is hypothesized that regular amounts of physical activity may be important in helping individuals more accurately regulate energy balance. More research is needed to understand the impact of a physically active lifestyle on energy regulation capabilities Therefore, this study looks at sedentary college aged males compared to active college age