In [20]:
import os 
import json
import numpy as np
import pandas as pd
import plotly.express as px
from langchain.embeddings import OpenAIEmbeddings
import sqlalchemy
from sqlalchemy import create_engine

from pgvector.sqlalchemy import Vector
from sqlalchemy import create_engine, insert, select, text, Integer, String, Text
from sqlalchemy.orm import declarative_base, mapped_column, Session

# Never abbreviate large arrays with "..." when NumPy converts them to text;
# every element is printed/stringified regardless of array size.
np.set_printoptions(threshold=np.inf)

In [12]:
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Required settings: a missing variable raises KeyError right here rather than
# failing later when the client or engine is first used.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
POSTGRES_DBURI = os.environ["POSTGRES_DBURI"]

In [36]:
# Shared SQLAlchemy engine for the Postgres instance named by POSTGRES_DBURI.
engine = create_engine(POSTGRES_DBURI)

# Make sure the pgvector extension is installed. `engine.begin()` opens a
# transaction and commits it automatically when the block exits.
with engine.begin() as conn:
    conn.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))

# Declarative base class that the ORM models in this notebook inherit from.
Base = declarative_base()

# Embedding client used to turn text into vectors.
MODEL = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

class Document(Base):
    """ORM mapping for one row of the ``openai_brief_summaries`` table.

    Each row is keyed by ``nct_id`` and carries a set of free-text columns plus
    a 1536-dimensional pgvector ``embedding`` column. The declaration order
    below is the column order of the created table.
    """
    __tablename__ = 'openai_brief_summaries'
    
    # Primary key (presumably a ClinicalTrials.gov NCT identifier — confirm).
    nct_id = mapped_column(String, primary_key=True)
    # Free-text descriptive columns.
    brief_title = mapped_column(Text)
    official_title = mapped_column(Text)
    baseline_measurements = mapped_column(Text)
    brief_summaries = mapped_column(Text)
    detailed_descriptions = mapped_column(Text)
    # Eligibility-related columns, all stored as text (ages included).
    criteria = mapped_column(Text)
    gender = mapped_column(Text)
    minimum_age = mapped_column(Text)
    maximum_age = mapped_column(Text)
    # Location columns.
    facilities = mapped_column(Text)
    city = mapped_column(Text)
    state = mapped_column(Text)
    # NOTE: `zip` shadows the builtin only inside this class body; the name is
    # kept because it is also the database column name.
    zip = mapped_column(Text)
    country = mapped_column(Text)
    recruitment_details = mapped_column(Text)
    pre_assignment_details = mapped_column(Text)
    study_type = mapped_column(Text)
    # Fixed-width 1536-dimension vector column (presumably sized to the output
    # of the embedding model held in MODEL — confirm).
    embedding = mapped_column(Vector(1536))

# Rebuild the schema from scratch: drop every table registered on `Base`,
# then create them again.
# NOTE(review): `drop_all` deletes any rows already stored in
# `openai_brief_summaries` every time this cell runs, so a search issued before
# the table is re-populated returns no rows — confirm this reset is intended.
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)


def _generate_embedding(text=None, model=None):
    """Embed a single piece of text and return the result as a NumPy array.

    Args:
        text: The text to embed; it is sent to the model as a one-item batch.
        model: Object exposing ``embed_documents(list)`` that returns one
            vector per input item. Must not be ``None``.

    Returns:
        numpy.ndarray: The model output with all length-1 axes removed, i.e. a
        1-D array for a single multi-component embedding.

    Raises:
        AssertionError: If ``model`` is ``None``.
    """
    assert model is not None
    batch = model.embed_documents([text])
    # The batch holds exactly one vector; squeezing drops the batch axis.
    return np.squeeze(np.array(batch))


def _to_vector_literal(vector):
    """Serialize a numeric sequence as a pgvector text literal, e.g. ``[0.1,0.2]``."""
    values = np.asarray(vector, dtype=float).ravel()
    # Build the literal element by element so the result never depends on
    # NumPy's global print options (which can abbreviate long arrays as "...").
    return "[" + ",".join(repr(float(value)) for value in values) + "]"


def search(
    text,
    topk=5,
    orient="records",
):
    """Return the ``topk`` stored documents nearest to ``text``.

    The query text is embedded with the module-level ``MODEL`` and compared
    against the ``embedding`` column of ``openai_brief_summaries`` using
    pgvector's ``<->`` operator (Euclidean / L2 distance).

    Args:
        text: Query text to embed and search for.
        topk: Maximum number of rows to return; must be non-negative.
        orient: Passed through to ``pandas.DataFrame.to_dict`` to choose the
            layout of the returned rows.

    Returns:
        dict: ``{"similarity": rows}`` where ``rows`` is the
        ``DataFrame.to_dict(orient=orient)`` form of the matching rows. Each
        row has every table column plus ``score``, the L2 distance to the
        query (lower means more similar). Rows are ordered nearest first.

    Raises:
        ValueError: If ``topk`` is negative.
    """
    limit = int(topk)
    if limit < 0:
        raise ValueError(f"topk must be non-negative, got {topk!r}")

    # Construct the embedding for the query.
    text_embedding = _generate_embedding(text, model=MODEL)
    emb_str = _to_vector_literal(text_embedding)

    # `<->` is a distance, so the best matches are the SMALLEST scores: sort
    # ascending. The LIMIT is pushed into SQL so only `topk` rows (each with a
    # 1536-d vector) are transferred. The explicit CAST keeps the comparison
    # typed as vector <-> vector whichever driver binds the parameter.
    sql_query = sqlalchemy.text(
        """SELECT
             *,
             (embedding <-> CAST(:emb_str AS vector)) AS score
           FROM openai_brief_summaries
           ORDER BY score ASC
           LIMIT :topk
        """
    ).bindparams(emb_str=emb_str, topk=limit)

    # Execute the SQL query and create a pandas DataFrame from the result.
    df = pd.read_sql_query(sql_query, engine)

    results = dict(similarity=df.head(limit).to_dict(orient=orient))
    return results

In [37]:
# Run a sample query. NOTE: despite the name `df`, `search` returns a plain
# dict of the form {'similarity': ...}, not a DataFrame; orient="dict" selects
# the column -> {index -> value} layout of `DataFrame.to_dict`.
df  = search(text="leukemia", orient="dict")
df

{'similarity': {'nct_id': {},
  'brief_title': {},
  'official_title': {},
  'baseline_measurements': {},
  'brief_summaries': {},
  'detailed_descriptions': {},
  'criteria': {},
  'gender': {},
  'minimum_age': {},
  'maximum_age': {},
  'facilities': {},
  'city': {},
  'state': {},
  'zip': {},
  'country': {},
  'recruitment_details': {},
  'pre_assignment_details': {},
  'study_type': {},
  'embedding': {}}}