In [3]:
from langchain_postgres import PGVector
from langchain_huggingface import HuggingFaceEmbeddings

# Hugging Face sentence-transformers checkpoint that produces the embeddings.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [None]:
import os

# To launch a local Postgres instance with pgvector enabled, run this in a shell
# (or prefix it with `!` in a notebook cell):
#   docker run --name pgvector-container -e POSTGRES_USER=langchain -e POSTGRES_PASSWORD=langchain -e POSTGRES_DB=langchain -p 6024:5432 -d pgvector/pgvector:pg16

# SQLAlchemy URL using the psycopg3 driver. The default matches the throwaway
# local container above; set PGVECTOR_CONNECTION to target any other database so
# real credentials never have to be written into the notebook.
connection = os.environ.get(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://langchain:langchain@localhost:6024/langchain",
)
collection_name = "documents"  # Name of the collection to use in the database.

vector_store = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

In [15]:
import pandas as pd
from langchain_community.document_loaders import DataFrameLoader

# Read every column as text so identifier-like fields keep their exact characters.
dataset = pd.read_csv('../data/processed/sample_ny_data.csv', dtype=str)

# Use the row index as a string id, then blank out missing values once for the
# whole frame. After this no column holds NaN, so the string concatenation below
# needs no per-column fillna/astype.
dataset['id'] = dataset.index.astype(str)
dataset = dataset.fillna('')

# Text that becomes each document's page content: provider name, credentials,
# address, procedure code/description and place of service.
dataset['text'] = (
    'Provider: ' + dataset['Rndrng_Prvdr_Last_Org_Name'] + ' ' +
    dataset['Rndrng_Prvdr_First_Name'] +
    ' Credentials: ' + dataset['Rndrng_Prvdr_Crdntls'] +
    ' Address: ' + dataset['Rndrng_Prvdr_St1'] + ' ' +
    dataset['Rndrng_Prvdr_City'] + ' ' +
    dataset['Rndrng_Prvdr_State_Abrvtn'] +
    ' Procedure: ' + dataset['HCPCS_Cd'] + ' - ' + dataset['HCPCS_Desc'] +
    ' Place of Service: ' + dataset['Place_Of_Srvc']
)

# Every column other than "text" is carried along as document metadata.
loader = DataFrameLoader(dataset, page_content_column="text")

In [16]:
documents = loader.load()

# Show the count and a short preview rather than echoing the whole list, which
# floods the cell output with one entry per CSV row.
print(f"Loaded {len(documents)} documents")
documents[:3]

[Document(metadata={'Rndrng_NPI': '1740287010', 'Rndrng_Prvdr_Last_Org_Name': 'Lituchy', 'Rndrng_Prvdr_First_Name': 'Andrew', 'Rndrng_Prvdr_MI': 'E', 'Rndrng_Prvdr_Crdntls': 'MD', 'Rndrng_Prvdr_Gndr': 'M', 'Rndrng_Prvdr_Ent_Cd': 'I', 'Rndrng_Prvdr_St1': '100 Port Washington Blvd', 'Rndrng_Prvdr_St2': '', 'Rndrng_Prvdr_City': 'Roslyn', 'Rndrng_Prvdr_State_Abrvtn': 'NY', 'Rndrng_Prvdr_State_FIPS': '36', 'Rndrng_Prvdr_Zip5': '11576', 'Rndrng_Prvdr_RUCA': '1', 'Rndrng_Prvdr_RUCA_Desc': 'Metropolitan area core: primary flow within an urbanized area of 50,000 and greater', 'Rndrng_Prvdr_Cntry': 'US', 'Rndrng_Prvdr_Type': 'Interventional Cardiology', 'Rndrng_Prvdr_Mdcr_Prtcptg_Ind': 'Y', 'HCPCS_Cd': '93880', 'HCPCS_Desc': 'Ultrasound of both sides of head and neck blood flow', 'HCPCS_Drug_Ind': 'N', 'Place_Of_Srvc': 'O', 'Tot_Benes': '186', 'Tot_Srvcs': '188', 'Tot_Bene_Day_Srvcs': '188', 'Avg_Sbmtd_Chrg': '719', 'Avg_Mdcr_Alowd_Amt': '243.94696809', 'Avg_Mdcr_Pymt_Amt': '173.23590426', 'Avg_

In [18]:
from uuid import NAMESPACE_URL, uuid4, uuid5  # uuid4 kept in scope for any other cell that uses it

# Derive each id from the collection name plus the row id stored in the document
# metadata. The ids are therefore stable across runs, so re-running this cell
# targets the same rows instead of inserting a second copy of every document
# under a fresh random id.
uuids = [
    str(uuid5(NAMESPACE_URL, f"{collection_name}/{doc.metadata['id']}"))
    for doc in documents
]

# The psycopg3 driver appears to limit the number of parameters per statement
# (65,535), so the inserts are sent in batches.
batch_size = 100  # Adjust as needed
for i in range(0, len(documents), batch_size):
    batch_docs = documents[i:i + batch_size]
    batch_ids = uuids[i:i + batch_size]
    vector_store.add_documents(documents=batch_docs, ids=batch_ids)

In [19]:
# Fetch the two stored documents closest to the question and print each one's
# text followed by its metadata.
results = vector_store.similarity_search("What can you tell me about Gaines Evan?", k=2)
for hit in results:
    print(f"* {hit.page_content} [{hit.metadata}]")

* Provider: Gaines Evan Credentials: MD Address: 350 S Main St New City NY Procedure: 99213 - Established patient office or other outpatient visit, 20-29 minutes Place of Service: O [{'id': '25372', 'HCPCS_Cd': '99213', 'Tot_Benes': '297', 'Tot_Srvcs': '481', 'HCPCS_Desc': 'Established patient office or other outpatient visit, 20-29 minutes', 'Rndrng_NPI': '1548414725', 'Place_Of_Srvc': 'O', 'Avg_Sbmtd_Chrg': '441.03950104', 'HCPCS_Drug_Ind': 'N', 'Rndrng_Prvdr_MI': 'B', 'Rndrng_Prvdr_St1': '350 S Main St', 'Rndrng_Prvdr_St2': '', 'Avg_Mdcr_Pymt_Amt': '84.782515593', 'Rndrng_Prvdr_City': 'New City', 'Rndrng_Prvdr_Gndr': 'M', 'Rndrng_Prvdr_RUCA': '1', 'Rndrng_Prvdr_Type': 'Orthopedic Surgery', 'Rndrng_Prvdr_Zip5': '10956', 'Avg_Mdcr_Alowd_Amt': '108.52484407', 'Avg_Mdcr_Stdzd_Amt': '70.5504158', 'Rndrng_Prvdr_Cntry': 'US', 'Tot_Bene_Day_Srvcs': '481', 'Rndrng_Prvdr_Ent_Cd': 'I', 'Rndrng_Prvdr_Crdntls': 'MD', 'Rndrng_Prvdr_RUCA_Desc': 'Metropolitan area core: primary flow within an urban

In [20]:
# Question passed to the retrieval QA chain.
query = "What can you tell me about Gaines Evan?"

In [21]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain_ollama import ChatOllama

# Chat model served through Ollama.
llm = ChatOllama(model="deepseek-r1", temperature=0.0)

# Alternative completion LLM (OpenAI), left disabled:
# llm = ChatOpenAI(
#     openai_api_key=openai_api_key,
#     model_name='gpt-3.5-turbo',
#     temperature=0.0
# )

# "stuff" chain: the retrieved documents are placed into a single prompt for the LLM.
retriever = vector_store.as_retriever()
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

qa.invoke(query)

{'query': 'What can you tell me about Gaines Evan?',
 'result': '<think>\nOkay, so I need to figure out what the user is asking for regarding "Gaines Evan." Looking at the context provided, there are two providers named Gaines Evan. One is a MD with an address in New City NY, and the other seems to be PA, but wait, no—the second one\'s name is Evans Korey, so that might have been a typo or mistake.\n\nWait, let me check again. The first provider is Gaines Evan with MD credentials, and the second is Evans Korey with PA credentials. So both are named Evans but different genders—Evan\'s and Korey\'s? Or maybe it\'s a name variation. But in the context given, only Gaines Evan has two procedures listed: one for an intra-articular injection of Hyaluronan or derivative (hymovis) and another for a prosthetic repair of shoulder joint.\n\nSo, the user is asking about Gaines Evan. From the context, I can gather that he\'s a medical professional, specifically a MD, based in New City NY. He has per