In [47]:
import os
import pandas as pd
import time

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

# Load environment variables from the secrets.env file one directory up.
load_dotenv("../secrets.env")

# Read the Pinecone/OpenAI settings from the environment. os.getenv returns
# None for a variable that is not set, so a missing entry only surfaces when
# the value is first used (below, or in a later cell).
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_index_name = os.getenv("PINECONE_INDEX_NAME")
openai_api_key = os.getenv("OPENAI_API_KEY")
# Pinecone client, plus a handle to the index named by PINECONE_INDEX_NAME.
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index(pinecone_index_name)

In [23]:
# View index stats (dimension, metric, per-namespace vector counts).
# The recorded output below shows the index already holds vectors, so it is not empty.
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'healthcare-providers': {'vector_count': 4100},
                'ns1': {'vector_count': 5}},
 'total_vector_count': 4105,
 'vector_type': 'dense'}

In [None]:
# Load the first 100 provider rows. Every column is read as a string and
# missing values are replaced with '' once for the whole frame, so no NaN can
# leak into the embedded text or into the columns reused later as metadata
# (NPI, procedure code/description, city, state).
dataset = (
    pd.read_csv('../data/processed/sample_ny_data.csv', dtype=str)
    .head(100)
    .fillna('')
)

# Add id column based on index
dataset['id'] = dataset.index.astype(str)  # Ensure ID is also string type

# Create text column by concatenating relevant fields. All columns are already
# NaN-free strings at this point, so no per-column fillna is needed.
dataset['text'] = (
    'Provider: ' + dataset['Rndrng_Prvdr_Last_Org_Name'] + ' ' +
    dataset['Rndrng_Prvdr_First_Name'] + ' ' +
    'Credentials: ' + dataset['Rndrng_Prvdr_Crdntls'] + ' ' +
    'Address: ' + dataset['Rndrng_Prvdr_St1'] + ' ' +
    dataset['Rndrng_Prvdr_City'] + ' ' +
    dataset['Rndrng_Prvdr_State_Abrvtn'] + ' ' +
    'Procedure: ' + dataset['HCPCS_Cd'] + ' - ' +
    dataset['HCPCS_Desc'] + ' ' +
    'Place of Service: ' + dataset['Place_Of_Srvc']
)


In [None]:
MODEL       = "llama-text-embed-v2"
EMB_PARAMS  = {"input_type": "passage", "truncate": "END"}
BATCH_EMB   = 96     # max inputs per embed call
BATCH_UPS   = 100    # max records per upsert call


def _meta(value):
    """Return ``value`` as a metadata-safe string ('' when it is missing/NaN)."""
    return "" if pd.isna(value) else str(value)


texts = dataset['text'].tolist()
records_buffer = []

for start in range(0, len(texts), BATCH_EMB):
    # 1) embed a slice of at most BATCH_EMB texts
    slice_texts = texts[start : start + BATCH_EMB]
    resp = pc.inference.embed(
        model=MODEL,
        inputs=slice_texts,
        parameters=EMB_PARAMS
    )

    # 2) build Pinecone records for this slice; rows, texts and embeddings
    #    are aligned by position
    slice_df = dataset.iloc[start : start + BATCH_EMB]
    for (_, row), text, embed_resp in zip(slice_df.iterrows(), slice_texts, resp):
        name_parts = (
            _meta(row['Rndrng_Prvdr_First_Name']),
            _meta(row['Rndrng_Prvdr_MI']),
            _meta(row['Rndrng_Prvdr_Last_Org_Name']),
        )
        # Empty parts are dropped so the name has no doubled spaces.
        full_name = " ".join(filter(None, name_parts))

        records_buffer.append({
            "id": str(row['id']),
            "values": embed_resp['values'],
            "metadata": {
                # The embedded text is stored too: retrieval_augmented_prompt
                # reads metadata['text'], and the RetrievalQA cell's recorded
                # output reports documents with no `text` key being skipped.
                'text':           text,
                'npi':            _meta(row['Rndrng_NPI']),
                'provider_name':  full_name,
                'procedure_code': _meta(row['HCPCS_Cd']),
                'procedure_desc': _meta(row['HCPCS_Desc']),
                'location':       f"{_meta(row['Rndrng_Prvdr_City'])}, {_meta(row['Rndrng_Prvdr_State_Abrvtn'])}"
            }
        })

    # 3) send full upsert batches; `while` (not `if`) keeps the buffer bounded
    #    even if BATCH_EMB is ever set larger than BATCH_UPS
    while len(records_buffer) >= BATCH_UPS:
        index.upsert(vectors=records_buffer[:BATCH_UPS],
                     namespace="healthcare-providers")
        records_buffer = records_buffer[BATCH_UPS:]
        time.sleep(0.5)  # brief pause between upsert calls

# 4) flush any remaining records
if records_buffer:
    index.upsert(vectors=records_buffer,
                 namespace="healthcare-providers")

In [None]:
# Text search in the "healthcare-providers" namespace: ask for the 3 best hits
# and return only the procedure_desc and provider_name fields of each.
results = index.search(
    namespace="healthcare-providers", 
    query={
        "inputs": {"text": "Which healthcare providers offer physical therapy"}, 
        "top_k": 3
    },
    fields=["procedure_desc", "provider_name"]
)

# NOTE(review): in the recorded output the hits shown share one identical
# _score and describe vaccines, not physical therapy — confirm the query is
# embedded the same way the stored records were.
print(results)

{'result': {'hits': [{'_id': '2419',
                      '_score': 0.25532227754592896,
                      'fields': {'procedure_desc': 'Administration of '
                                                   'pneumococcal vaccine',
                                 'provider_name': 'Kph Healthcare Services, '
                                                  'Inc.'}},
                     {'_id': '2552',
                      '_score': 0.25532227754592896,
                      'fields': {'procedure_desc': 'Influenza vaccine, '
                                                   'quadrivalent inactivated, '
                                                   '0.5 ml dosage',
                                 'provider_name': 'Kph Healthcare Services, '
                                                  'Inc.'}},
                     {'_id': '3250',
                      '_score': 0.25532227754592896,
                      'fields': {'procedure_desc': 'Influenza vaccine, '
             

In [21]:
def create_embedding(query):
    """Embed ``query`` with OpenAI's ``text-embedding-ada-002`` model.

    Args:
        query: Text to embed.

    Returns:
        The embedding of ``query`` (``res.data[0].embedding``).

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset
            or empty.
    """
    # Fail fast with a clear message instead of sending a placeholder key
    # ('sk-...') that can only end in an opaque authentication error.
    openai_api_key = os.getenv('OPENAI_API_KEY')
    if not openai_api_key:
        raise RuntimeError(
            "OPENAI_API_KEY is not set; add it to the environment "
            "(for example in secrets.env) before creating embeddings."
        )

    # Imported lazily (as the original cell did), after the key check so a
    # configuration problem is reported first.
    from openai import OpenAI

    # Instantiate the OpenAI client
    client = OpenAI(api_key=openai_api_key)

    # Create an embedding for the single input text
    res = client.embeddings.create(
        model="text-embedding-ada-002",
        input=[query],
    )
    return res.data[0].embedding

# Example question to embed and look up.
query = (
    "Which training method should I use for sentence transformers when " +
    "I only have pairs of related sentences?"
)

xq = create_embedding(query)

# Retrieve from Pinecone
# Get relevant contexts (including the questions)
# NOTE(review): no namespace is passed here, whereas the upsert cell writes to
# "healthcare-providers" — confirm which namespace this query should target.
# NOTE(review): create_embedding uses text-embedding-ada-002 while the records
# were embedded with llama-text-embed-v2 — confirm the query vector fits this index.
query_results = index.query(vector=xq, top_k=2, include_metadata=True)
query_results

## Building a chat completion prompt with relevant context

## Next, we write some functions to retrieve these relevant contexts from Pinecone and incorporate them into a richer chat completion prompt.

def retrieval_augmented_prompt(query, namespace=None, top_k=3, context_limit=3750):
    """Build a question-answering prompt that includes contexts retrieved from Pinecone.

    The query is embedded with ``create_embedding``, the notebook-level
    ``index`` is queried for the closest matches, and the ``text`` metadata of
    those matches is placed between a fixed instruction header and the question.

    Args:
        query: The user question.
        namespace: Namespace to search. ``None`` (default) leaves the namespace
            unspecified, exactly as the original call did.
        top_k: Number of matches to request (default 3).
        context_limit: Character budget for the contexts (default 3750). Each
            context is charged its own length plus one separator length.

    Returns:
        The prompt string. The context section is empty when nothing usable
        was retrieved.
    """
    xq = create_embedding(query)

    # Only pass `namespace` when the caller supplied one, so the default call
    # is identical to the original.
    query_kwargs = {"vector": xq, "top_k": top_k, "include_metadata": True}
    if namespace is not None:
        query_kwargs["namespace"] = namespace
    query_results = index.query(**query_kwargs)

    # Matches without a 'text' metadata entry are skipped instead of raising
    # KeyError (the records built in the upsert cell carry no such key).
    contexts = []
    for match in query_results.matches:
        text = (match.metadata or {}).get('text')
        if text:
            contexts.append(text)

    # Build our prompt with the retrieved contexts included
    prompt_start = (
        "Answer the question based on the context below.\n\n"+
        "Context:\n"
    )
    prompt_end = (
        f"\n\nQuestion: {query}\nAnswer:"
    )
    context_separator = "\n\n---\n\n"

    # Keep contexts in ranked order and stop at the first one that would
    # reach the character budget.
    combined_contexts = []
    total_length = 0
    for context in contexts:
        new_length = total_length + len(context) + len(context_separator)
        if new_length >= context_limit:
            break
        combined_contexts.append(context)
        total_length = new_length

    return prompt_start + context_separator.join(combined_contexts) + prompt_end

# Build and show the prompt for the example question.
# NOTE(review): the recorded output below has nothing under "Context:", i.e.
# no context text made it into the prompt for this run.
prompt_with_context = retrieval_augmented_prompt(query)
print(prompt_with_context)


Answer the question based on the context below.

Context:


Question: Which training method should I use for sentence transformers when I only have pairs of related sentences?
Answer:


In [None]:
## Generating knowledgeable answers with RAG
## Now that we are building a rich prompt with context from our index, we are ready to get chat completions from OpenAI.

def chat_completion(prompt):
    """Send ``prompt`` to the OpenAI chat model and return the reply text.

    Args:
        prompt: The full user message (for example a context-augmented prompt).

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or ``""`` if the model returned no text content.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset
            or empty.
    """
    # Fail fast with a clear message instead of sending a placeholder key
    # ('sk-...') that can only end in an opaque authentication error.
    openai_api_key = os.getenv('OPENAI_API_KEY')
    if not openai_api_key:
        raise RuntimeError(
            "OPENAI_API_KEY is not set; add it to the environment "
            "(for example in secrets.env) before requesting completions."
        )

    # Imported lazily (as the original cell did), after the key check so a
    # configuration problem is reported first.
    from openai import OpenAI

    # Instantiate the OpenAI client
    client = OpenAI(api_key=openai_api_key)

    # Instructions
    sys_prompt = "You are a helpful assistant that always answers questions."
    res = client.chat.completions.create(
        model='gpt-4o-mini-2024-07-18',
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": prompt}
        ],
        temperature=0  # minimise sampling randomness
    )
    # Guard against a missing (None) content so .strip() cannot raise.
    content = res.choices[0].message.content
    return (content or "").strip()

def rag(query):
    """Answer ``query`` by building a context-augmented prompt and sending it to the chat model."""
    return chat_completion(retrieval_augmented_prompt(query))

# Example question (same text as in the earlier embedding cell).
query = (
    "Which training method should I use for sentence transformers when " +
    "I only have pairs of related sentences?"
)

# Now we can get completions for a context-infused query
answer = rag(query)
print(answer)

When you only have pairs of related sentences, you should use a contrastive learning approach or a Siamese network architecture for training sentence transformers. This method allows the model to learn to differentiate between similar and dissimilar sentence pairs by minimizing the distance between embeddings of related sentences while maximizing the distance for unrelated ones. You can also consider using techniques like triplet loss if you have a third sentence to compare against.


In [28]:
import time

# Name of the index this cell creates when it does not exist yet.
index_name = "langchain-test-index"  # change if desired

# Names of the indexes returned by pc.list_indexes().
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    # Create a 3072-dimensional, cosine-metric serverless index in AWS us-east-1.
    # NOTE(review): 3072 presumably matches text-embedding-3-large, which a later
    # cell mentions as the OpenAI option — confirm.
    pc.create_index(
        name=index_name,
        dimension=3072,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until the index reports ready; this loop has no timeout.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

# NOTE: this rebinds the notebook-wide `index` name to the new index, replacing
# the handle created from PINECONE_INDEX_NAME in the first cell.
index = pc.Index(index_name)

#### LangChain Implementation

In [50]:
import os
import pandas as pd
import time

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

from langchain_pinecone import PineconeVectorStore, PineconeEmbeddings
from langchain_openai import OpenAIEmbeddings


# Load environment variables from the secrets.env file.
load_dotenv("../secrets.env")

# Retrieve the Pinecone API key and index name from environment variables.
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_index_name = os.getenv("PINECONE_INDEX_NAME")
# pinecone_index_name = "langchain-test-index"  # openai embeddings

pc = Pinecone(api_key=pinecone_api_key)
# index = pc.Index(pinecone_index_name)

# Embedding model used by the vector store for both documents and queries.
# NOTE(review): the earlier upsert cell embedded its records with
# llama-text-embed-v2, while this store embeds with multilingual-e5-large —
# confirm both are meant to share the "healthcare-providers" namespace.
# embeddings = OpenAIEmbeddings(model="text-embedding-3-large") # if we want to use OpenAI embeddings
embeddings = PineconeEmbeddings(model="multilingual-e5-large") # if we want to use Pinecone embeddings

# LangChain vector store bound to the configured index and namespace.
vectorstore = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings, namespace="healthcare-providers")

In [59]:
# Six sample text records (ids vec1..vec6) that use "Apple"/"apple" for both
# the fruit and the company.
data = [
    {"id": "vec1", "text": "Apple is a popular fruit known for its sweetness and crisp texture."},
    {"id": "vec2", "text": "The tech company Apple is known for its innovative products like the iPhone."},
    {"id": "vec3", "text": "Many people enjoy eating apples as a healthy snack."},
    {"id": "vec4", "text": "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces."},
    {"id": "vec5", "text": "An apple a day keeps the doctor away, as the saying goes."},
    {"id": "vec6", "text": "Apple Computer Company was founded on April 1, 1976, by Steve Jobs, Steve Wozniak, and Ronald Wayne as a partnership."}
]

# Upsert the records as text into the "healthcare-providers" namespace.
# NOTE(review): `index` is whichever handle was bound last (the LangChain setup
# cell leaves its own `index = ...` commented out), and these sample records go
# into the same namespace as the provider data — confirm both are intended.
index.upsert_records(
    namespace="healthcare-providers",
    records=data
)

In [51]:
# Stats of the Pinecone index exposed by the vector store (vectorstore.index).
vectorstore.index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'healthcare-providers': {'vector_count': 4100},
                'ns1': {'vector_count': 5}},
 'total_vector_count': 4105,
 'vector_type': 'dense'}

In [None]:
# NOTE(review): this cell has no execution count and looks like an unfinished leftover:
#   - `dataset` is assigned from pd.read_csv elsewhere in this notebook, and a
#     pandas DataFrame has no `iter_documents` method, so the loop would raise
#     AttributeError;
#   - `index_name` is only defined in the "langchain-test-index" cell;
#   - the upsert passes no namespace.
# Confirm whether it should be removed or rewritten.
index = pc.Index(index_name)  
index.describe_index_stats()  
for batch in dataset.iter_documents(batch_size=100):  
    index.upsert(batch)  

In [None]:
from langchain_openai import ChatOpenAI  
from langchain.chains import RetrievalQA  
from langchain_ollama import ChatOllama

# Alternative local model, currently disabled:
#llm = ChatOllama(model="llama2", temperature=0.0)
# Completion LLM. `openai_api_key` is the value read from the environment in
# the first cell; the LangChain setup cell does not redefine it.
llm = ChatOpenAI(  
    openai_api_key=openai_api_key,  
    model_name='gpt-3.5-turbo',  
    temperature=0.0
)  
# Question-answering chain that retrieves documents through the vector store.
# NOTE(review): the recorded output below logs "Found document with no `text`
# key. Skipping." — some records in the namespace lack a `text` metadata field.
qa = RetrievalQA.from_chain_type(  
    llm=llm,  
    chain_type="stuff",  
    retriever=vectorstore.as_retriever()  
)  

Found document with no `text` key. Skipping.
Found document with no `text` key. Skipping.
Found document with no `text` key. Skipping.
Found document with no `text` key. Skipping.


{'query': 'Providers',
 'result': "I'm not sure what you're asking about providers. Could you please provide more context or clarify your question?"}

In [75]:
from uuid import uuid4

from langchain_core.documents import Document

# (page_content, source) pairs for the ten sample documents, in insertion order.
_SAMPLE_DOCS = (
    ("I had chocalate chip pancakes and scrambled eggs for breakfast this morning.", "tweet"),
    ("The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.", "news"),
    ("Building an exciting new project with LangChain - come check it out!", "tweet"),
    ("Robbers broke into the city bank and stole $1 million in cash.", "news"),
    ("Wow! That was an amazing movie. I can't wait to see it again.", "tweet"),
    ("Is the new iPhone worth the price? Read this review to find out.", "website"),
    ("The top 10 soccer players in the world right now.", "website"),
    ("LangGraph is the best framework for building stateful, agentic applications!", "tweet"),
    ("The stock market is down 500 points today due to fears of a recession.", "news"),
    ("I have a bad feeling I am going to get deleted :(", "tweet"),
)

# One Document per pair, carrying its source as metadata.
documents = [
    Document(page_content=content, metadata={"source": source})
    for content, source in _SAMPLE_DOCS
]

# A fresh random UUID string per document, used as its record id.
uuids = [str(uuid4()) for _ in documents]

# Embed and store the documents; the returned ids are the cell's output.
vectorstore.add_documents(documents=documents, ids=uuids)

['01f995fe-ecdb-4e61-b704-f86ed7ed730e',
 'd48b8eec-ccc1-412a-bcd7-5b04e9938f9c',
 '7938cb38-ec11-4469-a76b-66eb44131066',
 '532cf120-ce66-4ebd-82c0-7bf1a1ec7660',
 '52bbba1b-8ea0-405f-8279-8056edfc087b',
 '82772654-8bbb-414c-b77f-92df0876b11a',
 '64616ece-2c3f-4547-b6be-a27213a15add',
 'a952a038-28a5-4680-9248-16e44875087b',
 '146a9651-eb15-44b4-81d4-c90eedc963bb',
 '7844d594-9bdc-4440-b859-ded8853bf11b']

In [83]:
from langchain_community.document_loaders import DataFrameLoader

# Read the first 100 rows with every column as a string, add a string `id`
# taken from the row index, then blank out every missing value in one pass.
dataset = (
    pd.read_csv('../data/processed/sample_ny_data.csv', dtype=str)
    .head(100)
    .assign(id=lambda frame: frame.index.astype(str))
    .fillna('')
)

# Every column is now a NaN-free string column, so the pieces can be
# concatenated directly to form the text that will be embedded.
dataset['text'] = (
    'Provider: ' + dataset['Rndrng_Prvdr_Last_Org_Name']
    + ' ' + dataset['Rndrng_Prvdr_First_Name']
    + ' Credentials: ' + dataset['Rndrng_Prvdr_Crdntls']
    + ' Address: ' + dataset['Rndrng_Prvdr_St1']
    + ' ' + dataset['Rndrng_Prvdr_City']
    + ' ' + dataset['Rndrng_Prvdr_State_Abrvtn']
    + ' Procedure: ' + dataset['HCPCS_Cd']
    + ' - ' + dataset['HCPCS_Desc']
    + ' Place of Service: ' + dataset['Place_Of_Srvc']
)

# Loader that uses the `text` column as each document's page content.
loader = DataFrameLoader(dataset, page_content_column="text")

In [84]:
# Load the DataFrame through the loader (page content comes from the `text` column).
documents = loader.load()
# Displays the whole list; the recorded output shows the remaining columns
# carried along as each document's metadata.
documents

[Document(metadata={'Rndrng_NPI': '1740287010', 'Rndrng_Prvdr_Last_Org_Name': 'Lituchy', 'Rndrng_Prvdr_First_Name': 'Andrew', 'Rndrng_Prvdr_MI': 'E', 'Rndrng_Prvdr_Crdntls': 'MD', 'Rndrng_Prvdr_Gndr': 'M', 'Rndrng_Prvdr_Ent_Cd': 'I', 'Rndrng_Prvdr_St1': '100 Port Washington Blvd', 'Rndrng_Prvdr_St2': '', 'Rndrng_Prvdr_City': 'Roslyn', 'Rndrng_Prvdr_State_Abrvtn': 'NY', 'Rndrng_Prvdr_State_FIPS': '36', 'Rndrng_Prvdr_Zip5': '11576', 'Rndrng_Prvdr_RUCA': '1', 'Rndrng_Prvdr_RUCA_Desc': 'Metropolitan area core: primary flow within an urbanized area of 50,000 and greater', 'Rndrng_Prvdr_Cntry': 'US', 'Rndrng_Prvdr_Type': 'Interventional Cardiology', 'Rndrng_Prvdr_Mdcr_Prtcptg_Ind': 'Y', 'HCPCS_Cd': '93880', 'HCPCS_Desc': 'Ultrasound of both sides of head and neck blood flow', 'HCPCS_Drug_Ind': 'N', 'Place_Of_Srvc': 'O', 'Tot_Benes': '186', 'Tot_Srvcs': '188', 'Tot_Bene_Day_Srvcs': '188', 'Avg_Sbmtd_Chrg': '719', 'Avg_Mdcr_Alowd_Amt': '243.94696809', 'Avg_Mdcr_Pymt_Amt': '173.23590426', 'Avg_

In [85]:
# One random UUID string per document, used as its record id
# (uuid4 is imported in the sample-documents cell).
# NOTE(review): the ids are regenerated on every run, so re-running this cell
# presumably adds a new copy of every document instead of overwriting — confirm.
uuids = [str(uuid4()) for _ in range(len(documents))]

# Embed and store the documents; the returned ids are the cell's output.
vectorstore.add_documents(documents=documents, ids=uuids)

['1b946ff6-3b98-4631-81c2-ab15e0306dc7',
 'fb6f7c93-8b39-4606-95eb-c20c29a59d4e',
 'b242a5ec-f1c1-4695-83b7-dbeedef981f0',
 '6d5c828a-a873-4b76-b63d-6419bfccc57d',
 'b25cbe6f-3913-4599-9fc3-c20aee9b210d',
 'ee27d32e-3f65-45a5-aff9-4c4b61017683',
 '4bf4b4be-d8aa-49af-b897-72ba838403e8',
 'c0b6c4db-0b3f-4e64-b308-661d4c8dfcd9',
 '7689e671-181b-4687-841b-c27f35715442',
 '22da62f0-db11-4d1a-ae5d-8cc6bbd55f21',
 '1feb77ba-aaaa-48ed-bf95-aff4b51a4338',
 '640642cd-503f-4101-912d-aaecfbf83886',
 '865fd374-19b5-4ee0-a115-265f70ea588d',
 '53b3f690-68a5-4e74-839a-73d083c4a12d',
 '765d9f67-4e0b-479a-a3b3-f6f8d151b8a3',
 '90d7fd5b-ac63-48ae-832f-645dfa8ff2dd',
 '2c8d5c80-f8d3-41be-82e2-a47e61156e07',
 '02731fb7-9821-42c9-ab7d-be1470b45d6e',
 '04c95f7d-73b8-4f84-b960-bd967d4c8edf',
 '7d67cdb1-48cb-4e45-b22e-2cee6071ffa4',
 'c93519b2-a3ad-4dcc-aa23-cedd972f4aed',
 '1aec5ce7-5cc7-4101-9620-1dfe38b38ce6',
 'a796c991-b7e6-40aa-a9a3-d925197227fc',
 'db55b10c-968f-4674-8b49-a687bcf15960',
 '5fcb20c1-1e84-

In [86]:
# Top-2 similarity search through the LangChain vector store, then print each
# hit's text followed by its metadata.
results = vectorstore.similarity_search(
    "What can you tell me about Gaines Evan?",
    k=2,
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* Provider: Gaines Evan Credentials: MD Address: 350 S Main St New City NY Procedure: J0702 - Injection, betamethasone acetate 3 mg and betamethasone sodium phosphate 3 mg Place of Service: O [{'Avg_Mdcr_Alowd_Amt': '6.5784615385', 'Avg_Mdcr_Pymt_Amt': '5.2453846154', 'Avg_Mdcr_Stdzd_Amt': '5.2315384615', 'Avg_Sbmtd_Chrg': '23', 'HCPCS_Cd': 'J0702', 'HCPCS_Desc': 'Injection, betamethasone acetate 3 mg and betamethasone sodium phosphate 3 mg', 'HCPCS_Drug_Ind': 'Y', 'Place_Of_Srvc': 'O', 'Rndrng_NPI': '1548414725', 'Rndrng_Prvdr_City': 'New City', 'Rndrng_Prvdr_Cntry': 'US', 'Rndrng_Prvdr_Crdntls': 'MD', 'Rndrng_Prvdr_Ent_Cd': 'I', 'Rndrng_Prvdr_First_Name': 'Evan', 'Rndrng_Prvdr_Gndr': 'M', 'Rndrng_Prvdr_Last_Org_Name': 'Gaines', 'Rndrng_Prvdr_MI': 'B', 'Rndrng_Prvdr_Mdcr_Prtcptg_Ind': 'Y', 'Rndrng_Prvdr_RUCA': '1', 'Rndrng_Prvdr_RUCA_Desc': 'Metropolitan area core: primary flow within an urbanized area of 50,000 and greater', 'Rndrng_Prvdr_St1': '350 S Main St', 'Rndrng_Prvdr_St2': ''

In [71]:
# Best single match among documents whose metadata has source == "news",
# returned together with its similarity score.
results = vectorstore.similarity_search_with_score(
    "Will it be hot tomorrow?", k=1, filter={"source": "news"}
)
for res, score in results:
    # `.3f` prints three decimals. The original `3f` only set a minimum field
    # width of 3, so the score was printed with the default six decimals.
    print(f"* [SIM={score:.3f}] {res.page_content} [{res.metadata}]")

* [SIM=0.837730] The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. [{'source': 'news'}]


In [87]:
# Question for the RetrievalQA chain (rebinds the notebook-wide `query`).
query = "What can you tell me about Gaines Evan?"

In [88]:
# Run the RetrievalQA chain on the question; the result dict is the cell's output.
qa.invoke(query)

{'query': 'What can you tell me about Gaines Evan?',
 'result': 'Gaines Evan is an MD located at 350 S Main St in New City, NY. The procedure associated with Gaines Evan is J0702 - Injection, betamethasone acetate 3 mg and betamethasone sodium phosphate 3 mg, and the place of service is O.'}