In [1]:
import os
import openai
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import faiss

In [None]:
# Credentials are read from the environment so no key is stored in the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Resident questions; later cells read the "question" and "request_type" columns.
df = pd.read_csv("./data/resident_request_questions.csv")

## 1. Obtain Embeddings

In [None]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Embed a single text with the OpenAI embeddings endpoint.

    Args:
        text: The string to embed; it is sent as a one-element batch.
        model: Name of the embedding model to request.

    Returns:
        The embedding of ``text`` as a NumPy array.
    """
    api_result = openai.embeddings.create(model=model, input=[text])
    # Only one input was sent, so the first data item is the one we want.
    first_item = api_result.data[0]
    return np.array(first_item.embedding)

In [None]:
# Embed every question in the dataset (one get_embedding call per row) and
# stack the vectors into a single 2-D array with one row per question.
embeddings = np.vstack([get_embedding(inquiry) for inquiry in df['question']])

## 2. Vector Search

In [None]:
# Build a flat L2 index over the question embeddings and persist it to disk.
embeddings_f32 = embeddings.astype(np.float32)  # the index is fed float32 vectors
embedding_dim = embeddings_f32.shape[1]
faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(embeddings_f32)

# write_index opens the target file directly and fails when the folder is
# missing, so create it first (no-op if it already exists).
os.makedirs("./faiss", exist_ok=True)
faiss.write_index(faiss_index, "./faiss/sample_questions_faiss_index.index")

In [None]:
# Alternative to rebuilding: load the index that was previously written to disk.
# Run this instead of the index-building cell when the file already exists.
# NOTE: this restores only `faiss_index`; the `embeddings` array used by
# search_similar_inquiries (cosine version) is not recreated here.
faiss_index = faiss.read_index("./faiss/sample_questions_faiss_index.index")

In [13]:
def search_similar_inquiries_faiss(query: str, top_k: int = 3):
    """Return the dataset questions closest to ``query`` according to the FAISS index.

    The query is embedded with ``get_embedding`` and searched against the
    module-level ``faiss_index``; each hit is mapped back to a row of ``df``
    by position.

    Args:
        query: Free-text question to look up.
        top_k: Maximum number of neighbours to return.

    Returns:
        A list of dicts with keys ``"question"``, ``"request_type"`` and
        ``"distance"``, in the order returned by the index. The list is
        shorter than ``top_k`` when the index holds fewer vectors.
        For the ``IndexFlatL2`` built in this notebook the distance is the
        squared Euclidean distance reported by FAISS.
    """
    query_vec = get_embedding(query)
    # FAISS expects a 2-D float32 batch; here it is a batch of one query.
    query_vec_f32 = np.asarray(query_vec, dtype=np.float32).reshape(1, -1)
    distances, indices = faiss_index.search(query_vec_f32, top_k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        # FAISS pads the result with label -1 when fewer than top_k neighbours
        # exist. df.iloc[-1] would silently return the LAST row as a bogus hit,
        # so those placeholder entries are skipped.
        if idx < 0:
            continue
        row = df.iloc[idx]
        results.append({
            "question": row["question"],
            "request_type": row["request_type"],
            "distance": float(dist)
        })

    return results


def search_similar_inquiries(query, top_k=3):
    """
    Rank the dataset questions by cosine similarity to ``query`` and return the best ``top_k``.

    The query is embedded with ``get_embedding`` and compared against every
    row of the module-level ``embeddings`` array; rows of ``df`` are looked
    up by the same position.

    Returns a list of dicts with keys "question", "request_type" and
    "similarity_score", ordered from most to least similar.
    """
    query_vector = get_embedding(query)
    scores = cosine_similarity([query_vector], embeddings)[0]
    # argsort is ascending, so reverse it to put the highest similarity first.
    best_positions = scores.argsort()[::-1][:top_k]

    def _as_result(position):
        # Build one result record from the dataset row at this position.
        row = df.iloc[position]
        return {
            "question": row["question"],
            "request_type": row["request_type"],
            "similarity_score": float(scores[position]),
        }

    return [_as_result(position) for position in best_positions]

## 3. LLM Response

In [23]:
# Instructions passed to the chat model as the system message by
# generate_response. The text lists the Washington D.C. departments the model
# may refer to. The leading and trailing newlines are part of the string.
# NOTE(review): the department names look like they were copied verbatim from
# the dataset (e.g. "Urban Forrestry"); confirm before "correcting" any spelling.
SYSTEM_PROMPT = """
You are an AI system that answers city residents' questions.
Provide clear, concise, and legally compliant responses.
If applicable, refer to the legal or FAQ documents of one of the following departments of Washington D.C.:
-Parking Enforcement Management Administration
-Urban Forrestry
-Trans Sys Mnt-Signs
-Driver Vehicle Services
-Solid Waste Management Administration
-Transportation Operations Administration
-SNOW
-SIOD
-Tru-311 
-Toa-Street & Bridge Maintenance
-Adjudication Services
-DC Interagency Council on Homelessness
-Department of Energy and Environment
-FEMS-Special Events
-HOMYDRPR- How Is My Driving Program
-Toa- Trans Sys Mnt
-Department Of Health
-Department of Transportation
-FEMS-Smoke Alarms
-Transportation Policy & Planning Administration
-Department of Disability Services 
-Department of Buildings
Mention specific clauses or sentences from documents if possible.
If you are uncertain about the correct answer, politely say so and prompt for clarification.
"""

def generate_response(user_query):
    """Answer a resident's question with the chat model, using similar questions as context.

    The two nearest dataset questions (via ``search_similar_inquiries_faiss``)
    are listed in the user message together with the question itself;
    ``SYSTEM_PROMPT`` is supplied once, as the system message.

    Args:
        user_query: The resident's question as free text.

    Returns:
        The text of the model's first choice, or an empty string if the
        API returned no text content.
    """
    similar_inq = search_similar_inquiries_faiss(user_query, top_k=2)

    context_info = "\n".join(
        f"- Similar question: {res['question']} (request_type: {res['request_type']})"
        for res in similar_inq
    )
    # The system prompt already travels in the "system" role below. Repeating it
    # inside the user message only doubled the prompt tokens, so the user
    # message now carries just the retrieved context and the question.
    user_prompt = (
        f"Context from similar questions:\n{context_info}\n\n"
        f"User question: {user_query}\nAI answer:"
    )

    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.7,
        max_tokens=200
    )

    # The message content is optional in the API response; normalise a missing
    # value to "" so callers that take len() or .lower() of it do not crash.
    return response.choices[0].message.content or ""

## 4. Evaluation

In [7]:
def evaluate_response(user_query, ai_response):
    """
    Run a few naive heuristic checks on an AI answer.

    Checks performed:
    1. Completeness: the answer is longer than 50 characters.
    2. Uncertainty: the answer mentions "uncertain" or "clarification"
       (case-insensitive substring match); flagged as "uncertainty".
    3. Readability placeholder: the answer contains "therefore" or
       "henceforth"; flagged as "potentially_complex_language".

    ``user_query`` is accepted for interface symmetry but is not used by
    any of the checks.

    Returns a dict with keys "complete" (bool), "flags" (list of str) and
    "message" (str summarising the length check).
    """
    lowered = ai_response.lower()
    flags = []

    # Disclaimers may be perfectly fine for ambiguous questions; just record them.
    if "uncertain" in lowered or "clarification" in lowered:
        flags.append("uncertainty")

    # Stand-in for a real readability metric such as Flesch-Kincaid.
    if any(marker in lowered for marker in ("therefore", "henceforth")):
        flags.append("potentially_complex_language")

    # Very naive completeness check based on raw character count.
    is_complete = len(ai_response) > 50
    if is_complete:
        summary = "Response meets basic length requirement."
    else:
        summary = "Response might be incomplete. Consider rechecking."

    return {
        "complete": is_complete,
        "flags": flags,
        "message": summary,
    }

In [24]:
# Example resident question. The earlier assignment of a second query was a
# dead store (immediately overwritten), so it is kept only as a suggestion:
#   "Do I need a permit to build a fence around my yard?"
user_query = "There is always trash left in front of my building. What can I do about this?"

# Retrieve context and ask the model, then run the heuristic checks on the answer.
ai_answer = generate_response(user_query)
eval_result = evaluate_response(user_query, ai_answer)

print("User Query:", user_query)
print("AI Answer:", ai_answer)
print("Evaluation:", eval_result)

User Query: There is always trash left in front of my building. What can I do about this?
AI Answer: If there is always trash left in front of your building, you can report this issue to the Solid Waste Management Administration in Washington D.C. They can provide guidance on proper waste disposal and address any sanitation concerns in your area. You can reach out to them for assistance in resolving the trash accumulation in front of your building.
Evaluation: {'complete': True, 'flags': [], 'message': 'Response meets basic length requirement.'}
