In [2]:
import os
import openai
import numpy as np
import pandas as pd
import faiss
import textstat
import json
import re

In [4]:
openai.api_key = os.getenv("OPENAI_API_KEY")
questions_df = pd.read_csv('./data/resident_request_questions.csv')
context_df = pd.read_csv('./data/dc_service_requests.csv')

In [5]:
def get_embedding(text, model="text-embedding-ada-002"):
    response = openai.embeddings.create(
        input=[text],
        model=model
    )
    return np.array(response.data[0].embedding)

In [7]:
display(context_df)

Unnamed: 0,request_type,department,resolution_estimate,Description
0,Abandoned Bicycle,"DPW, DPW",20 bd,This service request is to be used for bicycle...
1,Abandoned Vehicle - On Private Property,"DPW, DPW",45 bd,Please use this service request to request the...
2,Abandoned Vehicle - On Public Property,"DPW, DPW",13 bd,Please use this service request to request the...
3,Alley Repair Investigation,DDOT,270 bd,Please use this service request type to invest...
4,Bee Treatment and Inspection (DOH),DOH,14 bd,Bee Treatment - This service request is limite...
...,...,...,...,...
82,Tree Inspection,DDOT,5 bd,Use this request type to report an urgent tree...
83,Tree Planting,DDOT,500 bd,Urban Forestry Administration (UFA) plants nea...
84,Tree Pruning,DDOT,180 bd,Please use this service type to request a publ...
85,Tree Removal,DDOT,180 bd,Please use this service type to request the re...


In [6]:
def res_estimate_helper(res_estimate):
    resolution_estimate = res_estimate.split(' ')[0]
    bd_or_cd = res_estimate.split(' ')[1]
    resolution_estimate += ' business days' if bd_or_cd == 'bd' else ' calendar days'
    return resolution_estimate

In [7]:
all_embeddings = []

for idx, row in context_df.iterrows():
    combined_text = (
        f"Request Type: {row['request_type']}\n"
        f"Department: {row['department']}\n"
        f"Resolution Estimate: {res_estimate_helper(row['resolution_estimate'])}\n"
        f"Description: {row['Description']}"
    )
    
    embed_vec = get_embedding(combined_text)
    all_embeddings.append(embed_vec)

all_embeddings = np.array(all_embeddings)

In [None]:
all_embeddings = all_embeddings.astype(np.float32)
embedding_dim = all_embeddings.shape[1]

faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(all_embeddings)
faiss.write_index(faiss_index, "./faiss/dc_requests.faiss")

In [None]:
## To simply load pre-calculated, run this:
faiss_index = faiss.read_index("./faiss/dc_requests.faiss")

In [9]:
def search_dc_requests(query: str, top_k: int = 3):
    query_vec = get_embedding(query).astype(np.float32).reshape(1, -1)
    # Search FAISS index
    distances, indices = faiss_index.search(query_vec, top_k)

    results = []
    for rank, idx in enumerate(indices[0]):
        row_data = context_df.iloc[idx]
        dist = distances[0][rank]
        results.append({
            "request_type": row_data["request_type"],
            "department": row_data["department"],
            "resolution_estimate": res_estimate_helper(row_data["resolution_estimate"]),
            "description": row_data["Description"],
            "distance": float(dist),
        })
    return results

In [10]:
query = "How long does it take to fix a pothole?"
dc_matches = search_dc_requests(query, top_k=3)
print(dc_matches)

[{'request_type': 'Pothole', 'department': 'DDOT', 'resolution_estimate': '3 business days', 'description': 'Please use this request type for Pothole investigation. Pothole repairs normally take approximately 3 business days (72 hours), weather permitting, for completion.\n\n', 'distance': 0.23399491608142853}, {'request_type': 'Roadway Repair', 'department': 'DDOT', 'resolution_estimate': '270 business days', 'description': 'Please use this service request type to investigate street surface issues. Please provide the specific location (i.e. address, intersection) and describe the specific repair problem (i.e. uneven pavement, numerous potholes). Also if possible, provide any information regarding the street surfaces paving material (i.e. concrete, asphalt, or brick).\n\n', 'distance': 0.3189466595649719}, {'request_type': 'Alley Repair Investigation', 'department': 'DDOT', 'resolution_estimate': '270 business days', 'description': 'Please use this service request type to investigate a

In [11]:
def generate_response(user_query):
    # 1. Retrieve relevant requests from context
    dc_results = search_dc_requests(user_query, top_k=3)
    
    # 2. Create context string
    context_lines = []
    for res in dc_results:
        context_lines.append(
            f"Request Type: {res['request_type']}\n"
            f"Department: {res['department']}\n"
            f"Resolution Estimate: {res['resolution_estimate']}\n"
            f"Description: {res['description']}\n"
            f"Distance: {res['distance']}\n"
            "----"
        )
    context_str = "\n".join(context_lines)
    
    # 3. Build the final prompt
    SYSTEM_PROMPT = "You are a QA system that assists with resident inquiries and service requests in Washington D.C."
    FINAL_PROMPT = (
        f"{SYSTEM_PROMPT}\n\n"
        f"Context from Washington D.C. service requests:\n{context_str}\n\n"
        f"User's question: {user_query}\n"
        f"Provide clear, concise, and legally compliant responses."
        f"Make sure your response is easily readable and understandable by a layman." 
        f"If the answer doesn't belong to one of the request types, state that you're not sure."
        f"Answer format:"
        f"- The content of your answer"
        f"- Used request type: The request type you used"
    )
    
    # 4. Call API
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": FINAL_PROMPT},
        ],
        temperature=0.3
    )
    answer = response.choices[0].message.content
    
    return answer

In [12]:
def check_completeness(ai_response, required_fields):
    """
    A naive completeness check that ensures the AI response mentions 
    certain keywords or phrases. 'required_fields' can be a list of items 
    we expect to see in the answer (e.g., department, resolution, etc.).

    Returns True if all required fields appear, otherwise False.
    """
    response_lower = ai_response.lower()
    for field in required_fields:
        if field.lower() not in response_lower:
            return False
    return True

In [13]:
def reprompt_for_correctness(query, ai_response, context_info):
    """
    Calls the LLM again to check whether the AI's answer is correct 
    given the context_info (e.g., the row from dc_service_requests).
    Returns a dict with "is_correct" and "revised_answer" or similar fields.
    
    For demonstration, we do a ChatCompletion call that we parse.
    In production, you might want more robust JSON parsing or error handling.
    """

    system_prompt = "You are a QA system verifying correctness of the AI’s response."
    user_prompt = f"""
User Query: {query}

AI Response:
{ai_response.split('Used request type:')[0]}

Relevant Context (from official data):
{context_info}

Task:
1. Check if the AI's response is factually correct and consistent with the context.
2. If incorrect or incomplete, propose a corrected version.
3. Return your findings in the following JSON format:
{{
  "is_correct": true or false,
  "revised_answer": "text"
}}
"""

    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.0,
    )

    content = response.choices[0].message.content.strip()

    # Attempt to parse a JSON-like structure from the content
    # We'll do a simple regex to find a JSON block, then use Python's `json` if well-formed
    try:
        # find a JSON substring
        json_match = re.search(r"\{.*\}", content, flags=re.DOTALL)
        if json_match:
            json_str = json_match.group(0)
            parsed = json.loads(json_str)
            return parsed
        else:
            return {
                "is_correct": False,
                "revised_answer": "Could not parse JSON from LLM response"
            }
    except Exception as e:
        return {
            "is_correct": False,
            "revised_answer": f"Error parsing LLM output: {str(e)}"
        }

In [14]:
def evaluate_response_with_rules(query, ai_response, request_type):
    """
    Checks if the AI response obeys known rules from context_df and overall readability guidelines.
    Returns a dictionary with flags, metrics, and/or suggested corrections.
    """
    evaluation_result = {
        "flesch_reading_ease": None, 
        "gunning_fog": None,
        "potential_request_types": None,
        "rt_complete": True,
        "resolution_estimate_complete": True,
        "complete": True
    }

    # 1. Check readability
    evaluation_result['flesch_reading_ease'] = textstat.flesch_reading_ease(ai_response)
    evaluation_result['gunning_fog'] = textstat.gunning_fog(ai_response)
    
    # 2. Check if request type is in the context
    search_matches = search_dc_requests(query, top_k=3)
    potential_request_types = [match['request_type'].lower() for match in search_matches]
    if request_type.lower() not in potential_request_types:
        evaluation_result.update({"rt_complete": False, "resolution_estimate_complete": False, "complete": False})
        index_of_req = -1
    else:
        index_of_req = potential_request_types.index(request_type.lower())

    evaluation_result["potential_request_types"] = potential_request_types

    # 3. Check if the estimated resolution time is stated in the answer
    known_resolution_estimates = []
    for elem in search_matches:
        known_resolution_estimates.append(elem["resolution_estimate"])
    
    if index_of_req > -1:
        ai_resolution_estimate = known_resolution_estimates[index_of_req]
        ai_resolution_days = ai_resolution_estimate.split(' ')[0]
        if (ai_resolution_days not in ai_response or 
        (ai_resolution_days == '1' and not any(x in ai_response for x in ['1', 'one']))):
            evaluation_result.update({"resolution_estimate_complete": False, "complete": False})
    

    # 4. Re-Prompt to Check Correctness
    context_lines = []
    for res in search_matches:
        context_lines.append(
            f"Request Type: {res['request_type']}\n"
            f"Department: {res['department']}\n"
            f"Resolution Estimate: {res['resolution_estimate']}\n"
            f"Description: {res['description']}\n"
            f"Distance: {res['distance']}\n"
            "----"
        )
    context_str = "\n".join(context_lines)
    correctness_check = reprompt_for_correctness(query, ai_response, context_str)
    
    evaluation_result["is_correct"] = correctness_check.get("is_correct")
    evaluation_result["revised_answer"] = correctness_check.get("revised_answer")

    return evaluation_result

In [15]:
user_query = "Do I need a permit to build a fence around my yard?"
# user_query = "I started seeing lots of dead rats on my street. What can I do about this?"
# user_query = "There is always trash left in front of my building. What can I do about this?"

ai_answer = generate_response(user_query)
try:
    request_type = ai_answer.split("Used request type: ")[1]
except:
    request_type = 'Not found'

eval_result = evaluate_response_with_rules(user_query, ai_answer, request_type)

print("User Query:", user_query)
print("AI Answer:", ai_answer)
print("\nEVALUATION OF AI RESPONSE:\n")

print("READABILITY:")
print(f"Flesch reading ease score: {eval_result['flesch_reading_ease']}")
print(f"Gunning fog index: {eval_result['gunning_fog']}")

print(f"\nPotential request types: {eval_result['potential_request_types']}")

print("\nCOMPLETENESS:")
if not eval_result["complete"]:
    print("Flagged Response: The AI response is not complete.")
    if not eval_result['resolution_estimate_complete']:
        print("The response does not include the estimated time for resolution of the request.")
    if not eval_result['rt_complete']:
        print("The response does not state which request type the request belongs to. The question might not be in the context.")
else:
    print('AI response is complete.')

print("\nCORRECTNESS:")
if not eval_result['is_correct']:
    print("The answer might not be correct. Here's the revised response:")
    print(eval_result['revised_answer'])
else:
    print('AI response seems to be correct.')

User Query: Do I need a permit to build a fence around my yard?
AI Answer: You may need a permit to build a fence around your yard in Washington D.C. It is advisable to contact the Department of Consumer and Regulatory Affairs (DCRA) to inquire about the specific requirements for building a fence on your property. They can provide guidance on whether a permit is necessary based on the height and location of the fence.

Used request type: Not sure

EVALUATION OF AI RESPONSE:

READABILITY:
Flesch reading ease score: 54.93
Gunning fog index: 12.67

Potential request types: ['dob - illegal construction', 'residential parking permit violation', 'resident parking permit']

COMPLETENESS:
Flagged Response: The AI response is not complete.
The response does not include the estimated time for resolution of the request.
The response does not state which request type the request belongs to. The question might not be in the context.

CORRECTNESS:
AI response seems to be correct.


### Now, let's run the model and evaluation on the simulated dataset.

In [28]:
resident_questions = list(questions_df['question'])
results_df = pd.DataFrame(columns=['question', 'ai_response', 'flesch_reading_ease', 'gunning_fog', 'rt_complete', 're_complete', 'complete', 'correct', 'revised_answer'])

for i, question in enumerate(resident_questions):
    cur_dict = {}
    ai_answer = generate_response(question)
    try:
        request_type = ai_answer.split("Used request type: ")[1]
    except:
        request_type = 'Not found'

    eval_result = evaluate_response_with_rules(question, ai_answer, request_type)

    flesch_re, gunning_fog = eval_result['flesch_reading_ease'], eval_result['gunning_fog']
    rt_complete, re_complete, complete = eval_result['rt_complete'], eval_result['resolution_estimate_complete'], eval_result['complete']
    correct = eval_result['is_correct']
    revised_answer = eval_result['revised_answer'] if not correct else 'Correct'

    cur_dict = {
        'question': question,
        'ai_response': ai_answer,
        'flesch_reading_ease': flesch_re,
        'gunning_fog': gunning_fog,
        'rt_complete': rt_complete,
        're_complete': re_complete,
        'complete': complete,
        'correct': correct,
        'revised_answer': revised_answer
    }
    dict_df = pd.DataFrame([cur_dict])

    results_df = pd.concat([results_df, dict_df], ignore_index=True)

display(results_df)


    

  results_df = pd.concat([results_df, dict_df], ignore_index=True)


Unnamed: 0,question,ai_response,flesch_reading_ease,gunning_fog,rt_complete,re_complete,complete,correct,revised_answer
0,How quickly can the city respond to a request ...,The city can respond to a request for insect t...,68.77,6.22,True,True,True,True,Correct
1,What type of insects are typically treated by ...,The city typically treats bees and rodents as ...,57.77,9.84,False,False,False,True,Correct
2,Is there a limit on the number of times a resi...,Residents in Washington D.C. can request insec...,57.47,10.61,True,False,False,True,Correct
3,How long will it take to receive a new trash c...,It will take approximately 20 business days to...,50.33,9.80,True,True,True,True,Correct
4,Is there a limit to the number of trash carts ...,There is no specific limit mentioned in the se...,44.34,15.62,True,False,False,True,Correct
...,...,...,...,...,...,...,...,...,...
352,How can I report a serious medication error th...,To report a serious medication error that you ...,21.70,14.79,False,False,False,True,Correct
353,What are the potential consequences of a serio...,I'm not sure which request type this question ...,86.71,4.00,False,False,False,False,The AI response is not relevant to the user qu...
354,How can I request a modification to the roadwa...,To request a modification to the roadway marki...,48.09,8.61,True,True,True,True,Correct
355,What is the process for reviewing and approvin...,The process for reviewing and approving roadwa...,37.40,12.91,True,True,True,True,Correct


In [29]:
results_df.to_csv('./data/resident_request_ai_answers.csv', index=True)

In [None]:
results_df = pd.read_csv('./data/resident_request_ai_answers.csv')

42.71871148459384

In [5]:
fre_mean = np.mean(results_df['flesch_reading_ease'])
gunning_fog_mean = np.mean(results_df['gunning_fog'])

In [18]:
print(f"Regarding the readability of the responses, the mean flesch reading ease score is: {round(fre_mean, 3)}, and the mean gunning fog value is: {round(gunning_fog_mean, 3)}.")

total_count = results_df.shape[0]

correct_count = results_df.correct.sum()
perc_correct = correct_count / total_count * 100.0
print(f"\nBased on re-prompting the AI responses, the percentage of correct responses is: {round(perc_correct, 3)}%")

complete_count = results_df.complete.sum()
perc_complete = complete_count / total_count * 100.0
print(f"\nThe percentage of complete responses is: {round(perc_complete, 3)}%")

Regarding the readability of the responses, the mean flesch reading ease score is: 42.719, and the mean gunning fog value is: 12.72.

Based on re-prompting the AI responses, the percentage of correct responses is: 92.997%

The percentage of complete responses is: 27.171%
