## Installation

In [None]:
!pip install google-generativeai==0.8.4 --no-deps
!pip install hnswlib==0.7.0 --no-deps
!pip install tqdm --no-deps
!pip install fsspec==2024.10.0 --no-deps
# Issue Log: dependency conflict between NumPy and scikit-learn
!pip install scikit-learn==1.3.2 --no-deps
!pip install numpy==1.23.5 --no-deps
# Issue Log: NLTK's punkt tokenizer requires additional data files
!pip install nltk rouge-score



In [None]:
import pandas as pd
import numpy as np
import google.generativeai as genai
import json
import time
import hnswlib
from google.generativeai import GenerativeModel
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from google.api_core import exceptions as google_exceptions
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, wait_random_exponential
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import nltk

## Configuration

Download NLTK data for BLEU, initialize ROUGE scorer.

In [None]:
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# nltk.download('punkt_tab')

In [None]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
import google.generativeai as genai

GOOGLE_API_KEY = "AIzaSyDac912hYoC4czAPjFan_Ix6SgnVQbqWJA"
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-1.5-flash')
embedding_model = genai.get_model('models/text-embedding-004')

## Pre-Processing

In [None]:
df = pd.read_csv('Customer_Support_Training_Dataset.csv')
df = df.rename(columns={"instruction": "prompt"})
df = df.dropna(subset=["prompt", "response"])
df = df.sample(n=200, random_state=42)

## Word Embedding & HNSW Indexing

An empty dictionary `cache` is declared to cache the result of text embedding, with the primary goal to avoid repeated calculation, optimizing the embedding process.

After resetting the maximum retrying attempt if request failed, the text embedding is performed, followed with HNSW (Hierarchical Navigable Small World, an ANN algorithm), which accelerates vector searching with indexing.

In [None]:
cached_embeddings = {}

@retry(stop=stop_after_attempt(3), wait=wait_random_exponential(min=1, max=5))
def get_embedding(text):
    if text not in cached_embeddings:
        # Generate & cache embedding vectors
        response = genai.embed_content(
            model='models/text-embedding-004',
            content=text,
            task_type='retrieval_document'
        )
        cached_embeddings[text] = np.array(response['embedding'], dtype=np.float32)
    return cached_embeddings[text]

embeddings = np.array([get_embedding(text) for text in tqdm(df['prompt'])], dtype=np.float32)

# Indexing
dim = embeddings.shape[1]
index = hnswlib.Index(space='cosine', dim=dim)
index.init_index(max_elements=1000, ef_construction=200, M=16)
index.add_items(embeddings, ids=np.arange(len(embeddings)))


  0%|          | 0/200 [00:00<?, ?it/s][A
  0%|          | 1/200 [00:00<01:13,  2.69it/s][A
  1%|          | 2/200 [00:00<01:07,  2.94it/s][A
  2%|▏         | 3/200 [00:01<01:05,  3.03it/s][A
  2%|▏         | 4/200 [00:01<01:03,  3.07it/s][A
  2%|▎         | 5/200 [00:01<01:03,  3.09it/s][A
  3%|▎         | 6/200 [00:01<01:02,  3.09it/s][A
  4%|▎         | 7/200 [00:02<01:06,  2.88it/s][A
  4%|▍         | 8/200 [00:02<01:04,  2.96it/s][A
  4%|▍         | 9/200 [00:03<01:35,  2.01it/s][A
  5%|▌         | 10/200 [00:04<01:33,  2.03it/s][A
  6%|▌         | 11/200 [00:04<01:42,  1.84it/s][A
  6%|▌         | 12/200 [00:05<01:38,  1.91it/s][A
  6%|▋         | 13/200 [00:05<01:35,  1.97it/s][A
  7%|▋         | 14/200 [00:06<01:32,  2.01it/s][A
  8%|▊         | 15/200 [00:06<01:30,  2.05it/s][A
  8%|▊         | 16/200 [00:07<01:27,  2.10it/s][A
  8%|▊         | 17/200 [00:07<01:26,  2.11it/s][A
  9%|▉         | 18/200 [00:07<01:25,  2.12it/s][A
 10%|▉         | 19/200 [00:0

## AI Service Implementation

### Helper Functions

In [None]:
def parse_json_response(text):
    return json.loads(text.strip("```json\n").strip("```"))

def fallback_text_response(text):
    return {"response": text, "confidence": "medium", "source": "direct"}

# Case: no matching content in the vector DB
def default_error_response():
    return {"response": "Please wait while transferring to a human agent.", "confidence": "low", "source": "fallback"}

# Case: quota exceeded (429 Resource Exhausted)
def handle_quota_error(error):
    print("Gemini drained. Wait for 5 minutes...")
    time.sleep(300)

# Case: generic error
def handle_general_error(error):
    return {"response": "Unexpected technical error encountered. Please try later.", "confidence": "low", "source": "error"}

### Implementation

In [None]:
examples = [
    {"q": "How do I reset my password?", "a": "To reset your password, go to the login page and click 'Forgot Password'."},
    {"q": "Where can I find my billing information?", "a": "Billing info is under 'My Account' > 'Billing'."},
    {"q": "Can I change my email address?", "a": "Yes, go to settings and update your email."},
]
example_text = "\n".join([f"User: {ex['q']}\nAgent: {ex['a']}" for ex in examples])

instruction = "Please answer in JSON format with fields 'response' and 'confidence'."

In [None]:
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=2, min=10, max=300), retry=retry_if_exception_type(google_exceptions.ResourceExhausted))

def generate_answer(question, context):
    # Fill with prompt
    prompt = f"""
    **Customer Support Guidelines**

    Context:
    {context}

    Example Interactions:
    {example_text}

    Current Query:
    {question}

    {instruction}
    """
    try:
        response = model.generate_content(prompt)
        if not response.text:
            return default_error_response()
        try:
            return parse_json_response(response.text)
        except json.JSONDecodeError:
            return fallback_text_response(response.text)
    except google_exceptions.ResourceExhausted as e:
        handle_quota_error(e)
        raise
    except Exception as e:
      return handle_general_error(e)

## Metrics

In [None]:
def calculate_metrics(generated_text, true_answer):
    metrics = {}

    # Cosine Similarity - focus on semantic similarity
    gen_embedding = get_embedding(generated_text).reshape(1, -1)
    true_embedding = get_embedding(true_answer).reshape(1, -1)
    metrics['cosine_similarity'] = cosine_similarity(gen_embedding, true_embedding)[0][0]

    # BLEU (1-gram to 4-gram) - focus on literal matching
    metrics['bleu'] = sentence_bleu(
        [nltk.word_tokenize(true_answer)],
        nltk.word_tokenize(generated_text)
    )

    # ROUGE (1, 2, L) - Recall
    rouge_scores = rouge_scorer.score(true_answer, generated_text)
    for key in rouge_scores:
        metrics[f"rouge_{key}"] = rouge_scores[key].fmeasure

    return metrics

## Results

Workflow: User Input -> Vector Tranformation -> Search Similar Questions -> Generate Answers -> Metric Evaluation

In [None]:
def process_single_query(row):
    query = row['prompt']
    true_answer = row['response']
    query_vec = get_embedding(query).reshape(1, -1)

    indices, _ = index.knn_query(query_vec, k=3)
    context = "\n".join(df.iloc[idx]['response'] for idx in indices[0])

    gen_output = generate_answer(query, context)
    gen_text = gen_output.get('response', '')

    if not gen_text:
        return {
            "query": query,
            "generated": gen_output,
            "ground_truth": true_answer,
            "error": "empty_response"
        }

    metrics = calculate_metrics(gen_text, true_answer)

    return {
        "query": query,
        "generated": gen_output,
        "ground_truth": true_answer,
        "metrics": metrics,
        "confidence": gen_output.get('confidence', 'unknown')
    }

results = [process_single_query(row) for _, row in tqdm(df.iterrows(), total=len(df))]


  0%|          | 0/200 [00:00<?, ?it/s][A
  0%|          | 1/200 [00:01<06:01,  1.82s/it][A
  1%|          | 2/200 [00:03<05:57,  1.81s/it][A
  2%|▏         | 3/200 [00:05<05:47,  1.76s/it][A
  2%|▏         | 4/200 [00:06<05:22,  1.64s/it][A
  2%|▎         | 5/200 [00:08<05:26,  1.68s/it][A
  3%|▎         | 6/200 [00:10<05:16,  1.63s/it][A
  4%|▎         | 7/200 [00:11<05:07,  1.59s/it][A
  4%|▍         | 8/200 [00:13<05:19,  1.66s/it][A
  4%|▍         | 9/200 [00:15<05:28,  1.72s/it][A
  5%|▌         | 10/200 [00:16<05:26,  1.72s/it][A
  6%|▌         | 11/200 [00:19<05:47,  1.84s/it][A
  6%|▌         | 12/200 [00:21<06:17,  2.01s/it][A
  6%|▋         | 13/200 [00:23<06:11,  1.99s/it][A
  7%|▋         | 14/200 [00:25<06:09,  1.98s/it][A
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothes

## Analysis

In [None]:
def analyze_results(results):
    # Initialize trackers
    analysis = {
        'confidence_calibration': {'high': 0, 'medium': 0, 'low': 0},
        'error_types': {},
        'average_metrics': {
            'cosine_similarity': 0,
            'bleu': 0,
            'rouge_rouge1': 0,
            'rouge_rouge2': 0,
            'rouge_rougeL': 0
        },
        'threshold_counts': {
            'cosine_high': 0,  # >=0.7
            'cosine_medium': 0,  # 0.5-0.7
            'cosine_low': 0,  # <0.5
            'rougeL_high': 0,  # >=0.7
            'rougeL_low': 0   # <0.7
        }
    }

    valid_results = [r for r in results if 'metrics' in r]

    for r in valid_results:
        metrics = r['metrics']
        confidence = r['confidence']

        # Confidence calibration: case with low-quality answer but high labels
        if confidence == 'high':
            analysis['confidence_calibration']['high'] += 1
        elif confidence == 'medium':
            analysis['confidence_calibration']['medium'] += 1
        else:
            analysis['confidence_calibration']['low'] += 1

        for metric in analysis['average_metrics']:
            analysis['average_metrics'][metric] += metrics[metric]

        # Threshold counts for Cosine Similarity
        if metrics['cosine_similarity'] >= 0.7:
            analysis['threshold_counts']['cosine_high'] += 1
        elif metrics['cosine_similarity'] >= 0.5:
            analysis['threshold_counts']['cosine_medium'] += 1
        else:
            analysis['threshold_counts']['cosine_low'] += 1

        if metrics['rouge_rougeL'] >= 0.7:
            analysis['threshold_counts']['rougeL_high'] += 1
        else:
            analysis['threshold_counts']['rougeL_low'] += 1

    # Average Calculattion
    num_valid = len(valid_results)
    for metric in analysis['average_metrics']:
        analysis['average_metrics'][metric] /= num_valid if num_valid > 0 else 1

    # Error analysis
    error_results = [r for r in results if 'error' in r]
    for r in error_results:
        error_type = r['error']
        analysis['error_types'][error_type] = analysis['error_types'].get(error_type, 0) + 1

    return analysis

In [None]:
analysis = analyze_results(results)

print("\n=== Evaluation Report ===")
print(f"\n**Confidence Distribution:**")
print(f"High: {analysis['confidence_calibration']['high']}")
print(f"Medium: {analysis['confidence_calibration']['medium']}")
print(f"Low: {analysis['confidence_calibration']['low']}")

print("\n**Average Scores:**")
for metric, score in analysis['average_metrics'].items():
    print(f"{metric}: {score:.3f}")

print("\n**Threshold Counts:**")
print(f"Cosine High (>=0.7): {analysis['threshold_counts']['cosine_high']}")
print(f"Cosine Medium (0.5-0.7): {analysis['threshold_counts']['cosine_medium']}")
print(f"Cosine Low (<0.5): {analysis['threshold_counts']['cosine_low']}")
print(f"ROUGE-L High (>=0.7): {analysis['threshold_counts']['rougeL_high']}")
print(f"ROUGE-L Low (<0.7): {analysis['threshold_counts']['rougeL_low']}")

print("\n**Error Analysis:**")
for error, count in analysis['error_types'].items():
    print(f"{error}: {count}")


=== Evaluation Report ===

**Confidence Distribution:**
High: 0
Medium: 0
Low: 200

**Average Scores:**
cosine_similarity: 0.786
bleu: 0.048
rouge_rouge1: 0.170
rouge_rouge2: 0.085
rouge_rougeL: 0.133

**Threshold Counts:**
Cosine High (>=0.7): 135
Cosine Medium (0.5-0.7): 65
Cosine Low (<0.5): 0
ROUGE-L High (>=0.7): 1
ROUGE-L Low (<0.7): 199

**Error Analysis:**


## Custom Test

In [None]:
def test_custom_query(q):
    vec = get_embedding(q).reshape(1, -1)
    indices, _ = index.knn_query(vec, k=3)
    context = "\n".join(df.iloc[idx]['response'] for idx in indices[0])
    response = generate_answer(q, context)
    print(f"\nQuery: {q}\nResponse: {response['response']}\nConfidence: {response['confidence']}")

test_custom_query("Can I use ApplePay?")
test_custom_query("Can I cancel my order?")
test_custom_query("Can I update my mailing address?")


Query: Can I use ApplePay?
Response: Yes, we accept Apple Pay as a payment method.  However, availability may depend on your location and the specific product or service you are purchasing.  If you encounter any issues, please let me know.
Confidence: 0.95

Query: Can I cancel my order?
Response: Yes, you can cancel your order.  To do so, please follow these steps:

1. Log in to your {{Online Company Portal Info}} using your credentials.
2. Navigate to the '{{Online Order Interaction}}' section.
3. Locate the order with number {{Order Number}} and click on it.
4. Select the '{{Online Order Interaction}}' option to initiate the cancellation.
5. Follow any additional instructions provided by the system.

If you encounter any problems, please contact us during {{Customer Support Hours}} at {{Customer Support Phone Number}} or use the Live Chat feature on our {{Website URL}}.
Confidence: 0.95

Query: Can I update my mailing address?
Response: Yes, you can update your mailing address. You 