In [11]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from ragas.metrics import ContextRelevance

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_ollama import ChatOllama, OllamaEmbeddings
import psycopg2
from psycopg2.extras import RealDictCursor

import logging, sys, nest_asyncio
# Patch asyncio through nest_asyncio (presumably so RAGAS's async evaluation can
# run inside the notebook's already-running event loop -- TODO confirm).
nest_asyncio.apply()

# Route INFO-level records of the "ragas" logger to stdout. The handler is only
# attached when the logger has none yet, so re-running this cell does not
# duplicate every log line.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ragas")
logger.setLevel(logging.INFO)
if len(logger.handlers) == 0:
    logger.addHandler(logging.StreamHandler(sys.stdout))

# Set OpenAI API Key (ensure it's in your environment or set it here)
# os.environ["OPENAI_API_KEY"] = "sk-..."

# Database connection string: DATABASE_URL (or a local default) with everything
# from the first "?" onwards removed, i.e. without any URL query parameters.
raw_db_url = os.environ.get("DATABASE_URL", "postgresql://postgres:postgres@localhost:5432/rag_viz")
DB_URL = raw_db_url.partition("?")[0]

print("Libraries imported and logging configured.")

Libraries imported and logging configured.


In [12]:
# Connect to Database and Fetch Data
import json
import traceback


def extract_contexts(retrievals):
    """Return the ``text`` of every retrieval entry as a list of strings.

    Args:
        retrievals: Value of the ``retrievals`` column for one answer. A list of
            dicts is the expected shape; a JSON-encoded string is decoded first
            (the driver may hand back raw text when the column is not a native
            JSON type -- NOTE(review): confirm the column type).

    Returns:
        list[str]: The ``text`` field of each well-formed entry, in order.
        Anything that is not a list, and any entry without a string ``text``,
        contributes nothing instead of raising.
    """
    if isinstance(retrievals, str):
        try:
            retrievals = json.loads(retrievals)
        except ValueError:
            return []
    if not isinstance(retrievals, list):
        return []
    return [
        entry["text"]
        for entry in retrievals
        if isinstance(entry, dict) and isinstance(entry.get("text"), str)
    ]


conn = None
try:
    conn = psycopg2.connect(DB_URL)

    # Fetch Answers with Questions and Retrievals from JSON field
    query = """
        SELECT 
            a.id as answer_id,
            q.text as question,
            a.text as answer,
            a.retrievals,
            a."llmScore"
        FROM "Answer" a
        JOIN "Question" q ON a."questionId" = q.id
        WHERE a.retrievals IS NOT NULL AND jsonb_array_length(a.retrievals::jsonb) > 0
    """

    # The cursor is closed as soon as the rows are in memory; the connection is
    # closed in the `finally` clause so it is released on every path.
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(query)
        rows = cur.fetchall()

    # Convert to DataFrame
    df = pd.DataFrame(rows)

    print(f"Fetched {len(df)} records with retrievals.")

    if len(df) == 0:
        print("WARNING: No records found with retrievals. Make sure populate_retrievals.ts has been run.")
    else:
        # Extract contexts from the retrievals JSON
        # Each retrieval has: {id, score, text, documentTitle, index}
        # Ragas expects 'contexts' to be a list of strings (the text field)
        df['contexts'] = df['retrievals'].apply(extract_contexts)

        # Select only the columns needed for RAGAS
        # Ragas standard columns: question, answer, contexts
        df = df[['answer_id', 'question', 'answer', 'contexts', 'llmScore']]

        first = df.iloc[0]
        print(f"Sample record:")
        print(f"  Question: {first['question'][:100]}...")
        print(f"  Answer: {first['answer'][:100]}...")
        print(f"  Contexts: {len(first['contexts'])} chunks")
        print(f"  Current llmScore: {first['llmScore']}")

except Exception as e:
    # Best-effort cell: report the failure and keep the notebook running.
    print(f"Database error: {e}")
    traceback.print_exc()
finally:
    # Release the connection even when the query or the processing failed.
    if conn is not None:
        conn.close()

Database error: connection to server at "127.0.0.1", port 5432 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?



Traceback (most recent call last):
  File "C:\Users\Computer\AppData\Local\Temp\ipykernel_20568\3712908999.py", line 3, in <module>
    conn = psycopg2.connect(DB_URL)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "e:\projects\547-vis-project\.venv312\Lib\site-packages\psycopg2\__init__.py", line 135, in connect
    conn = _connect(dsn, connection_factory=connection_factory, **kwasync)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
psycopg2.OperationalError: connection to server at "127.0.0.1", port 5432 failed: Connection refused (0x0000274D/10061)
	Is the server running on that host and accepting TCP/IP connections?



In [13]:
# Configure RAGAS with Ollama (LLM Judge) and OpenAI Embeddings
# Adds OpenAI fallback (gpt-4o-mini) if Ollama model fails or env overrides.
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    LLMContextPrecisionWithoutReference,
)

# Local (Ollama) judge candidates in preference order; the first one is used.
CANDIDATE_MODELS = ["qwen3:8b"]
MODEL_NAME = CANDIDATE_MODELS[0]

# Environment switches. A flag counts as "on" only when its value is exactly "1".
USE_OPENAI_JUDGE = os.environ.get("USE_OPENAI_JUDGE", "0") == "1"          # force OpenAI judge
AUTO_FALLBACK_OPENAI = os.environ.get("AUTO_FALLBACK_OPENAI", "1") == "1"  # retry with OpenAI on failure
OPENAI_JUDGE_MODEL = os.environ.get("OPENAI_JUDGE_MODEL", "gpt-4o-mini")   # OpenAI model used as judge

# Build Ollama-based judge
def build_ollama_llm(model_name: str):
    """Create an Ollama chat model and its RAGAS wrapper, then probe it once.

    The probe sends a single "Health check." prompt and prints the first 80
    characters of the reply. A failing probe is only reported; the model and
    wrapper are returned regardless.

    Args:
        model_name: Name of the Ollama model to load (passed as ``model=``).

    Returns:
        tuple: ``(chat_model, ragas_wrapper)`` -- the ``ChatOllama`` instance
        and the ``LangchainLLMWrapper`` built around it.
    """
    print(f"Initializing Ollama model: {model_name}")
    chat_model = ChatOllama(model=model_name, temperature=0)
    ragas_wrapper = LangchainLLMWrapper(chat_model)
    try:
        reply = chat_model.invoke("Health check.")
        preview = reply.content[:80]
        print(f"Model '{model_name}' responded: {preview}...")
    except Exception as e:
        # Deliberately non-fatal: the caller still receives the model objects.
        print(f"Model '{model_name}' failed probe: {e}")
    return chat_model, ragas_wrapper

# Build (and probe) the local Ollama judge: raw chat model plus RAGAS wrapper.
ollama_model, ollama_llm_wrapper = build_ollama_llm(MODEL_NAME)

# Build OpenAI judge (always available if key set) for fallback or forced usage.
# Any construction error leaves both names set to None so later cells can test
# for availability instead of crashing.
try:
    openai_judge = ChatOpenAI(model=OPENAI_JUDGE_MODEL, temperature=0)
    openai_judge_wrapper = LangchainLLMWrapper(openai_judge)
    print(f"Prepared OpenAI judge model: {OPENAI_JUDGE_MODEL}")
except Exception as e:
    openai_judge = None
    openai_judge_wrapper = None
    print(f"‚ö†Ô∏è Could not initialize OpenAI judge: {e}")

# Choose primary llm wrapper
if USE_OPENAI_JUDGE and openai_judge_wrapper is not None:
    llm = openai_judge_wrapper
    ACTIVE_JUDGE = f"openai:{OPENAI_JUDGE_MODEL} (forced)"
else:
    if USE_OPENAI_JUDGE:
        # The forced judge could not be built; say so instead of silently
        # switching to a different model.
        print("USE_OPENAI_JUDGE=1 but the OpenAI judge is unavailable; using the Ollama judge instead.")
    llm = ollama_llm_wrapper
    ACTIVE_JUDGE = f"ollama:{MODEL_NAME}"

# Embeddings (ensure OPENAI_API_KEY set)
openai_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
embeddings = LangchainEmbeddingsWrapper(openai_embeddings)

# Metrics: bind the chosen judge and the embeddings to every metric that
# exposes the corresponding attribute.
context_precision_no_ref = LLMContextPrecisionWithoutReference()
metrics = [faithfulness, answer_relevancy, context_precision_no_ref]
for metric in metrics:
    if hasattr(metric, "llm"):
        metric.llm = llm
    if hasattr(metric, "embeddings"):
        metric.embeddings = embeddings

print(f"RAGAS metrics configured using judge: {ACTIVE_JUDGE}")
if not USE_OPENAI_JUDGE and openai_judge_wrapper is not None:
    # Report the actual state of the switch rather than always claiming the
    # fallback is on.
    if AUTO_FALLBACK_OPENAI:
        print("OpenAI judge fallback enabled (AUTO_FALLBACK_OPENAI=1).")
    else:
        print("OpenAI judge fallback disabled (AUTO_FALLBACK_OPENAI is not 1).")


Initializing Ollama model: qwen3:8b


  wrapped = LangchainLLMWrapper(m)
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Model 'qwen3:8b' responded: To provide a helpful health check, could you clarify which area you're referring...
Prepared OpenAI judge model: gpt-4o-mini


  openai_judge_wrapper = LangchainLLMWrapper(openai_judge)


RAGAS metrics configured using judge: ollama:qwen3:8b
OpenAI judge fallback enabled (AUTO_FALLBACK_OPENAI=1).


  embeddings = LangchainEmbeddingsWrapper(openai_embeddings)


In [None]:
# Run Evaluation - Sequential Mode (Simplified)
# -----------------------------------------------------------------------------
# Runs evaluation one row at a time to avoid rate limits and so that a failure on one row does not discard the rows already evaluated.
# -----------------------------------------------------------------------------
import os
import time
import pandas as pd
from datasets import Dataset
from ragas import evaluate as ragas_evaluate

# Configuration -- every knob is read from the environment with a default.
EVAL_SAMPLE_LIMIT = int(os.getenv("RAGAS_SAMPLE_LIMIT", "0"))          # 0 => use full dataset
MAX_CONTEXTS_PER_ANSWER = int(os.getenv("RAGAS_MAX_CONTEXTS", "8"))    # limit contexts per answer
CONTEXT_TRUNC_CHARS = int(os.getenv("RAGAS_CONTEXT_TRUNC", "800"))     # truncate each context to N chars
TIMEOUT_SECONDS = int(os.getenv("RAGAS_TIMEOUT", "180"))               # timeout per row
MAX_RETRIES = int(os.getenv("RAGAS_MAX_RETRIES", "3"))
AUTO_FALLBACK_OPENAI = os.getenv("AUTO_FALLBACK_OPENAI", "1") == "1"

print("Evaluation configuration (Sequential):")
print(f"  SAMPLE_LIMIT: {EVAL_SAMPLE_LIMIT if EVAL_SAMPLE_LIMIT>0 else 'FULL'}")
print(f"  TIMEOUT_SECONDS: {TIMEOUT_SECONDS}")

# Guard: the fetch cell must have produced `df` before anything can be evaluated.
if 'df' not in globals():
    raise RuntimeError("DataFrame 'df' not found. Please run the 'Connect to Database and Fetch Data' cell above first.")

# Prepare Data: take a reproducible random sample only when a positive limit
# smaller than the dataset was requested; otherwise work on a copy of all rows.
# Either way the index is reset to 0..n-1.
take_sample = 0 < EVAL_SAMPLE_LIMIT < len(df)
selected_rows = df.sample(n=EVAL_SAMPLE_LIMIT, random_state=42) if take_sample else df.copy()
working_df = selected_rows.reset_index(drop=True)

print(f"Processing {len(working_df)} rows...")

# Sanitization: replace missing question/answer with "", keep at most
# MAX_CONTEXTS_PER_ANSWER contexts, drop non-string or blank ones, and cut each
# remaining context to CONTEXT_TRUNC_CHARS characters.
sanitized_records = []
for _, record in working_df.iterrows():
    candidate_contexts = (record.get("contexts", []) or [])[:MAX_CONTEXTS_PER_ANSWER]
    kept_contexts = []
    for chunk in candidate_contexts:
        if isinstance(chunk, str) and chunk.strip():
            kept_contexts.append(chunk[:CONTEXT_TRUNC_CHARS])
    sanitized_records.append({
        "question": record.get("question", "") or "",
        "answer": record.get("answer", "") or "",
        "contexts": kept_contexts,
    })

sanitized_df = pd.DataFrame(sanitized_records)

# Run Config: fall back to None when this ragas build has no RunConfig.
try:
    from ragas.run_config import RunConfig
    run_config = RunConfig(timeout=TIMEOUT_SECONDS, max_retries=MAX_RETRIES)
except ImportError:
    run_config = None
    print("‚ö†Ô∏è RunConfig not available")

# Sequential Evaluation Loop
def _evaluate_with_judge(dataset, judge):
    """Run RAGAS on ``dataset`` with every LLM-backed metric bound to ``judge``.

    The metrics in ``metrics`` already carry an ``llm`` from the configuration
    cell, and ``ragas.evaluate`` only applies its ``llm=`` argument to metrics
    whose ``llm`` is unset. Passing a different judge to ``evaluate`` alone
    would therefore still use the original one, so the judge is assigned on
    the metrics for the duration of the call and the previous binding is
    restored afterwards.

    Args:
        dataset: Dataset to evaluate (a single row in this notebook).
        judge: Wrapped LLM that should act as judge for this call.

    Returns:
        Whatever ``ragas.evaluate`` returns for the dataset.

    Raises:
        Exception: Anything raised by ``ragas.evaluate`` (``raise_exceptions=True``).
    """
    llm_metrics = [m for m in metrics if hasattr(m, "llm")]
    previous_judges = [m.llm for m in llm_metrics]
    for m in llm_metrics:
        m.llm = judge
    try:
        return ragas_evaluate(
            dataset=dataset,
            metrics=metrics,
            llm=judge,
            embeddings=embeddings,
            raise_exceptions=True,
            run_config=run_config,
        )
    finally:
        for m, old_judge in zip(llm_metrics, previous_judges):
            m.llm = old_judge


results_list = []
print("\nüîÑ Starting Sequential Evaluation...")

_t0 = time.time()

# The fallback is usable only if it is switched on and the OpenAI judge exists.
fallback_available = (
    AUTO_FALLBACK_OPENAI
    and 'openai_judge_wrapper' in globals()
    and openai_judge_wrapper is not None
)

for idx, row in sanitized_df.iterrows():
    print(f"Evaluating row {idx+1}/{len(sanitized_df)}...", end=" ", flush=True)

    # Create single-row dataset. The one-row frame has a non-Range index, so the
    # index is dropped explicitly to keep it from being stored as an extra
    # `__index_level_0__` column.
    single_ds = Dataset.from_pandas(pd.DataFrame([row]), preserve_index=False)

    result = None
    try:
        # Try primary judge
        result = _evaluate_with_judge(single_ds, llm)
        print("‚úÖ Done")
    except Exception as e:
        print(f"‚ùå Failed: {e}")
        # Fallback logic
        if fallback_available:
            print("   Attempting fallback to OpenAI...", end=" ", flush=True)
            try:
                result = _evaluate_with_judge(single_ds, openai_judge_wrapper)
                print("‚úÖ Recovered")
            except Exception as e2:
                print(f"‚ùå Fallback failed: {e2}")
                result = None

    # sanitized_df and working_df share the same 0..n-1 index, so idx is also
    # the position of the original row.
    answer_id = working_df.iloc[idx]['answer_id']
    if result is not None:
        res_df = result.to_pandas()
        # Add back original ID
        res_df['answer_id'] = answer_id
        results_list.append(res_df)
    else:
        # Add a row with NaN metrics if failed, to keep track
        failed_row = row.to_dict()
        failed_row['answer_id'] = answer_id
        results_list.append(pd.DataFrame([failed_row]))

if results_list:
    results_df = pd.concat(results_list, ignore_index=True)

    # Calculate aggregate score as the mean of the metric columns that are
    # actually present; NaN values are skipped per row.
    metric_cols = [m.name for m in metrics]
    existing_metric_cols = [c for c in metric_cols if c in results_df.columns]

    if existing_metric_cols:
        results_df['llm_score_computed'] = results_df[existing_metric_cols].mean(axis=1, skipna=True)

    print(f"\nCompleted in {time.time() - _t0:.1f}s")
    print(f"Results shape: {results_df.shape}")
else:
    print("No results generated.")
    results_df = pd.DataFrame()

Evaluation configuration:
  SAMPLE_LIMIT: FULL
  MAX_CONTEXTS_PER_ANSWER: 8
  CONTEXT_TRUNC_CHARS: 800
  TIMEOUT_SECONDS: 180
  MAX_RETRIES: 3
  BATCH_SIZE: 4
  AUTO_FALLBACK_OPENAI: True
  SEQUENTIAL_MODE: False
  PROGRESS_INTERVAL: 10


RuntimeError: DataFrame 'df' not found. Run the data fetch cell first.

In [None]:
# Export per-answer metric details to JSON for future analysis
import json, os, time

import math

if 'results_df' in globals():
    # Columns written to the export, in output order.
    metrics_export_cols = [
        'answer_id', 'question', 'answer',
        'faithfulness', 'answer_relevancy', 'llm_context_precision_without_reference',
        'llm_score_computed'
    ]
    missing = [c for c in metrics_export_cols if c not in results_df.columns]
    if missing:
        print(f"‚ö†Ô∏è Missing expected columns, cannot export all metrics: {missing}")
    export_cols = [c for c in metrics_export_cols if c in results_df.columns]

    # Rows whose evaluation failed carry NaN scores. json.dump would emit them
    # as the bare token NaN, which is not valid JSON, so they are written as
    # null instead.
    export_records = [
        {
            key: (None if isinstance(value, float) and math.isnan(value) else value)
            for key, value in record.items()
        }
        for record in results_df[export_cols].to_dict(orient='records')
    ]

    # Output goes to ./data/ragas_metrics_<row count>_<timestamp>.json
    out_dir = os.path.join(os.getcwd(), 'data')
    os.makedirs(out_dir, exist_ok=True)
    ts = time.strftime('%Y%m%d_%H%M%S')
    out_path = os.path.join(out_dir, f'ragas_metrics_{len(results_df)}_{ts}.json')
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(export_records, f, ensure_ascii=False, indent=2)
    print(f"‚úÖ Saved metrics JSON: {out_path}")
else:
    print("‚ö†Ô∏è results_df not found; run evaluation cell first.")

In [None]:
# Write LLM Scores Back to Database
# Update the Answer table with computed LLM scores.
# All updates run in one transaction: they are committed together at the end,
# or rolled back together if anything fails.

if 'results_df' not in globals():
    # Without this guard the cell dies with a NameError on a fresh kernel.
    print("results_df not found; run the evaluation cell first.")
elif 'llm_score_computed' in results_df.columns:
    # Start from None so the error path never touches a stale connection object
    # left behind by an earlier cell.
    conn = None
    try:
        conn = psycopg2.connect(DB_URL)

        updated_count = 0
        skipped_count = 0

        print("="*80)
        print("UPDATING DATABASE WITH LLM SCORES:")
        print("="*80 + "\n")

        # Update the llmScore field in the Answer table
        update_query = """
                UPDATE "Answer"
                SET "llmScore" = %s
                WHERE id = %s
            """

        with conn.cursor() as cur:
            for idx, row in results_df.iterrows():
                answer_id = row['answer_id']
                llm_score = row['llm_score_computed']

                # Skip if score is NaN (evaluation failed)
                if pd.isna(llm_score):
                    print(f"‚ö†Ô∏è  Skipping {answer_id}: LLM score is NaN (evaluation failed)")
                    skipped_count += 1
                    continue

                # NumPy scalars are unwrapped to plain Python values before
                # being passed as query parameters (psycopg2 may not know how
                # to adapt them); strings and Python ints pass through as-is.
                if hasattr(answer_id, "item"):
                    answer_id = answer_id.item()

                cur.execute(update_query, (float(llm_score), answer_id))
                print(f"‚úÖ Updated {answer_id}: llmScore = {llm_score:.3f}")
                updated_count += 1

        conn.commit()

        print(f"\n{'='*80}")
        print(f"‚úÖ Successfully updated {updated_count} Answer records with LLM scores")
        if skipped_count > 0:
            print(f"‚ö†Ô∏è  Skipped {skipped_count} records due to NaN scores")
        print(f"{'='*80}")

    except Exception as e:
        print(f"\n‚ùå Error updating database: {e}")
        import traceback
        traceback.print_exc()
        # Undo any uncommitted updates, but only on a connection that was
        # actually opened here and is still open.
        if conn is not None and not conn.closed:
            conn.rollback()
    finally:
        # Release the connection on success and failure alike.
        if conn is not None:
            conn.close()
else:
    print("‚ö†Ô∏è No llm_score_computed column found. Skipping database update.")