# In [None]:
import os
import json
import time
import socket
import subprocess
import requests
from tqdm import tqdm  

# In [None]:
# Network address at which the Ollama HTTP API is expected to listen
PORT = 11434
HOST = "127.0.0.1"

# JSON file read as input and JSON file the evaluations are written to
OUTPUT_FILE = "control_evaluations_llama.json"
INPUT_FILE = "control_results.json"

# Intended to request GPU inference. The variable is placed in this process's
# environment, so it only reaches a daemon that this script spawns itself.
# NOTE(review): confirm that Ollama actually honours OLLAMA_CUDA.
os.environ.update({"OLLAMA_CUDA": "1"})


# Daemon management functions
def ollama_running():
    """Report whether something accepts TCP connections on HOST:PORT.

    A connection is attempted with a one-second timeout and closed again
    immediately; no data is exchanged.

    Returns:
        True when the connection was opened and closed without an OSError,
        False otherwise.
    """
    endpoint = (HOST, PORT)
    try:
        # The probe socket is only needed to prove the port is reachable.
        with socket.create_connection(endpoint, timeout=1):
            pass
    except OSError:
        return False
    else:
        return True


def kill_ollama_processes(port=PORT):
    """Force-stop every process owning a TCP connection on a local port.

    Windows only: the work is delegated to PowerShell
    (``Get-NetTCPConnection`` piped into ``Stop-Process -Force``). The call
    is best effort; PowerShell's exit status is ignored (``check=False``).

    Args:
        port: Local TCP port number. It is coerced with ``int()`` before
            being interpolated into the PowerShell command, so a
            non-numeric value raises ``ValueError`` instead of being
            executed as script text.
    """
    port = int(port)
    ps_script = (
        # SilentlyContinue: having no connection on the port is the normal
        # case here and should not print a PowerShell error.
        f"Get-NetTCPConnection -LocalPort {port} -ErrorAction SilentlyContinue |"
        " Select-Object -ExpandProperty OwningProcess |"
        # One process can own several connections; stop each PID only once.
        " Sort-Object -Unique |"
        # Rows not attached to a live process are reported with PID 0,
        # which Stop-Process cannot terminate.
        " Where-Object { $_ -ne 0 } |"
        " ForEach-Object { Stop-Process -Id $_ -Force }"
    )
    subprocess.run(["powershell", "-Command", ps_script], check=False)


def start_daemon():
    """Stop whatever holds the Ollama port, then launch ``ollama serve``.

    The daemon's stdout and stderr are discarded. They were previously
    attached to pipes that nothing ever read, so a long-running daemon could
    block on a write once the OS pipe buffer filled up.

    Returns:
        The ``subprocess.Popen`` handle of the background ``ollama serve``
        process. The call does not wait for the daemon to become ready.

    Raises:
        OSError: If the ``ollama`` executable cannot be started.
    """
    kill_ollama_processes()
    return subprocess.Popen(
        ["ollama", "serve"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )


# Ensure the Ollama daemon is running
if not ollama_running():
    serve_proc = start_daemon()
    print(f"Starting Ollama daemon (PID {serve_proc.pid})...")
    # Wait until the port actually accepts connections instead of sleeping a
    # fixed time: a fixed sleep is too short on a slow start and never
    # notices a daemon that exited immediately.
    startup_deadline = time.monotonic() + 30
    while not ollama_running():
        if serve_proc.poll() is not None:
            raise RuntimeError(
                f"Ollama daemon exited during startup (code {serve_proc.returncode})"
            )
        if time.monotonic() >= startup_deadline:
            raise RuntimeError(
                "Ollama daemon did not start listening within 30 seconds"
            )
        time.sleep(0.5)
else:
    print("Ollama daemon already running.")


# Load data
# JSON is read as UTF-8 explicitly; relying on the platform default encoding
# (a legacy code page on Windows) garbles or rejects non-ASCII text.
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)
print(f"Loaded {len(data)} items from {INPUT_FILE}")

# Per-item evaluation records, filled in by the evaluation loop
results = []

total_items = len(data)


def evaluate_item(item, index, total, *, model="gemma3:27b"):
    """Ask an Ollama-hosted judge model to evaluate one QA record.

    A prompt containing the question, the expected answer and the model
    answer is posted to the ``/api/generate`` endpoint at HOST:PORT, and the
    first word of the reply is taken as the verdict.

    Args:
        item: Mapping with the keys ``'question'``, ``'expected_answer'`` and
            ``'model_answer'``.
        index: Zero-based position of the item; used for progress messages.
        total: Total number of items; used for progress messages.
        model: Name of the Ollama model to query.

    Returns:
        The first word of the model's reply with surrounding punctuation
        removed (the prompt asks for YES/NO, but the reply is not validated),
        or the string ``"ERROR"`` if the request or the parsing failed.

    Raises:
        KeyError: If ``item`` lacks one of the required keys (these are read
            outside the guarded section).
    """
    print(f"Evaluating item {index+1}/{total}: {item['question'][:60]}...")

    # Construct the evaluation prompt
    prompt = (
        "Your task is to evaluate the outputs of another model on a QA task. "
        "Structure is as follows: question, expected answer, model answer.\n"
        f"Question: {item['question']}\n"
        f"Expected answer: {item['expected_answer']}\n"
        f"Model answer: {item['model_answer']}\n"
        "Reply in a single word: YES/NO?"
    )

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        # Sampling parameters belong under "options"; a top-level
        # "temperature" key is not applied by the generate endpoint.
        "options": {"temperature": 0.0},
    }

    # Characters models commonly wrap around a one-word verdict
    # ("YES.", "**NO**", '"Yes"').
    wrapping_chars = ".,;:!?*\"'`()[]"

    try:
        resp = requests.post(
            f"http://{HOST}:{PORT}/api/generate",
            json=payload,
            timeout=120
        )
        resp.raise_for_status()
        result_json = resp.json()

        if 'response' in result_json:
            text = result_json['response']
        else:
            # Fallback for a chat-completion shaped reply
            text = result_json['choices'][0]['message']['content']

        words = text.split()
        if not words:
            raise ValueError("model returned an empty response")
        # Keep the raw word if it consists of punctuation only.
        evaluation = words[0].strip(wrapping_chars) or words[0]
    except Exception as e:
        # Deliberately broad: one failing item must not abort the whole
        # batch; the failure is reported and recorded as "ERROR".
        print(f"Error on item {index+1}: {e}")
        evaluation = "ERROR"

    print(f"→ Got evaluation: {evaluation}\n")
    return evaluation


# Evaluations
for idx, item in enumerate(tqdm(data, desc="Evaluating QA pairs", unit="pair")):
    eval_result = evaluate_item(item, idx, total_items)
    results.append({
        'question': item['question'],
        'evaluation': eval_result
    })

# Save results and print accuracy
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)
print(f"Wrote {len(results)} evaluation results to {OUTPUT_FILE}")

# Compare verdicts tolerantly: a reply such as "Yes." or "**YES**" is still a
# YES and must not be scored as a wrong answer.
verdicts = [r['evaluation'].strip(".,;:!?*\"'`()[]").upper() for r in results]
num_yes = verdicts.count('YES')
num_errors = verdicts.count('ERROR')
accuracy = (num_yes / len(results)) * 100 if results else 0
print(f"Accuracy: {accuracy:.2f}%")
if num_errors:
    # Failed requests stay in the denominator, so make them visible.
    print(f"Note: {num_errors} of {len(results)} items could not be evaluated (ERROR).")
