In [None]:
import pandas as pd
import subprocess
import re

# Load the question set. process_row reads the columns "Question",
# "Option A", "Option B", "Option C" and "Answer" from every row.
# NOTE(review): unpickling can execute arbitrary code; only load
# "dataframe.pkl" if it was produced by a trusted source.
df = pd.read_pickle("dataframe.pkl")

_ANSWER_LETTER_RE = re.compile(r"\b([ABC])\b")
_THINK_CLOSE_RE = re.compile(r"</think>", re.IGNORECASE)
_THINK_OPEN_RE = re.compile(r"<think>", re.IGNORECASE)


def _extract_choice(model_output):
    """Return the first standalone A/B/C in the model's final answer, or None."""
    # Reasoning models such as deepseek-r1 print their chain of thought inside
    # <think>...</think> before the answer. That text is full of standalone
    # "a" articles, so it must be discarded before looking for the letter.
    answer_text = _THINK_CLOSE_RE.split(model_output)[-1]
    # An opening tag that is still present was never closed: everything after
    # it is unfinished reasoning, not an answer.
    answer_text = _THINK_OPEN_RE.split(answer_text, maxsplit=1)[0]
    match = _ANSWER_LETTER_RE.search(answer_text.upper())
    return match.group(1) if match else None


def process_row(idx, row, *, model="deepseek-r1:7b", timeout=30):
    """Ask a local Ollama model one multiple-choice question and grade the reply.

    Args:
        idx: Identifier echoed back as the first element of the result.
        row: Mapping with the keys "Question", "Option A", "Option B",
            "Option C" and "Answer" (the expected letter, any case/padding).
        model: Model tag handed to ``ollama run``.
        timeout: Seconds to wait for the model before giving up on the row.

    Returns:
        A tuple ``(idx, outcome, error, correct_answer)``. On success
        ``outcome`` is ``True``/``False`` and ``error`` is ``None``. On any
        failure ``outcome`` is the string ``"error"`` and ``error`` holds a
        human-readable message.
    """
    question = row["Question"]
    option_a = row["Option A"]
    option_b = row["Option B"]
    option_c = row["Option C"]
    correct_answer = row["Answer"].strip().upper()

    prompt = (
        "You are a helpful assistant that strictly follows instructions.\n\n"
        f"Question: {question}\n"
        f"Options:\n"
        f"A) {option_a}\n"
        f"B) {option_b}\n"
        f"C) {option_c}\n\n"
        "Choose the correct answer by responding with only one letter (A, B, or C). "
        "No explanation, just the letter.\n"
        "Example response: A"
    )

    try:
        result = subprocess.run(
            ["ollama", "run", model],
            input=prompt,
            capture_output=True,
            check=True,
            timeout=timeout,
            # Decode once at the process boundary; a stray undecodable byte
            # should not throw away an otherwise usable answer.
            encoding="utf-8",
            errors="replace",
        )
    except subprocess.CalledProcessError as e:
        return (idx, "error", f"Process failed: {(e.stderr or '').strip()}", correct_answer)
    except Exception as e:
        # Per-row boundary: timeouts, a missing `ollama` binary, etc. are
        # reported for this row instead of aborting the whole run.
        return (idx, "error", f"Runtime error: {str(e)}", correct_answer)

    stdout = result.stdout.upper().strip()
    model_response = _extract_choice(stdout)

    if model_response is None:
        return (idx, "error", f"No valid answer in: '{stdout}'", correct_answer)

    return (idx, model_response == correct_answer, None, correct_answer)

# Bookkeeping for the evaluation run
total_rows = len(df)
predicted_results = [None] * total_rows
errors = []
total_valid = 0
correct_answers = 0

print(f"Starting sequential processing of {total_rows} rows...\n")

for i, (_, row) in enumerate(df.iterrows()):
    idx, result, error, correct = process_row(i, row)

    # Progress prefix; the outcome for this row is appended on the same line
    done = i + 1
    progress = done / total_rows * 100
    print(f"Processed {done}/{total_rows} ({progress:.2f}%) - ", end="")

    # Failed rows are logged and excluded from the accuracy denominator
    if error:
        errors.append(f"Row {idx}: {error}")
        predicted_results[idx] = "error"
        print(f"Error: {error}")
        continue

    predicted_results[idx] = result
    total_valid += 1
    if result:
        correct_answers += 1
        verdict = "Correct"
    else:
        verdict = "Incorrect"
    print(f"Result: {verdict}")

# Summary: accuracy is measured over the rows that produced a valid answer
accuracy = 0 if total_valid <= 0 else correct_answers / total_valid * 100

print("\n\nFinal Results:")
print(f"Valid questions: {total_valid}/{total_rows}")
print(f"Accuracy: {accuracy:.2f}%")
print(f"Errors encountered: {len(errors)}")

if errors:
    print("\nFirst 5 errors:")
    for message in errors[:5]:
        print(message)

Deepseek 1.5B

7.5 seconds timeout

Final Results:
Valid questions: 43/200
Accuracy: 51.16%
Errors encountered: 157

15 seconds timeout

Final Results:
Valid questions: 142/200
Accuracy: 38.73%
Errors encountered: 58

30 seconds timeout

Final Results:
Valid questions: 132/200
Accuracy: 40.15%
Errors encountered: 68

60 seconds timeout

Final Results:
Valid questions: 181/200
Accuracy: 40.33%
Errors encountered: 19

120 seconds timeout

Final Results:
Valid questions: 192/200
Accuracy: 33.33%
Errors encountered: 8

Local 7B:

Final Results:
Valid questions: 27/100
Accuracy: 48.15%
Errors encountered: 73

Final Results:
Valid questions: 166/200
Accuracy: 40.96%
Errors encountered: 34
