In [None]:
import concurrent.futures
import os

import pandas as pd
from openai import OpenAI

# Read the API key from the environment so a real secret never has to be pasted into
# the notebook. The original "xxx" placeholder is kept as the fallback, so the cell
# behaves exactly as before when DEEPSEEK_API_KEY is not set.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY", "xxx"),
    base_url="https://api.deepseek.com/v1",
)
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
df = pd.read_pickle("dataframe_perturbated_maths.pkl")  # or "dataframe_perturbated.pkl"

# Function to process a single row
def process_row(idx, row):
    """Ask the remote chat model one multiple-choice question and grade its reply.

    The question and its three options are formatted into a single prompt and sent
    through the module-level ``client`` to the "deepseek-chat" model.

    Args:
        idx: Identifier of the row; returned unchanged so the caller can match the
            result to its datapoint.
        row: Mapping (for example a pandas Series) providing the keys "Question",
            "Option A", "Option B", "Option C" and "Answer".

    Returns:
        A 4-tuple ``(idx, outcome, error, correct_answer)`` where

        * ``outcome`` is ``True``/``False`` when the model produced a usable letter,
          or the string ``"error"`` when it did not or when the call failed;
        * ``error`` is ``None`` on success, otherwise a human-readable message;
        * ``correct_answer`` is the expected letter, stripped and upper-cased.

    Raises:
        Whatever reading the row raises (e.g. ``KeyError`` for a missing column, or
        ``AttributeError`` if "Answer" is not a string); these lookups happen before
        the guarded API call. Exceptions from the API call itself are caught and
        reported through the returned tuple.
    """
    question = row["Question"]
    option_a = row["Option A"]
    option_b = row["Option B"]
    option_c = row["Option C"]
    correct_answer = row["Answer"].strip().upper()

    prompt = (
        f"Question: {question}\n"
        f"Options:\n"
        f"A) {option_a}\n"
        f"B) {option_b}\n"
        f"C) {option_c}\n\n"
        "Choose the correct answer by responding with only one letter (A, B, or C). No explanation, just the letter.\n"
        "Example response: A"
    )

    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that strictly follows instructions."},
                {"role": "user", "content": prompt},
            ],
            stream=False
        )
        content = response.choices[0].message.content
        if content is None:
            # An empty message is a format problem, not a transport failure; report it as such.
            return (idx, "error", "Unexpected answer format: empty response", correct_answer)

        model_response = content.strip().upper()
        # Tolerate harmless decoration around the letter ("A.", "(A)", "**A**", "'A'").
        # Anything that still contains more than the bare letter stays an error.
        answer_letter = model_response.strip(" \t\r\n.,:;!?()[]{}*_`'\"")
        if answer_letter not in ("A", "B", "C"):
            # Signal an unusable reply with the "error" marker and a description of what came back.
            return (idx, "error", f"Unexpected answer format: '{model_response}'", correct_answer)
        return (idx, answer_letter == correct_answer, None, correct_answer)
    except Exception as e:
        return (idx, "error", f"API call error: {str(e)}", correct_answer)

# One slot per datapoint; a slot stays None until that row's result arrives
predicted_results = [None] * len(df)

# Tallies that only count responses which came back without an error
total_valid_questions = 0
correct_answers = 0
errors = []

# Workers receive positional indices (0, 1, 2, ...) so every result can be written
# back to its original slot even though completions arrive out of order.
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
    futures = {}
    for i, (_, row) in enumerate(df.iterrows()):
        futures[executor.submit(process_row, i, row)] = i

    # Consume results in completion order
    for future in concurrent.futures.as_completed(futures):
        i, result, error, correct_answer = future.result()
        if not error:
            # Usable response: store the verdict and count it as valid
            predicted_results[i] = result
            total_valid_questions += 1
            if result is True:
                correct_answers += 1
        else:
            # Failed rows are listed separately and flagged as "error" in the results list
            errors.append(f"Row {i}: {error}")
            predicted_results[i] = "error"

# Accuracy over valid (non-error) responses only; 0 when nothing valid came back
if total_valid_questions > 0:
    accuracy = correct_answers / total_valid_questions * 100
else:
    accuracy = 0

# Summary report
print(f"Total valid questions processed: {total_valid_questions}")
print(f"Correct answers: {correct_answers}")
print(f"Accuracy (excluding errors): {accuracy:.2f}%")
print("\nPrediction results (list):")
print(predicted_results)
if errors:
    print("\nErrors encountered:")
    print("\n".join(errors))


Deepseek chat:

- normal dataset:

Total questions processed: 1500
Correct answers: 851
Accuracy: 56.73%

- perturbated dataset:

Total questions processed: 1500
Correct answers: 806
Accuracy: 53.73%

- perturbated math set
Total valid questions processed: 998
Correct answers: 528
Accuracy (excluding errors): 52.91%

Deepseek reasoning:

- normal dataset:

Total questions processed: 1500
Correct answers: 1484
Accuracy: 98.93%

- perturbated dataset:

Total questions processed: 1500
Correct answers: 1454
Accuracy: 96.93%

- perturbated math set

Total valid questions processed: 519
Correct answers: 506
Accuracy (excluding errors): 97.50%

GPT-4 o1

Total valid questions processed: 204
Correct answers: 204
Accuracy (excluding errors): 100.00%

GPT-3.5 Turbo
Total valid questions processed: 199
Correct answers: 43
Accuracy (excluding errors): 21.61%



# Running locally (Ollama)

In [27]:
import pandas as pd
import subprocess
import logging
import time

# Root logger setup: INFO and above, each record prefixed with a timestamp and its level
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Questions for this run; the other pickle loaded in this notebook is "dataframe_perturbated_maths.pkl"
df = pd.read_pickle("dataframe_perturbated.pkl")

# Function to process a single row using the locally hosted deepseek model
def process_row(idx, row, model="deepseek-r1:7b", timeout=None):
    """Ask a locally hosted model (via the ``ollama`` CLI) one question and grade the reply.

    The prompt is piped to ``ollama run <model>`` on stdin and the answer is read
    from stdout. Reasoning models such as deepseek-r1 print their chain of thought
    before the final answer; that preamble is discarded so only the text after it
    is validated.

    Args:
        idx: Identifier of the row; returned unchanged and used in log messages.
        row: Mapping (for example a pandas Series) providing the keys "Question",
            "Option A", "Option B", "Option C" and "Answer".
        model: Name of the Ollama model to run. Defaults to "deepseek-r1:7b".
        timeout: Optional limit in seconds for the subprocess; ``None`` (the
            default) waits indefinitely.

    Returns:
        A 4-tuple ``(idx, is_correct, error, correct_answer)``. ``is_correct`` is
        ``False`` whenever the reply is unusable or the call failed; in those cases
        ``error`` carries a message, otherwise it is ``None``. ``correct_answer``
        is the expected letter, stripped and upper-cased.

    Raises:
        Whatever reading the row raises (e.g. ``KeyError`` for a missing column);
        these lookups happen before the guarded subprocess call. Failures of the
        subprocess itself are caught, logged and reported through the tuple.
    """
    question = row["Question"]
    option_a = row["Option A"]
    option_b = row["Option B"]
    option_c = row["Option C"]
    correct_answer = row["Answer"].strip().upper()

    prompt = (
        f"Question: {question}\n"
        f"Options:\n"
        f"A) {option_a}\n"
        f"B) {option_b}\n"
        f"C) {option_c}\n\n"
        "Choose the correct answer by responding with only one letter (A, B, or C). No explanation, just the letter.\n"
        "Example response: A"
    )

    start_time = time.time()
    try:
        # Run the model locally using the ollama CLI command.
        result = subprocess.run(
            ["ollama", "run", model],
            input=prompt.encode("utf-8"),
            capture_output=True,
            check=True,
            timeout=timeout,
        )
        raw_output = result.stdout.decode("utf-8")
        elapsed = time.time() - start_time
        logging.info(f"Row {idx} processed in {elapsed:.2f} seconds.")

        # Keep only what follows the reasoning preamble. deepseek-r1 wraps its reasoning
        # in <think>...</think>; the second marker is presumably what newer Ollama CLI
        # builds print instead - TODO confirm against the installed version.
        answer_text = raw_output
        for marker in ("</think>", "...done thinking."):
            if marker in answer_text:
                answer_text = answer_text.rsplit(marker, 1)[1]

        model_response = answer_text.strip().upper()
        # Tolerate harmless decoration around the letter ("A.", "(A)", "**A**").
        answer_letter = model_response.strip(" \t\r\n.,:;!?()[]{}*_`'\"")

        # Validate response format
        if answer_letter not in ("A", "B", "C"):
            return (idx, False, f"Unexpected answer format: '{model_response}'", correct_answer)
        return (idx, answer_letter == correct_answer, None, correct_answer)
    except Exception as e:
        elapsed = time.time() - start_time
        logging.error(f"Row {idx} encountered an error in {elapsed:.2f} seconds: {str(e)}")
        return (idx, False, f"Local API call error: {str(e)}", correct_answer)

# Running tallies for the whole evaluation run
errors = []
correct_answers = 0
total_questions = 0

logging.info("Starting sequential processing of questions...")
# Rows are evaluated one at a time, in DataFrame order
for row_label, row in df.iterrows():
    idx, is_correct, error, correct_answer = process_row(row_label, row)
    total_questions += 1
    correct_answers += 1 if is_correct else 0
    if error:
        errors.append(f"Row {idx}: {error}")

# Share of correct answers among all processed rows (rows with errors stay in the denominator)
if total_questions > 0:
    accuracy = correct_answers / total_questions * 100
else:
    accuracy = 0

# Summary goes through the logger so it carries the same timestamps as the per-row lines
for summary_line in (
    f"Total questions processed: {total_questions}",
    f"Correct answers: {correct_answers}",
    f"Accuracy: {accuracy:.2f}%",
):
    logging.info(summary_line)

if errors:
    logging.error("Errors encountered during processing:")
    for message in errors:
        logging.error(message)


2025-03-30 08:09:33 [INFO] Starting sequential processing of questions...
2025-03-30 08:10:04 [INFO] Row 0 processed in 30.79 seconds.
2025-03-30 08:11:00 [INFO] Row 1 processed in 56.63 seconds.


KeyboardInterrupt: 