In [1]:
!pip install datasets evaluate



In [2]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import random
import numpy as np
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Set random seed for reproducibility
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

# Define model parameters
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
MAX_NEW_TOKENS = 5
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer and model
print(f"Loading model {MODEL_ID} on {DEVICE}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    device_map="auto"
)

# Load IMDB dataset
print("Loading IMDB dataset...")
imdb = load_dataset("imdb")

Loading model meta-llama/Llama-3.2-1B-Instruct on cuda...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading IMDB dataset...


In [5]:
# Prepare shots for few-shot prompting
def prepare_examples(dataset, num_examples=5):
    """Prepare examples for few-shot prompting."""
    # Get balanced examples (equal number of positive and negative)
    pos_examples = [item for item in dataset if item["label"] == 1][:num_examples//2]
    neg_examples = [item for item in dataset if item["label"] == 0][:num_examples//2]

    # If num_examples is odd, we add one more positive example
    if num_examples % 2 == 1 and num_examples > 0:
        pos_examples.append(dataset[dataset.index(random.choice([item for item in dataset if item["label"] == 1]))])

    examples = pos_examples + neg_examples
    random.shuffle(examples)

    return examples

# Format examples for few-shot prompting
def format_few_shot_examples(examples):
    formatted_examples = ""
    for example in examples:
        sentiment = "positive" if example["label"] == 1 else "negative"
        formatted_examples += f"Review: {example['text']}\nSentiment: {sentiment}\n\n"
    return formatted_examples

# Create prompts for different numbers of shots
def create_prompt(text, examples, shot_count):
    system_prompt = (
        "You are a sentiment analysis expert. Given a movie review, classify the sentiment as either positive or negative. "
        "Only output 'positive' or 'negative' without any other text or explanation."
    )

    if shot_count == 0:
        prompt = f"{system_prompt}\n\nReview: {text}\nSentiment:"
    else:
        few_shot_examples = format_few_shot_examples(examples[:shot_count])
        prompt = f"{system_prompt}\n\n{few_shot_examples}Review: {text}\nSentiment:"

    return prompt

# Function for inference
def predict_sentiment(prompt, text, examples, shot_count, max_retries=2):
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

    retries = 0
    while retries <= max_retries:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False,
                temperature=0.1,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id
            )

        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        prediction = full_response[len(prompt):]

        # Clean up the prediction - strip whitespace and lowercase
        prediction = prediction.lower().strip()

        # Check if prediction starts with either "positive" or "negative"
        if prediction.startswith("positive"):
            return "positive", 1, retries
        elif prediction.startswith("negative"):
            return "negative", 0, retries

        # Try again with partial matches at the start
        first_word = prediction.split()[0] if prediction.split() else ""
        if first_word == "positive" or first_word.startswith("pos"):
            return "positive", 1, retries
        elif first_word == "negative" or first_word.startswith("neg"):
            return "negative", 0, retries

        # If this is our last retry, make a simple default choice
        if retries == max_retries:
            # Return the raw prediction and default to the opposite of the true label
            return prediction, -1, retries

        # Otherwise, create a stronger prompt and retry
        retries += 1

        # Create a stronger prompt - be very explicit about the first word
        system_prompt = (
            "You are a sentiment analysis expert. Given a movie review, classify the sentiment as either positive or negative. "
            "CRITICAL INSTRUCTION: Your response MUST START with EXACTLY the word 'positive' or the word 'negative'. "
            "The first word of your response must be either 'positive' or 'negative', representing the answer."
        )

        if shot_count == 0:
            prompt = f"{system_prompt}\n\nReview: {text}\nSentiment:"
        else:
            few_shot_examples = format_few_shot_examples(examples[:shot_count])
            prompt = f"{system_prompt}\n\n{few_shot_examples}Review: {text}\nSentiment:"

        inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

In [6]:
# Function to evaluate model
def evaluate_model(dataset, shots, num_samples=200):
    results = {
        "shots": shots,
        "accuracy": [],
        "f1_score": [],
        "positive_precision": [],
        "negative_precision": [],
        "positive_recall": [],
        "negative_recall": [],
        "avg_retries": [],
        # Add these new metrics
        "corrected_accuracy": [],
        "corrected_f1_score": []
    }

    # Sample a subset from the test set for evaluation
    print(f"Sampling {num_samples} examples from the test set...")
    test_sample = random.sample(list(dataset), num_samples)

    # Get examples from training set for few-shot prompting
    train_examples = prepare_examples(list(imdb["train"]), max(shots))
    print(f"Selected {max(shots)} examples from training set for few-shot prompting")

    # Save predictions for analysis
    all_predictions = []
    all_labels = []

    # Track retry statistics
    retry_stats = []

    for shot_count in shots:
        print(f"\nEvaluating with {shot_count}-shot prompting...")
        predictions = []
        true_labels = []
        # Lists to track corrected predictions (ignoring max retries)
        corrected_predictions = []
        corrected_true_labels = []

        for item in tqdm(test_sample):
            prompt = create_prompt(item["text"], train_examples, shot_count)
            prediction, pred_label, retry_count = predict_sentiment(prompt, item["text"], train_examples, shot_count)

            # Track retry statistics
            if retry_count > 0:
                retry_stats.append({
                    "shots": shot_count,
                    "retries": retry_count,
                    "text_preview": item["text"][:100] + "...",
                    "final_prediction": prediction
                })

            # For standard metrics (same as before)
            # If model still couldn't give a clear answer after retries, count it as wrong
            if pred_label == -1:
                # Assign a label opposite to the true label to ensure it's counted as wrong
                pred_label = 1 - item["label"]
                predictions.append(pred_label)
                true_labels.append(item["label"])
            else:
                predictions.append(pred_label)
                true_labels.append(item["label"])

                # For corrected metrics, only include if we got a valid prediction
                # (i.e., didn't need too many retries)
                corrected_predictions.append(pred_label)
                corrected_true_labels.append(item["label"])

            # Store for later analysis
            all_predictions.append({
                "shots": shot_count,
                "text": item["text"][:100] + "...",
                "true_label": "positive" if item["label"] == 1 else "negative",
                "predicted": prediction,
                "correct": pred_label == item["label"],
                "retries": retry_count,
                "exceeded_max_retries": pred_label == -1
            })

        # Calculate standard metrics
        accuracy = accuracy_score(true_labels, predictions)
        f1 = f1_score(true_labels, predictions, average='weighted')
        report = classification_report(true_labels, predictions, target_names=["negative", "positive"], output_dict=True)

        # Calculate corrected metrics
        corrected_accuracy = accuracy_score(corrected_true_labels, corrected_predictions) if corrected_true_labels else 0
        corrected_f1 = f1_score(corrected_true_labels, corrected_predictions, average='weighted') if corrected_true_labels else 0

        # Calculate average retries for this shot count
        retry_counts_for_shot = [r["retries"] for r in retry_stats if r["shots"] == shot_count]
        avg_retries = sum(retry_counts_for_shot) / len(test_sample) if retry_counts_for_shot else 0

        # Store all results
        results["accuracy"].append(accuracy)
        results["f1_score"].append(f1)
        results["positive_precision"].append(report["positive"]["precision"])
        results["negative_precision"].append(report["negative"]["precision"])
        results["positive_recall"].append(report["positive"]["recall"])
        results["negative_recall"].append(report["negative"]["recall"])
        results["avg_retries"].append(avg_retries)

        # Store corrected metrics
        results["corrected_accuracy"].append(corrected_accuracy)
        results["corrected_f1_score"].append(corrected_f1)

        # Include the corrected metrics in the output
        print(f"Accuracy with {shot_count}-shot: {accuracy:.4f}, F1: {f1:.4f} (avg retries: {avg_retries:.2f})")
        print(f"Corrected metrics (ignoring max retries): Accuracy: {corrected_accuracy:.4f}, F1: {corrected_f1:.4f}")
        print(f"Valid samples: {len(corrected_true_labels)}/{len(test_sample)} ({len(corrected_true_labels)/len(test_sample)*100:.1f}%)")

    # Save predictions for analysis
    pd.DataFrame(all_predictions).to_csv("sentiment_predictions.csv", index=False)

    # Save retry statistics
    if retry_stats:
        pd.DataFrame(retry_stats).to_csv("retry_statistics.csv", index=False)
        print(f"Total retries needed: {len(retry_stats)}")

    return results

# Define shot counts to evaluate
shots = [0, 1, 3, 5, 10, 20]

In [7]:
num_samples = 1000

In [8]:
# Run evaluation using train set for examples and test set for evaluation
print("Getting few-shot examples from training set...")
# Run evaluation
results = evaluate_model(imdb["test"], shots, num_samples)

# Create results dataframe
results_df = pd.DataFrame({
    "Shots": results["shots"],
    "Accuracy": results["accuracy"],
    "Corrected Accuracy": results["corrected_accuracy"],
    "F1 Score": results["f1_score"],
    "Corrected F1 Score": results["corrected_f1_score"],
    "Avg Retries": results["avg_retries"],
    "Positive Precision": results["positive_precision"],
    "Negative Precision": results["negative_precision"],
    "Positive Recall": results["positive_recall"],
    "Negative Recall": results["negative_recall"]
})

# Save results
results_df.to_csv("sentiment_analysis_results.csv", index=False)

Getting few-shot examples from training set...
Sampling 1000 examples from the test set...
Selected 20 examples from training set for few-shot prompting

Evaluating with 0-shot prompting...


100%|██████████| 1000/1000 [01:37<00:00, 10.22it/s]


Accuracy with 0-shot: 0.8770, F1: 0.8769 (avg retries: 0.09)
Corrected metrics (ignoring max retries): Accuracy: 0.9164, F1: 0.9164
Valid samples: 957/1000 (95.7%)

Evaluating with 1-shot prompting...


100%|██████████| 1000/1000 [01:57<00:00,  8.49it/s]


Accuracy with 1-shot: 0.8700, F1: 0.8696 (avg retries: 0.09)
Corrected metrics (ignoring max retries): Accuracy: 0.9081, F1: 0.9078
Valid samples: 958/1000 (95.8%)

Evaluating with 3-shot prompting...


100%|██████████| 1000/1000 [02:24<00:00,  6.91it/s]


Accuracy with 3-shot: 0.8320, F1: 0.8308 (avg retries: 0.09)
Corrected metrics (ignoring max retries): Accuracy: 0.8694, F1: 0.8684
Valid samples: 957/1000 (95.7%)

Evaluating with 5-shot prompting...


100%|██████████| 1000/1000 [03:29<00:00,  4.77it/s]


Accuracy with 5-shot: 0.8540, F1: 0.8535 (avg retries: 0.09)
Corrected metrics (ignoring max retries): Accuracy: 0.8924, F1: 0.8920
Valid samples: 957/1000 (95.7%)

Evaluating with 10-shot prompting...


100%|██████████| 1000/1000 [05:44<00:00,  2.90it/s]


Accuracy with 10-shot: 0.8780, F1: 0.8779 (avg retries: 0.09)
Corrected metrics (ignoring max retries): Accuracy: 0.9175, F1: 0.9174
Valid samples: 957/1000 (95.7%)

Evaluating with 20-shot prompting...


100%|██████████| 1000/1000 [31:27<00:00,  1.89s/it]

Accuracy with 20-shot: 0.0000, F1: 0.0000 (avg retries: 2.00)
Corrected metrics (ignoring max retries): Accuracy: 0.0000, F1: 0.0000
Valid samples: 0/1000 (0.0%)
Total retries needed: 1215





In [9]:
# Print results table
print("\nResults Summary:")
print(results_df.to_string(index=False))


Results Summary:
 Shots  Accuracy  Corrected Accuracy  F1 Score  Corrected F1 Score  Avg Retries  Positive Precision  Negative Precision  Positive Recall  Negative Recall
     0     0.877            0.916405  0.876923            0.916351        0.086            0.859048            0.896842            0.902            0.852
     1     0.870            0.908142  0.869620            0.907849        0.085            0.914798            0.833935            0.816            0.924
     3     0.832            0.869383  0.830806            0.868381        0.086            0.784247            0.899038            0.916            0.748
     5     0.854            0.892372  0.853507            0.891982        0.086            0.817204            0.900452            0.912            0.796
    10     0.878            0.917450  0.877875            0.917361        0.086            0.855263            0.903846            0.910            0.846
    20     0.000            0.000000  0.000000            

It seems that the 10-shot and 20-shot doesn't format it properly, but gives 'ositive' and 'egative' as answers (based on debugging statements that were removed to improve readability). For the sake of experimentation, let's just test out what its accuracy is if these can be taken as correct answers. Let's start with the 20-shot one

In [10]:
# Modified prediction function to allow partial matches like 'ositive' and 'egative'
def predict_sentiment_flexible(prompt, text, examples, shot_count=20, max_retries=2):
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

    retries = 0
    while retries <= max_retries:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False,
                temperature=0.1,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id
            )

        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        prediction = full_response[len(prompt):]

        # Clean up the prediction - strip whitespace and lowercase
        prediction = prediction.lower().strip()

        # Now checking for standard or partial matches
        if prediction.startswith("positive") or prediction.startswith("ositive"):
            return "positive", 1, retries
        elif prediction.startswith("negative") or prediction.startswith("egative"):
            return "negative", 0, retries

        first_word = prediction.split()[0] if prediction.split() else ""
        if first_word in ["positive", "ositive"] or first_word.startswith("pos") or first_word.startswith("osi"):
            return "positive", 1, retries
        elif first_word in ["negative", "egative"] or first_word.startswith("neg") or first_word.startswith("ega"):
            return "negative", 0, retries

        if retries == max_retries:
            return prediction, -1, retries

        # Otherwise, create a stronger prompt and retry
        retries += 1

        # Create a stronger prompt with more flexible matching instructions
        system_prompt = (
            "You are a sentiment analysis expert. Given a movie review, classify the sentiment as either positive or negative. "
            "CRITICAL INSTRUCTION: Your response MUST START with EITHER 'positive' OR 'negative' (or just 'ositive'/'egative'). "
            "Only output the sentiment classification without any explanation."
        )

        if shot_count == 0:
            prompt = f"{system_prompt}\n\nReview: {text}\nSentiment:"
        else:
            few_shot_examples = format_few_shot_examples(examples[:shot_count])
            prompt = f"{system_prompt}\n\n{few_shot_examples}Review: {text}\nSentiment:"

        inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

# Run 20-shot evaluation using the flexible prediction function
def run_20_shot_evaluation(num_samples=50):
    print(f"Running 20-shot evaluation on {num_samples} test samples with flexible matching...")

    # Get examples from training set for 20-shot prompting
    train_examples = prepare_examples(list(imdb["train"]), 20)

    # Sample from test set
    test_sample = random.sample(list(imdb["test"]), num_samples)

    # Track results
    predictions = []
    true_labels = []
    # Add corrected predictions tracking
    corrected_predictions = []
    corrected_true_labels = []
    details = []
    retry_counts = []

    for item in tqdm(test_sample):
        prompt = create_prompt(item["text"], train_examples, 20)
        prediction, pred_label, retry_count = predict_sentiment_flexible(prompt, item["text"], train_examples, 20)

        # If model still couldn't give a clear answer after retries, count it as wrong
        if pred_label == -1:
            # Assign a label opposite to the true label to ensure it's counted as wrong
            pred_label = 1 - item["label"]
            predictions.append(pred_label)
            true_labels.append(item["label"])
        else:
            predictions.append(pred_label)
            true_labels.append(item["label"])

            # For corrected metrics, only include valid predictions
            corrected_predictions.append(pred_label)
            corrected_true_labels.append(item["label"])

        retry_counts.append(retry_count)

        # Store details for analysis
        details.append({
            "text_preview": item["text"][:100] + "...",
            "true_label": "positive" if item["label"] == 1 else "negative",
            "predicted": prediction,
            "correct": pred_label == item["label"],
            "retries": retry_count,
            "exceeded_max_retries": pred_label == -1
        })

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')

    # Calculate corrected metrics
    corrected_accuracy = accuracy_score(corrected_true_labels, corrected_predictions) if corrected_true_labels else 0
    corrected_f1 = f1_score(corrected_true_labels, corrected_predictions, average='weighted') if corrected_true_labels else 0

    report = classification_report(true_labels, predictions, target_names=["negative", "positive"], output_dict=True)
    avg_retries = sum(retry_counts) / len(retry_counts)

    print(f"\n--- 20-Shot Results with Flexible Matching ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Corrected accuracy: {corrected_accuracy:.4f}")
    print(f"Corrected F1 score: {corrected_f1:.4f}")
    print(f"Valid samples: {len(corrected_true_labels)}/{len(test_sample)} ({len(corrected_true_labels)/len(test_sample)*100:.1f}%)")
    print(f"Average retries: {avg_retries:.2f}")
    print(f"Positive precision: {report['positive']['precision']:.4f}")
    print(f"Negative precision: {report['negative']['precision']:.4f}")
    print(f"Positive recall: {report['positive']['recall']:.4f}")
    print(f"Negative recall: {report['negative']['recall']:.4f}")

    # Return results dict and details for further analysis
    results = {
        "accuracy": accuracy,
        "f1_score": f1,
        "corrected_accuracy": corrected_accuracy,
        "corrected_f1_score": corrected_f1,
        "avg_retries": avg_retries,
        "report": report,
        "details": details
    }

    # Save detailed results to CSV
    pd.DataFrame(details).to_csv("flexible_20shot_results.csv", index=False)

    return results

# Execute the 20-shot evaluation
results_20shot = run_20_shot_evaluation(num_samples=1000)

Running 20-shot evaluation on 1000 test samples with flexible matching...


100%|██████████| 1000/1000 [12:29<00:00,  1.33it/s]


--- 20-Shot Results with Flexible Matching ---
Accuracy: 0.7580
F1 Score: 0.7551
Corrected accuracy: 0.8376
Corrected F1 score: 0.8350
Valid samples: 905/1000 (90.5%)
Average retries: 0.19
Positive precision: 0.8560
Negative precision: 0.6956
Positive recall: 0.6416
Negative recall: 0.8836





Now, lets try with the 10-shot one

In [11]:
# Modified prediction function to allow partial matches like 'ositive' and 'egative'
def predict_sentiment_flexible(prompt, text, examples, shot_count=10, max_retries=2):
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

    retries = 0
    while retries <= max_retries:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False,
                temperature=0.1,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id
            )

        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        prediction = full_response[len(prompt):]

        # Clean up the prediction - strip whitespace and lowercase
        prediction = prediction.lower().strip()

        # Now checking for standard or partial matches
        if prediction.startswith("positive") or prediction.startswith("ositive"):
            return "positive", 1, retries
        elif prediction.startswith("negative") or prediction.startswith("egative"):
            return "negative", 0, retries

        first_word = prediction.split()[0] if prediction.split() else ""
        if first_word in ["positive", "ositive"] or first_word.startswith("pos") or first_word.startswith("osi"):
            return "positive", 1, retries
        elif first_word in ["negative", "egative"] or first_word.startswith("neg") or first_word.startswith("ega"):
            return "negative", 0, retries

        if retries == max_retries:
            return prediction, -1, retries

        # Otherwise, create a stronger prompt and retry
        retries += 1

        # Create a stronger prompt with more flexible matching instructions
        system_prompt = (
            "You are a sentiment analysis expert. Given a movie review, classify the sentiment as either positive or negative. "
            "CRITICAL INSTRUCTION: Your response MUST START with EITHER 'positive' OR 'negative' (or just 'ositive'/'egative'). "
            "Only output the sentiment classification without any explanation."
        )

        if shot_count == 0:
            prompt = f"{system_prompt}\n\nReview: {text}\nSentiment:"
        else:
            few_shot_examples = format_few_shot_examples(examples[:shot_count])
            prompt = f"{system_prompt}\n\n{few_shot_examples}Review: {text}\nSentiment:"

        inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

# Run 10-shot evaluation using the flexible prediction function
def run_10_shot_evaluation(num_samples=50):
    print(f"Running 10-shot evaluation on {num_samples} test samples with flexible matching...")

    # Get examples from training set for 10-shot prompting
    train_examples = prepare_examples(list(imdb["train"]), 10)

    # Sample from test set
    test_sample = random.sample(list(imdb["test"]), num_samples)

    # Track results
    predictions = []
    true_labels = []
    # Add corrected predictions tracking
    corrected_predictions = []
    corrected_true_labels = []
    details = []
    retry_counts = []

    for item in tqdm(test_sample):
        prompt = create_prompt(item["text"], train_examples, 10)
        prediction, pred_label, retry_count = predict_sentiment_flexible(prompt, item["text"], train_examples, 10)

        # If model still couldn't give a clear answer after retries, count it as wrong
        if pred_label == -1:
            # Assign a label opposite to the true label to ensure it's counted as wrong
            pred_label = 1 - item["label"]
            predictions.append(pred_label)
            true_labels.append(item["label"])
        else:
            predictions.append(pred_label)
            true_labels.append(item["label"])

            # For corrected metrics, only include valid predictions
            corrected_predictions.append(pred_label)
            corrected_true_labels.append(item["label"])

        retry_counts.append(retry_count)

        # Store details for analysis
        details.append({
            "text_preview": item["text"][:100] + "...",
            "true_label": "positive" if item["label"] == 1 else "negative",
            "predicted": prediction,
            "correct": pred_label == item["label"],
            "retries": retry_count,
            "exceeded_max_retries": pred_label == -1
        })

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')

    # Calculate corrected metrics
    corrected_accuracy = accuracy_score(corrected_true_labels, corrected_predictions) if corrected_true_labels else 0
    corrected_f1 = f1_score(corrected_true_labels, corrected_predictions, average='weighted') if corrected_true_labels else 0

    report = classification_report(true_labels, predictions, target_names=["negative", "positive"], output_dict=True)
    avg_retries = sum(retry_counts) / len(retry_counts)

    print(f"\n--- 10-Shot Results with Flexible Matching ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Corrected accuracy: {corrected_accuracy:.4f}")
    print(f"Corrected F1 score: {corrected_f1:.4f}")
    print(f"Valid samples: {len(corrected_true_labels)}/{len(test_sample)} ({len(corrected_true_labels)/len(test_sample)*100:.1f}%)")
    print(f"Average retries: {avg_retries:.2f}")
    print(f"Positive precision: {report['positive']['precision']:.4f}")
    print(f"Negative precision: {report['negative']['precision']:.4f}")
    print(f"Positive recall: {report['positive']['recall']:.4f}")
    print(f"Negative recall: {report['negative']['recall']:.4f}")

    # Return results dict and details for further analysis
    results = {
        "accuracy": accuracy,
        "f1_score": f1,
        "corrected_accuracy": corrected_accuracy,
        "corrected_f1_score": corrected_f1,
        "avg_retries": avg_retries,
        "report": report,
        "details": details
    }

    # Save detailed results to CSV
    pd.DataFrame(details).to_csv("flexible_10shot_results.csv", index=False)

    return results

# Execute the 10-shot evaluation
results_10shot = run_10_shot_evaluation(num_samples=1000)

Running 10-shot evaluation on 1000 test samples with flexible matching...


100%|██████████| 1000/1000 [06:36<00:00,  2.52it/s]


--- 10-Shot Results with Flexible Matching ---
Accuracy: 0.8390
F1 Score: 0.8388
Corrected accuracy: 0.9374
Corrected F1 score: 0.9374
Valid samples: 895/1000 (89.5%)
Average retries: 0.21
Positive precision: 0.8241
Negative precision: 0.8565
Positive recall: 0.8708
Negative recall: 0.8057



