In [2]:
from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from datasets import load_dataset
import csv
import os
from IPython.display import clear_output
from pprint import pprint


# Load the WinoGrande XL configuration from the Hugging Face Hub.
# Without the flag, `datasets` emits a notice asking for `trust_remote_code=True`
# and states it will become mandatory for this dataset in the next major release.
# NOTE(review): the flag allows the dataset's loading code to run locally — only
# keep it for repositories you trust.
dataset = load_dataset("winogrande", "winogrande_xl", trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
# Show the validation split's summary (feature names and row count).
dataset['validation']

Dataset({
    features: ['sentence', 'option1', 'option2', 'answer'],
    num_rows: 1267
})

In [4]:
# Peek at the first validation record to see the raw field values.
dataset['validation'][0]

{'sentence': 'Sarah was a much better surgeon than Maria so _ always got the easier cases.',
 'option1': 'Sarah',
 'option2': 'Maria',
 'answer': '2'}

In [5]:
from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from datasets import load_dataset
import csv
import os
from IPython.display import clear_output
from pprint import pprint

# Initialize the Llama 3 model using LangChain and Ollama
# NOTE(review): no sampling parameters (e.g. temperature or seed) are passed here,
# so repeated runs may not reproduce the same predictions — confirm whether
# deterministic decoding is wanted for this benchmark.
llm = Ollama(model="llama3")

# Function to format the Winogrande example
def format_example(example):
    """Unpack one Winogrande record into ``(sentence, choices, correct_index)``.

    ``choices`` is ``[option1, option2]``. ``correct_index`` is 0 when the
    record's ``answer`` equals the string ``"1"`` and 1 for any other value.
    """
    text = example['sentence']
    options = [example['option1'], example['option2']]
    # bool -> int: False (answer is "1") becomes 0, True becomes 1.
    gold_position = int(example['answer'] != "1")
    return text, options, gold_position

# Function to save predictions
def save_predictions(file_path, predictions):
    """Overwrite ``file_path`` with a CSV header row followed by ``predictions``.

    Each item of ``predictions`` is written as one CSV row, in iteration order.
    """
    header = ["Index", "Predicted", "Correct", "Prompt", "Response"]
    with open(file_path, 'w', newline='') as sink:
        out = csv.writer(sink)
        out.writerow(header)
        for record in predictions:
            out.writerow(record)

# Function to load predictions
def load_predictions(file_path):
    """Read previously saved predictions from a CSV file.

    Args:
        file_path: Path of the CSV file; its first row is treated as a header.

    Returns:
        A list of ``(index, predicted, correct)`` integer tuples taken from the
        first three columns of every data row. Returns ``[]`` when the file
        does not exist or contains no rows at all.

    Raises:
        ValueError: If one of the first three fields of a row is not an integer.
        IndexError: If a non-blank row has fewer than three fields.
    """
    if not os.path.exists(file_path):
        return []
    # newline='' lets the csv module handle line endings itself, which matters
    # for quoted fields that contain embedded newlines.
    with open(file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        # A zero-byte file has no header; the default avoids StopIteration.
        if next(reader, None) is None:
            return []
        # csv.reader yields [] for blank lines; skip those instead of failing.
        return [
            (int(row[0]), int(row[1]), int(row[2]))
            for row in reader
            if row
        ]

# Hand-written demonstrations used to build the 3-shot prompt prefix.
# Each entry uses the same keys as a dataset record; "answer" is "1" or "2".
# NOTE(review): these look like they may be taken from the validation split that
# is scored later (the saved predictions table shows the same option pairs and
# labels at indices 1-3). If so, the answers leak into the evaluation — confirm,
# and consider drawing demonstrations from the training split instead.
example_prompts = [
    {
        "sentence": "Sarah was a much better surgeon than Maria so _ always got the harder cases.",
        "option1": "Sarah",
        "option2": "Maria",
        "answer": "1"
    },
    {
        "sentence": "They were worried the wine would ruin the bed and the blanket, but the _ wasn't ruined.",
        "option1": "blanket",
        "option2": "bed",
        "answer": "2"
    },
    {
        "sentence": "Terry tried to bake the eggplant in the toaster oven but the _ was too big.",
        "option1": "eggplant",
        "option2": "toaster",
        "answer": "1"
    }
]


# Create the initial 3-shot prompt
# Task description that opens every prompt.
init_prompt_starter = "The following are pairs of Winograd Schema which are a fill-in-a-blank task with binary options. The goal is to choose the right option for a given sentence which requires commonsense reasoning. The pairs are in the form of a statement S, choices [A] and [B], and an Answer."

# Render each demonstration as "S / [A] / [B] / Answer" and glue them after the
# task description; every demonstration starts with a newline and ends with one.
shot_blocks = []
for shot in example_prompts:
    first, second = shot["option1"], shot["option2"]
    gold = f"[A] {first}" if shot["answer"] == "1" else f"[B] {second}"
    shot_blocks.append(
        f"\nS: {shot['sentence']}\n" + f"[A] {first}\n[B] {second}\nAnswer: {gold}\n"
    )
init_prompt = init_prompt_starter + "".join(shot_blocks)

pprint(init_prompt)

# Function to evaluate the model
def _append_prediction(file_path, row):
    """Append one result row to the CSV, writing the header first if the file is new or empty."""
    needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
    with open(file_path, 'a', newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(["Index", "Predicted", "Correct", "Prompt", "Response"])
        writer.writerow(row)


def evaluate_model(dataset, file_path='winogrande_predictions_3shot.csv'):
    """Score the model on ``dataset['validation']`` with resumable CSV logging.

    Examples whose index is already present in ``file_path`` are skipped, so an
    interrupted run can be continued by calling the function again.

    Args:
        dataset: Object whose ``'validation'`` entry iterates over records
            accepted by ``format_example``.
        file_path: CSV file used both to resume from and to log new results.

    Returns:
        Fraction of scored examples (previously saved plus newly evaluated)
        whose predicted index equals the gold index; ``0.0`` if none were scored.
    """
    # Load existing predictions if they exist
    predictions = load_predictions(file_path)
    completed_indices = {idx for idx, _, _ in predictions}
    correct = sum(1 for _, pred, label in predictions if pred == label)
    total = len(predictions)
    labels_list = ['A', 'B']

    for idx, example in enumerate(dataset['validation']):
        if idx in completed_indices:
            continue  # Skip already processed examples

        sentence, choices, correct_index = format_example(example)
        clear_output(wait=True)
        prompt = init_prompt
        prompt += "\nYour task is to resolve the ambiguity in the following sentence.\n"
        prompt += f"S: {sentence}\n"

        for i, choice in enumerate(choices):
            prompt += f"[{labels_list[i]}] {choice}\n"
        prompt += "\nRespond only with the correct choice."
        response = llm.invoke(prompt)
        predicted_choice = extract_label(response, labels_list)

        # Append rather than rewrite the file: the rows loaded on resume only
        # carry (index, predicted, correct), so writing them back would erase
        # the Prompt/Response columns of every previously saved example.
        _append_prediction(file_path, (idx, predicted_choice, correct_index, prompt, response))

        status = "INCORRECT"
        if predicted_choice == correct_index:
            correct += 1
            status = "CORRECT"
        total += 1
        pct = (correct / total) * 100

        print(f"Iteration: {idx + 1} {status}")
        print(f"Correct: {pct:.2f}%")
        print(f"Response: {response}")
        pprint(prompt)

    # Nothing scored (empty split and no saved rows): avoid dividing by zero.
    if total == 0:
        return 0.0
    return correct / total

# Function to extract the label from the model's response
def extract_label(response, labels_list):
    """Map a model response to the index of the label it names first.

    Args:
        response: Text returned by the model.
        labels_list: Label letters; each is searched for as ``"[<label>]"``.

    Returns:
        Index into ``labels_list`` of the bracketed label that occurs earliest
        in ``response``, or ``-1`` when no bracketed label is present.
    """
    best_index = -1  # Indicates an error or unrecognized format
    best_position = None
    for i, label in enumerate(labels_list):
        position = response.find(f"[{label}]")
        # Prefer the earliest mention: an answer such as "[B] ..., not [A] ..."
        # must not be scored as A merely because A is checked first.
        if position != -1 and (best_position is None or position < best_position):
            best_index, best_position = i, position
    return best_index

# Evaluate the model and print the accuracy
# evaluate_model iterates over dataset['validation'], so the number reported
# here is validation accuracy, not test accuracy.
accuracy = evaluate_model(dataset)
print(f"Winogrande Validation Accuracy: {accuracy * 100:.2f}%")


('The following are pairs of Winograd Schema which are a fill-in-a-blank task '
 'with binary options. The goal is to choose the right option for a given '
 'sentence which requires commonsense reasoning. The pairs are in the form of '
 'a statement S, choices [A] and [B], and an Answer.\n'
 'S: Sarah was a much better surgeon than Maria so _ always got the harder '
 'cases.\n'
 '[A] Sarah\n'
 '[B] Maria\n'
 'Answer: [A] Sarah\n'
 '\n'
 'S: They were worried the wine would ruin the bed and the blanket, but the _ '
 "wasn't ruined.\n"
 '[A] blanket\n'
 '[B] bed\n'
 'Answer: [B] bed\n'
 '\n'
 'S: Terry tried to bake the eggplant in the toaster oven but the _ was too '
 'big.\n'
 '[A] eggplant\n'
 '[B] toaster\n'
 'Answer: [A] eggplant\n')
Winogrande Test Accuracy: 53.51%


In [1]:
import pandas as pd 

# Load a previously saved predictions file and display it.
# NOTE(review): the code that produced 'winogrande_predictions.csv' is not in the
# visible cells — presumably an earlier prompt variant; confirm its provenance.
df = pd.read_csv('winogrande_predictions.csv')
df

Unnamed: 0,Index,Predicted,Correct,Prompt,Response
0,0,0,1,Your task is to resolve the ambiguous pronoun ...,The correct answer is:\n\n[A] Sarah\n\nThis is...
1,1,0,0,Your task is to resolve the ambiguous pronoun ...,The correct answer is [A] Sarah.\n\nThe reason...
2,2,0,1,Your task is to resolve the ambiguous pronoun ...,I'd choose [A] blanket.\n\nThe sentence sugges...
3,3,0,0,Your task is to resolve the ambiguous pronoun ...,The correct answer is [A] eggplant.\n\nIn this...
4,4,0,0,Your task is to resolve the ambiguous pronoun ...,The correct answer is [A] Jeffrey.\n\nIn this ...
...,...,...,...,...,...
1262,1262,1,0,Your task is to resolve the ambiguous pronoun ...,The correct answer is [B] Neil.\n\nThe sentenc...
1263,1263,0,0,Your task is to resolve the ambiguous pronoun ...,The correct answer is [A] Joel.\n\nExplanation...
1264,1264,1,0,Your task is to resolve the ambiguous pronoun ...,The correct answer is [B] Lindsey.\n\nThe pron...
1265,1265,0,0,Your task is to resolve the ambiguous pronoun ...,"I would choose [A] aquarium.\n\nThe pronoun ""w..."


In [2]:
from pprint import pprint 
# Pretty-print the full prompt text stored for the first row.
pprint(df.iloc[0]['Prompt'])

('Your task is to resolve the ambiguous pronoun in the following sentence.\n'
 'Sentence: Sarah was a much better surgeon than Maria so _ always got the '
 'easier cases.\n'
 'Respond with one of the following choices:\n'
 '[A] Sarah\n'
 '[B] Maria\n'
 'Which choice resolves the ambiguity?')


In [5]:
# Percentage of rows whose predicted index equals the gold index.
is_hit = df['Predicted'] == df['Correct']
accuracy = is_hit.sum() / len(df) * 100
accuracy

54.69613259668509

In [9]:
# Share of rows per predicted label (counts divided by the number of rows).
df['Predicted'].value_counts() / df.shape[0]

Predicted
0    0.598264
1    0.401736
Name: count, dtype: float64

In [10]:
# Share of rows per gold label, for comparison with the predicted-label shares.
df['Correct'].value_counts() / df.shape[0]

Correct
1    0.504341
0    0.495659
Name: count, dtype: float64

In [16]:
import pandas as pd

# Load the CSV files into DataFrames
df_single_shot = pd.read_csv('winogrande_predictions.csv')
df_three_shot = pd.read_csv('winogrande_predictions_3shot.csv')

# Calculate accuracy for both approaches (percentage of rows where the
# predicted index equals the gold index)
accuracy_single_shot = (df_single_shot['Predicted'] == df_single_shot['Correct']).mean() * 100
accuracy_three_shot = (df_three_shot['Predicted'] == df_three_shot['Correct']).mean() * 100

# Calculate the length of each response.
# An empty CSV field is loaded by read_csv as NaN, and len(NaN) raises
# TypeError, so missing responses are counted as length 0 instead.
df_single_shot['Response_Length'] = df_single_shot['Response'].fillna('').str.len()
df_three_shot['Response_Length'] = df_three_shot['Response'].fillna('').str.len()

# Calculate overall average and standard deviation of response lengths for both approaches
avg_length_single_shot = df_single_shot['Response_Length'].mean()
std_length_single_shot = df_single_shot['Response_Length'].std()

avg_length_three_shot = df_three_shot['Response_Length'].mean()
std_length_three_shot = df_three_shot['Response_Length'].std()
# Calculate statistics for correct and incorrect responses for both approaches
def calculate_length_stats(df):
    """Summarise ``Response_Length`` separately for correct and incorrect rows.

    A row is correct when its ``Predicted`` value equals its ``Correct`` value.
    Returns a dict holding the mean and standard deviation of the lengths for
    each of the two groups.
    """
    hits = df.loc[df['Predicted'] == df['Correct'], 'Response_Length']
    misses = df.loc[df['Predicted'] != df['Correct'], 'Response_Length']
    return {
        'avg_length_correct': hits.mean(),
        'std_length_correct': hits.std(),
        'avg_length_incorrect': misses.mean(),
        'std_length_incorrect': misses.std(),
    }

# Per-group (correct / incorrect) response-length statistics for each run.
stats_single_shot = calculate_length_stats(df_single_shot)
stats_three_shot = calculate_length_stats(df_three_shot)

# Compare the distribution of predicted labels for both approaches
# (normalize=True yields proportions rather than raw counts)
distribution_single_shot = df_single_shot['Predicted'].value_counts(normalize=True)
distribution_three_shot = df_three_shot['Predicted'].value_counts(normalize=True)

# Print the results
# Note: the "single_shot" variables are reported under the "Zero-Shot" label.
print(f"Zero-Shot Accuracy: {accuracy_single_shot:.2f}%")
print(f"Three-Shot Accuracy: {accuracy_three_shot:.2f}%")

print(f"Zero-Shot Overall Response Length: {avg_length_single_shot:.2f} (std: {std_length_single_shot:.2f})")
print(f"Three-Shot Overall Response Length: {avg_length_three_shot:.2f} (std: {std_length_three_shot:.2f})")

print(f"Zero-Shot Correct Response Length: {stats_single_shot['avg_length_correct']:.2f} (std: {stats_single_shot['std_length_correct']:.2f})")
print(f"Zero-Shot Incorrect Response Length: {stats_single_shot['avg_length_incorrect']:.2f} (std: {stats_single_shot['std_length_incorrect']:.2f})")

print(f"Three-Shot Correct Response Length: {stats_three_shot['avg_length_correct']:.2f} (std: {stats_three_shot['std_length_correct']:.2f})")
print(f"Three-Shot Incorrect Response Length: {stats_three_shot['avg_length_incorrect']:.2f} (std: {stats_three_shot['std_length_incorrect']:.2f})")

print("Zero-Shot Predicted Distribution:")
print(distribution_single_shot)

print("\nThree-Shot Predicted Distribution:")
print(distribution_three_shot)


Zero-Shot Accuracy: 54.70%
Three-Shot Accuracy: 53.51%
Zero-Shot Overall Response Length: 342.97 (std: 118.78)
Three-Shot Overall Response Length: 9.89 (std: 1.96)
Zero-Shot Correct Response Length: 340.13 (std: 115.25)
Zero-Shot Incorrect Response Length: 346.41 (std: 122.91)
Three-Shot Correct Response Length: 9.89 (std: 2.00)
Three-Shot Incorrect Response Length: 9.90 (std: 1.92)
Zero-Shot Predicted Distribution:
Predicted
0    0.598264
1    0.401736
Name: proportion, dtype: float64

Three-Shot Predicted Distribution:
Predicted
 0    0.507498
 1    0.483031
-1    0.009471
Name: proportion, dtype: float64


In [14]:
# Raw prompt string stored in the first row of df_single_shot.
df_single_shot.iloc[0]['Prompt']

'Your task is to resolve the ambiguous pronoun in the following sentence.\nSentence: Sarah was a much better surgeon than Maria so _ always got the easier cases.\nRespond with one of the following choices:\n[A] Sarah\n[B] Maria\nWhich choice resolves the ambiguity?'

In [37]:
def display_response(i):
    """Print the prompt, response, predicted label and gold label of one row.

    Args:
        i: Positional row number in the module-level ``df_three_shot`` frame.
    """
    row = df_three_shot.iloc[i]
    answers = ['[A]', '[B]']

    def as_label(code):
        # -1 is the value extract_label returns for an unrecognized response;
        # plain answers[-1] would silently display it as '[B]'.
        code = int(code)
        return answers[code] if 0 <= code < len(answers) else '[unrecognized]'

    prompt = row['Prompt']
    response = row['Response']
    print('Prompt: ' + prompt)
    print('Response: ' + response)
    # The predicted label was computed but never shown; show it beside the gold one.
    print('Predicted: ' + as_label(row['Predicted']))
    print('Correct: ' + as_label(row['Correct']))

# Spot-check a single example (row 75) from the three-shot run.
display_response(75)

Prompt: The following are pairs of Winograd Schema which are a fill-in-a-blank task with binary options. The goal is to choose the right option for a given sentence which requires commonsense reasoning. The pairs are in the form of a statement S, choices [A] and [B], and an Answer.
S: Sarah was a much better surgeon than Maria so _ always got the harder cases.
[A] Sarah
[B] Maria
Answer: [A] Sarah

S: They were worried the wine would ruin the bed and the blanket, but the _ wasn't ruined.
[A] blanket
[B] bed
Answer: [B] bed

S: Terry tried to bake the eggplant in the toaster oven but the _ was too big.
[A] eggplant
[B] toaster
Answer: [A] eggplant

Your task is to resolve the ambiguity in the following sentence.
S: The teen found the new hat was no substitute for his cool shirt.  The _ was just cool to wear to school.
[A] hat
[B] shirt

Respond only with the correct choice.
Response: [A] hat
Correct: [B]
