In [1]:
from datasets import load_dataset
import pandas as pd
import csv
import os
from langchain_community.llms import Ollama
from IPython.display import clear_output
from pprint import pprint
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np

# Load the SQuAD dataset
squad_dataset = load_dataset("squad")

# Convert to DataFrame for easier manipulation
train_df = pd.DataFrame(squad_dataset['train'])
test_df = pd.DataFrame(squad_dataset['validation'])

# Initialize the LLaMA 3 model using LangChain and Ollama
llm = Ollama(model="llama3")

In [3]:
train_df

Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...
...,...,...,...,...,...
87594,5735d259012e2f140011a09d,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,"{'text': ['Oregon'], 'answer_start': [229]}"
87595,5735d259012e2f140011a09e,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,"{'text': ['Rangoon'], 'answer_start': [414]}"
87596,5735d259012e2f140011a09f,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,"{'text': ['Minsk'], 'answer_start': [476]}"
87597,5735d259012e2f140011a0a0,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,"{'text': ['1975'], 'answer_start': [199]}"


In [45]:
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    


# Function to format the SQuAD example for question answering
def format_example(example):
    context = example['context']
    question = example['question']
    answer = example['answers']['text'][0]  # Use the first answer
    return context, question, answer

# Function to save predictions
def save_predictions(file_path, predictions):
    with open(file_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Index", "Predicted", "Correct", "Prompt", "Response"])
        writer.writerows(predictions)

# Function to load predictions
def load_predictions(file_path):
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            reader = csv.reader(file)
            next(reader)  # Skip header
            return [(int(row[0]), row[1], row[2], row[3], row[4]) for row in reader]
    return []

# Function to evaluate the model
def evaluate_model(dataset, file_path='squad_predictions.csv'):
    # Load existing predictions if they exist
    predictions = load_predictions(file_path)
    completed_indices = {idx for idx, _, _, _, _ in predictions}
    correct = sum(1 for _, pred, label, _, _ in predictions if pred.lower() == label.lower())
    total = len(predictions)

    for idx, example in dataset.iterrows():
        if idx in completed_indices:
            continue  # Skip already processed examples

        context, question, label = format_example(example)
        clear_output(wait=True)
        prompt = f"You are posed with a question answering task. You are given a context containing relevant information and a question about the content. The answer is contained within the context. Answer the question as concisely as possible. Do not add any extra context.\n\nContext: {context}\n\nQuestion: {question}"
        response = llm.invoke(prompt).strip()
        predicted_answer = response.strip()

        # Save the prediction
        predictions.append((idx, predicted_answer, label, prompt, response))
        save_predictions(file_path, predictions)

        rouge_score = scorer.score(predicted_answer.lower(), label.lower())['rougeL'].fmeasure

        if rouge_score > 0.5:
            print('CORRECT')
            correct += 1
        elif label.lower() in predicted_answer.lower():
            print('...CONTAINING LABEL...')
            correct += 1
        total += 1
        pct = (correct / total) * 100
        pprint(prompt)
        
        print(f"Iteration: {idx + 1}")
        print(f"Correct: {pct:.2f}%")
        print(f"Response: {response}")

    accuracy = correct / total
    return accuracy

# Evaluate the model and print the accuracy
accuracy = evaluate_model(test_df)
print(f"SQuAD Validation Accuracy: {accuracy * 100:.2f}%")

...CONTAINING LABEL...
('You are posed with a question answering task. You are given a context '
 'containing relevant information and a question about the content. The answer '
 'is contained within the context. Answer the question as concisely as '
 'possible. Do not add any extra context.\n'
 '\n'
 'Context: The pound-force has a metric counterpart, less commonly used than '
 'the newton: the kilogram-force (kgf) (sometimes kilopond), is the force '
 'exerted by standard gravity on one kilogram of mass. The kilogram-force '
 'leads to an alternate, but rarely used unit of mass: the metric slug '
 '(sometimes mug or hyl) is that mass that accelerates at 1 m·s−2 when '
 'subjected to a force of 1 kgf. The kilogram-force is not a part of the '
 'modern SI system, and is generally deprecated; however it still sees use for '
 'some purposes as expressing aircraft weight, jet thrust, bicycle spoke '
 'tension, torque wrench settings and engine output torque. Other arcane units '
 'of forc

In [23]:
def calculate_f1(predicted, label):
    pred_tokens = predicted.split()
    label_tokens = label.split()
    
    common = set(pred_tokens) & set(label_tokens)
    if len(common) == 0:
        return 0
    
    prec = len(common) / len(pred_tokens)
    rec = len(common) / len(label_tokens)
    
    if prec + rec == 0:
        return 0
    
    return 2 * (prec * rec) / (prec + rec)

predicted = "The Tower of Babel"
label = "tower of babel"
calculate_f1(predicted, label)

0.28571428571428575

In [24]:
!pip install rouge-score nltk


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting absl-py (from rouge-score)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=655a9ba19682e1b92a30f33945c5b29fbd6ae6ddaf03603c306f81a7a1dd38cf
  Stored in directory: /Users/vince/Library/Caches/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: absl-py, rouge-score
Successfully installed absl-py-2.1.0 rouge-score-0.1.2


In [41]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_scores(predicted, label):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_score = scorer.score(label, predicted)['rougeL'].fmeasure
    
    smoothing_function = SmoothingFunction().method1
    bleu_score = sentence_bleu([label.split()], predicted.split(), smoothing_function=smoothing_function)
    
    return rouge_score, bleu_score

predicted = "The answer is: The Tower of Babel"
label = "The Tower of Babel"
calculate_scores(predicted, label)

(0.7272727272727273, 0.41113361690051975)

In [20]:
pprint(train_df.iloc[0].answers)

{'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}


In [21]:
train_df.iloc[0].context[515:]

'Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [1]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd

df = pd.read_csv('squad_predictions.csv')
df



Unnamed: 0,Index,Predicted,Correct,Prompt,Response
0,0,The Denver Broncos.,Denver Broncos,You are posed with a question answering task. ...,The Denver Broncos.
1,1,The Carolina Panthers.,Carolina Panthers,You are posed with a question answering task. ...,The Carolina Panthers.
2,2,Levi's Stadium in the San Francisco Bay Area a...,"Santa Clara, California",You are posed with a question answering task. ...,Levi's Stadium in the San Francisco Bay Area a...
3,3,The Denver Broncos.,Denver Broncos,You are posed with a question answering task. ...,The Denver Broncos.
4,4,Gold.,gold,You are posed with a question answering task. ...,Gold.
...,...,...,...,...,...
10565,10565,The kilogram-force (kgf).,kilogram-force,You are posed with a question answering task. ...,The kilogram-force (kgf).
10566,10566,The kilogram-force is sometimes referred to as...,kilopond,You are posed with a question answering task. ...,The kilogram-force is sometimes referred to as...
10567,10567,The metric slug.,slug,You are posed with a question answering task. ...,The metric slug.
10568,10568,Kip.,kip,You are posed with a question answering task. ...,Kip.


In [3]:
import re
def normalize_text(text):
    """Normalize text by removing non-alphanumeric characters and converting to lowercase."""
    return re.sub(r'\W+', '', text.lower())

def calculate_scores_and_flags(row):
    correct_answer = str(row['Correct'])
    predicted_answer = str(row['Predicted'])
    
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_score = scorer.score(correct_answer.lower(), predicted_answer.lower())['rougeL'].fmeasure
    
    smoothing_function = SmoothingFunction().method1
    bleu_score = sentence_bleu([correct_answer.split()], predicted_answer.split(), smoothing_function=smoothing_function)
    
    normalized_correct = normalize_text(correct_answer)
    normalized_predicted = normalize_text(predicted_answer)
    containing_flag = int(normalized_correct in normalized_predicted)
    
    return pd.Series([rouge_score, bleu_score, containing_flag])

# Apply the function to each row to create new columns
df[['Rouge_Score', 'Bleu_Score', 'Containing']] = df.apply(calculate_scores_and_flags, axis=1)

# Display the updated DataFrame
df

Unnamed: 0,Index,Predicted,Correct,Prompt,Response,Rouge_Score,Bleu_Score,Containing
0,0,The Denver Broncos.,Denver Broncos,You are posed with a question answering task. ...,The Denver Broncos.,0.800000,0.113622,1.0
1,1,The Carolina Panthers.,Carolina Panthers,You are posed with a question answering task. ...,The Carolina Panthers.,0.800000,0.113622,1.0
2,2,Levi's Stadium in the San Francisco Bay Area a...,"Santa Clara, California",You are posed with a question answering task. ...,Levi's Stadium in the San Francisco Bay Area a...,0.375000,0.036021,1.0
3,3,The Denver Broncos.,Denver Broncos,You are posed with a question answering task. ...,The Denver Broncos.,0.800000,0.113622,1.0
4,4,Gold.,gold,You are posed with a question answering task. ...,Gold.,1.000000,0.000000,1.0
...,...,...,...,...,...,...,...,...
10565,10565,The kilogram-force (kgf).,kilogram-force,You are posed with a question answering task. ...,The kilogram-force (kgf).,0.666667,0.113622,1.0
10566,10566,The kilogram-force is sometimes referred to as...,kilopond,You are posed with a question answering task. ...,The kilogram-force is sometimes referred to as...,0.181818,0.000000,1.0
10567,10567,The metric slug.,slug,You are posed with a question answering task. ...,The metric slug.,0.500000,0.000000,1.0
10568,10568,Kip.,kip,You are posed with a question answering task. ...,Kip.,1.000000,0.000000,1.0


In [4]:
low_rouge = df[(df['Rouge_Score'] < 0.5) & (df['Containing'] == 1)]
low_rouge.to_csv('low_rouge.csv', index=False)

In [5]:
wrong_answers = df[df['Containing'] == 0]
wrong_answers

Unnamed: 0,Index,Predicted,Correct,Prompt,Response,Rouge_Score,Bleu_Score,Containing
21,21,2016.,2015,You are posed with a question answering task. ...,2016.,0.000000,0.000000,0.0
43,43,Four teams.,4,You are posed with a question answering task. ...,Four teams.,0.000000,0.000000,0.0
47,47,The Panthers have been in the Super Bowl twice...,2,You are posed with a question answering task. ...,The Panthers have been in the Super Bowl twice...,0.000000,0.000000,0.0
56,56,Von Miller forced two forced fumbles in Super ...,2,You are posed with a question answering task. ...,Von Miller forced two forced fumbles in Super ...,0.000000,0.000000,0.0
58,58,Von Miller.,linebacker Von Miller,You are posed with a question answering task. ...,Von Miller.,0.800000,0.090697,0.0
...,...,...,...,...,...,...,...,...
10542,10542,The matrix diagonals of the tensor.,formalism,You are posed with a question answering task. ...,The matrix diagonals of the tensor.,0.000000,0.000000,0.0
10543,10543,Force.,rotational equivalent for position,You are posed with a question answering task. ...,Force.,0.000000,0.000000,0.0
10546,10546,Centripetal force goes toward the center of th...,toward the center of the curving path,You are posed with a question answering task. ...,Centripetal force goes toward the center of th...,0.705882,0.392815,0.0
10553,10553,Mechanical energy.,net mechanical energy,You are posed with a question answering task. ...,Mechanical energy.,0.800000,0.000000,0.0


In [6]:
import pandas as pd

# Assuming df is your DataFrame with the necessary columns
# Calculate the overall average and standard deviation for ROUGE and BLEU scores
avg_rouge = df['Rouge_Score'].mean()
std_rouge = df['Rouge_Score'].std()
avg_bleu = df['Bleu_Score'].mean()
std_bleu = df['Bleu_Score'].std()

# Calculate the accuracy based on the Containing column
accuracy = df['Containing'].mean() * 100  # Convert to percentage

# Calculate statistics for correct predictions (Containing == 1)
correct_df = df[df['Containing'] == 1]
avg_rouge_correct = correct_df['Rouge_Score'].mean()
std_rouge_correct = correct_df['Rouge_Score'].std()
avg_bleu_correct = correct_df['Bleu_Score'].mean()
std_bleu_correct = correct_df['Bleu_Score'].std()

# Calculate statistics for incorrect predictions (Containing == 0)
incorrect_df = df[df['Containing'] == 0]
avg_rouge_incorrect = incorrect_df['Rouge_Score'].mean()
std_rouge_incorrect = incorrect_df['Rouge_Score'].std()
avg_bleu_incorrect = incorrect_df['Bleu_Score'].mean()
std_bleu_incorrect = incorrect_df['Bleu_Score'].std()

# Print the results
print(f"Overall ROUGE Score: {avg_rouge:.4f} (std: {std_rouge:.4f})")
print(f"Overall BLEU Score: {avg_bleu:.4f} (std: {std_bleu:.4f})")
print(f"Accuracy: {accuracy:.2f}%")

print(f"Correct Predictions ROUGE Score: {avg_rouge_correct:.4f} (std: {std_rouge_correct:.4f})")
print(f"Correct Predictions BLEU Score: {avg_bleu_correct:.4f} (std: {std_bleu_correct:.4f})")

print(f"Incorrect Predictions ROUGE Score: {avg_rouge_incorrect:.4f} (std: {std_rouge_incorrect:.4f})")
print(f"Incorrect Predictions BLEU Score: {avg_bleu_incorrect:.4f} (std: {std_bleu_incorrect:.4f})")


Overall ROUGE Score: 0.6660 (std: 0.3371)
Overall BLEU Score: 0.1261 (std: 0.1727)
Accuracy: 85.55%
Correct Predictions ROUGE Score: 0.7182 (std: 0.3127)
Correct Predictions BLEU Score: 0.1338 (std: 0.1754)
Incorrect Predictions ROUGE Score: 0.3566 (std: 0.3099)
Incorrect Predictions BLEU Score: 0.0801 (std: 0.1476)


In [7]:
wrong_answers

Unnamed: 0,Index,Predicted,Correct,Prompt,Response,Rouge_Score,Bleu_Score,Containing
21,21,2016.,2015,You are posed with a question answering task. ...,2016.,0.000000,0.000000,0.0
43,43,Four teams.,4,You are posed with a question answering task. ...,Four teams.,0.000000,0.000000,0.0
47,47,The Panthers have been in the Super Bowl twice...,2,You are posed with a question answering task. ...,The Panthers have been in the Super Bowl twice...,0.000000,0.000000,0.0
56,56,Von Miller forced two forced fumbles in Super ...,2,You are posed with a question answering task. ...,Von Miller forced two forced fumbles in Super ...,0.000000,0.000000,0.0
58,58,Von Miller.,linebacker Von Miller,You are posed with a question answering task. ...,Von Miller.,0.800000,0.090697,0.0
...,...,...,...,...,...,...,...,...
10542,10542,The matrix diagonals of the tensor.,formalism,You are posed with a question answering task. ...,The matrix diagonals of the tensor.,0.000000,0.000000,0.0
10543,10543,Force.,rotational equivalent for position,You are posed with a question answering task. ...,Force.,0.000000,0.000000,0.0
10546,10546,Centripetal force goes toward the center of th...,toward the center of the curving path,You are posed with a question answering task. ...,Centripetal force goes toward the center of th...,0.705882,0.392815,0.0
10553,10553,Mechanical energy.,net mechanical energy,You are posed with a question answering task. ...,Mechanical energy.,0.800000,0.000000,0.0


In [25]:
i = 390
print(wrong_answers.iloc[i]['Prompt'])
print()
print("Response: " + wrong_answers.iloc[i]['Response'])
print("Correct: " + wrong_answers.iloc[i]['Correct'])

You are posed with a question answering task. You are given a context containing relevant information and a question about the content. The answer is contained within the context. Answer the question as concisely as possible. Do not add any extra context.

Context: Oxygen is present in the atmosphere in trace quantities in the form of carbon dioxide (CO
2). The Earth's crustal rock is composed in large part of oxides of silicon (silica SiO
2, as found in granite and quartz), aluminium (aluminium oxide Al
2O
3, in bauxite and corundum), iron (iron(III) oxide Fe
2O
3, in hematite and rust), and calcium carbonate (in limestone). The rest of the Earth's crust is also made of oxygen compounds, in particular various complex silicates (in silicate minerals). The Earth's mantle, of much larger mass than the crust, is largely composed of silicates of magnesium and iron.

Question: Aside from oxides, what other compounds comprise a large portion of the Earth's crust?

Response: Aside from oxides