In [15]:
import pandas as pd
from datasets import Dataset, Features, Sequence, Value
from langchain_core.language_models import BaseLanguageModel
from langchain_core.embeddings import Embeddings
from langchain_community.llms import Ollama
from langchain_community.embeddings import HuggingFaceEmbeddings
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_correctness
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Initialize Ollama LLM and Embeddings
llm_ans: BaseLanguageModel = Ollama(model="mistral", temperature = 0)
llm_eva: BaseLanguageModel = Ollama(model="llama3", temperature = 0)
embeddings: Embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [16]:
# Load the dataset from a local CSV file
df = pd.read_csv('halubench.csv')

# Ensure the dataset has the required columns: 'question', 'contexts', 'ground_truth'
required_columns = ['question', 'contexts', 'ground_truth']
if not all(col in df.columns for col in required_columns):
    raise ValueError(f"The dataset must contain these columns: {required_columns}")

# Convert contexts column to a list of strings if it's not already
df['contexts'] = df['contexts'].apply(lambda x: x if isinstance(x, list) else [x])

In [17]:
# Function to generate answer using LLaMA 3
def generate_answer(question, context):
    prompt = PromptTemplate(
        input_variables=["question", "context"],
        template="Context: {context}\n\nQuestion: {question}\n\nAnswer:"
    )
    chain = LLMChain(llm=llm_ans, prompt=prompt)
    return chain.run({"question": question, "context": context})

In [18]:
# Generate answers using LLaMA 3
df['answer'] = df.apply(lambda row: generate_answer(row['question'], row['contexts']), axis=1)


In [19]:
# Select required columns for the Hugging Face dataset
df_hf = df[required_columns + ['answer']]

# Define the schema for the Hugging Face dataset
features = Features({
    'question': Value('string'),
    'contexts': Sequence(Value('string')),
    'ground_truth': Value('string'),
    'answer': Value('string')
})


In [20]:
# Convert the pandas DataFrame to a Hugging Face Dataset with the specified schema
dataset = Dataset.from_pandas(df_hf, features=features)

# Evaluate using Ragas
results = evaluate(
    dataset=dataset,
    metrics=[faithfulness, answer_relevancy, answer_correctness],
    llm=llm_eva,
    embeddings=embeddings
)


Evaluating:   0%|          | 0/180 [00:00<?, ?it/s]

Failed to parse output. Returning None.
Exception raised in Job[172]: TimeoutError()
Failed to parse output. Returning None.
Exception raised in Job[140]: TimeoutError()
Failed to parse output. Returning None.
Exception raised in Job[175]: TimeoutError()
Exception raised in Job[74]: TimeoutError()
Exception raised in Job[142]: TimeoutError()
Exception raised in Job[25]: TimeoutError()
Exception raised in Job[47]: TimeoutError()
Exception raised in Job[176]: TimeoutError()
Exception raised in Job[143]: TimeoutError()
Exception raised in Job[34]: TimeoutError()
Failed to parse output. Returning None.
Exception raised in Job[110]: TimeoutError()
Exception raised in Job[178]: TimeoutError()
Exception raised in Job[179]: TimeoutError()
Exception raised in Job[113]: TimeoutError()
Exception raised in Job[41]: TimeoutError()
Exception raised in Job[14]: TimeoutError()
Exception raised in Job[13]: TimeoutError()
Exception raised in Job[82]: TimeoutError()
Exception raised in Job[116]: TimeoutE

In [23]:
# Print the results
print(results)

# Save the results to a CSV file
results.to_pandas().to_csv('halubench_results.csv', index=False)


{'faithfulness': 0.8499, 'answer_relevancy': 0.6872, 'answer_correctness': 0.5652}


In [22]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,contexts,question,ground_truth,label,source_ds,answer
0,0,d3fb4c3c-d21b-480a-baa0-98d6d0d17c1d,[Hoping to rebound from the road loss to the C...,Which team scored the longest field goal kick ...,"['Rams', 'second', 'Marc Bulger', 'Kevin Curtis']",FAIL,DROP,The team that kicked the longest field goal i...
1,1,8603663e-c53b-46db-a482-a867f12ff3b4,"[As of the census of 2000, there were 218,590 ...",How many percent were not Irish?,87.1,FAIL,DROP,"77.5% were not Irish (100% - 22.5%, which is t..."
2,2,c63a73e5-2c91-489b-bd24-af150ddfa82c,[Hoping to rebound from the road loss to the C...,How many yards was the second longest field go...,42,FAIL,DROP,41 yards (Jeff Wilkins made a 41-yard field goal)
3,3,52db14ed-5426-46ec-b0ae-4ef843b2d692,[Hoping to rebound from their tough overtime r...,How long was the last touchdown?,18-yard,FAIL,DROP,15 yards
4,4,31b36417-aad1-412c-b0e5-9c1faaed233f,"[As of the census of 2000, there were 218,590 ...",How many in percent from the census weren't Ir...,87.1,FAIL,DROP,77.5% of the population were not Irish (100% -...
