In [None]:
## Experiment Notebook for RAG Approach with Ollama and Gemma Model

# Setup Environment
import os
import pandas as pd
from gemma import (
    create_dataset, 
    generate_faiss_vectorstores, 
    run_experiment, 
    generate_results
)
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Embedding backend: the model is selected by its Hugging Face identifier,
# and the "cpu" device is requested through model_kwargs.
embedding_model_name = "allenai/scibert_scivocab_uncased"
model_kwargs = dict(device="cpu")
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs=model_kwargs)

# Experiment configuration
# NOTE(review): both values are forwarded to create_dataset; going by the names,
# presumably a per-category sample count and a held-out fraction — confirm there.
n_samples_per_category = 2_000
test_size = 0.1

# Column groups combined into the feature sets under comparison.
text_columns = ["Title", "Abstract"]
readability_columns = ["Flesch Reading Ease", "Gunning Fog Index"]
feature_sets = [
    list(text_columns),                  # text only
    text_columns + readability_columns,  # text + readability scores
]

# Values of `k` used for retrieval; only k=5 is evaluated in this run.
k_values = [5]

# Load the source CSV and split it into the retrieval knowledge base and the
# evaluation set; create_dataset returns the pair unpacked below.
dataset_path = "../../Dataset Creation/2018-2022 Data.csv"
df_knowledge_base, df_test = create_dataset(
    dataset_path,
    n_samples_per_category,
    test_size,
)

In [None]:
# Build the FAISS vector stores from the knowledge base for the configured
# feature sets; max_workers=4 is passed through for parallel processing.
vectorstores = generate_faiss_vectorstores(
    feature_sets,
    df_knowledge_base,
    embeddings,
    max_workers=4,
)

In [None]:
# Run Experiment
# results_path is handed to run_experiment and read back by generate_results in
# the next cell, so it is presumably where the predictions are written —
# confirm in gemma.run_experiment.
results_path = "../../results/gemma rag predictions.csv"

# Create the output folder up front so that a missing "results" directory
# cannot fail the (long) run at write time; this is a no-op when it exists.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

run_experiment(feature_sets, k_values, df_test, vectorstores, results_path)

In [None]:
# Generate and Display Results
# generate_results receives the same path that was passed to run_experiment;
# presumably it loads the saved predictions and computes metrics from them —
# confirm in gemma.generate_results.
metrics_results = generate_results(results_path)
# Bare trailing expression: the notebook renders the first rows as the cell output.
metrics_results.head()

In [None]:

# Write the metrics table to disk as CSV, leaving out the index column.
metrics_path = "../../results/gemma rag metrics.csv"
metrics_results.to_csv(metrics_path, index=False)