# MLflow test
This notebook is a trial of MLflow's RAG evaluation.

In [None]:
# # These packages may need to be installed via 'pip' or 'poetry add'
# !pip install chromadb mlflow textstat

In [None]:
import pandas as pd
import mlflow

from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_openai import OpenAI, OpenAIEmbeddings
from mlflow.metrics.genai import relevance, faithfulness

from dotenv import load_dotenv

# Load settings from a .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# OpenAI clients created in later cells — confirm.
load_dotenv()

### 1. Load PDF, chunk, embed and store

In [None]:
# Build the vector store used for retrieval: load the PDF, split it into
# chunks, embed the chunks and index them in Chroma.
# NOTE(review): the path contains a literal "*" — presumably a redacted
# username; substitute a real path before running.
loader = PyPDFLoader("/Users/*/Documents/test_files/Science-Trust-and-Policy-report_TEST.pdf")

documents = loader.load()
# Splitter is configured with chunk_size=500 and no overlap between chunks.
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# Embed the chunks with OpenAIEmbeddings and store them in a Chroma index.
embeddings = OpenAIEmbeddings()
docsearch = Chroma.from_documents(texts, embeddings)

### 2. Define a function that takes each question, retrieves relevant chunks and collects the model's answer

In [None]:
# Retrieval QA chain over the Chroma index (`docsearch`).
# The OpenAI LLM is created with temperature=0 and the chain uses
# chain_type="stuff".
# return_source_documents=True is set so the chain output carries the
# retrieved documents; the evaluation cell maps "context" to
# "source_documents".
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
)


def model(input_df):
    """Run the QA chain on the "question" value of every row.

    Args:
        input_df: DataFrame that has a "question" column.

    Returns:
        A list holding one `qa(...)` output per row, in row order.
    """
    return [qa(record["question"]) for _, record in input_df.iterrows()]

### 3. Load test dataset derived from PDF at step 1
This dataset has been shortened and filtered to keep costs down. All you need are questions and ground-truth answers. These could be written by humans or generated with an LLM-based tool, e.g. RAGAS.

In [None]:
# Load the test set and reduce it to what the evaluation needs: the question
# and its reference answer, exposed under the column name "ground_truth".
eval_df = pd.read_excel("/Users/*/Documents/test_files/testset.xlsx")

# Select the "stp" rows and the two needed columns in a single .loc call and
# rename "answer", instead of assigning a new column onto a filtered slice
# (which triggers pandas' SettingWithCopyWarning). Only the first two rows
# are kept after the index is reset.
eval_df = (
    eval_df.loc[eval_df["document"] == "stp", ["question", "answer"]]
    .rename(columns={"answer": "ground_truth"})
    .reset_index(drop=True)
    .iloc[:2]
)

### 4. Define additional metrics

In [None]:
# Extra metrics built from mlflow.metrics.genai, both configured with the
# "openai:/gpt-3.5-turbo" model. They are handed to mlflow.evaluate through
# its extra_metrics argument.
faithfulness_metric = faithfulness(model="openai:/gpt-3.5-turbo")
relevance_metric = relevance(model="openai:/gpt-3.5-turbo")

### 5. Run experiment and log results

In [None]:
# Evaluate the `model` function on `eval_df` under the "test" experiment,
# then print the aggregate metrics and show the per-row results table.
mlflow.set_experiment("test")

results = mlflow.evaluate(
    model,
    data=eval_df,
    model_type="question-answering",
    evaluators="default",
    # NOTE(review): "result" is presumably the answer field of each qa output
    # returned by `model` — confirm against the chain's output keys.
    predictions="result",
    # Reference answers come from the "ground_truth" column of eval_df.
    targets="ground_truth",
    extra_metrics=[faithfulness_metric, relevance_metric],
    evaluator_config={
        # Map the metric inputs to this notebook's names: "inputs" is the
        # "question" column, "context" is the "source_documents" field.
        "col_mapping": {
            "inputs": "question",
            "context": "source_documents",
        },
        # NOTE(review): confirm the default evaluator actually consumes
        # "system_prompt" and "k" for this model_type.
        "system_prompt": "Provide an answer in 3 sentences based on the context.",
        "k": 1,
    },
)
print(results.metrics)
# NOTE(review): `display` is not imported in this file — presumably the
# IPython/Jupyter builtin; this line will fail outside a notebook.
display(results.tables["eval_results_table"])

# End whichever MLflow run is currently active.
mlflow.end_run()

### 6. Save results as CSV

In [None]:
# Write the per-row evaluation table to a CSV file.
# NOTE(review): the path contains a literal "*" — presumably a redacted
# username; substitute a real path before running.
results.tables["eval_results_table"].to_csv(
    "/Users/*/Documents/code/redbox-copilot/notebooks/evaluation/data/evaluation_files/test_1.csv"
)