In [None]:
!pip install sentence-transformers
!pip install langchain-openai
!pip install pysbd
#!pip install ragas

In [3]:
import torch
from torch import cuda, bfloat16

import gc

from transformers import AutoTokenizer, BitsAndBytesConfig
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import UnstructuredPDFLoader, PyPDFLoader, TextLoader, DirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA
from langchain.evaluation import Criteria
from langchain.evaluation import load_evaluator, EvaluatorType

import transformers
from transformers import AutoTokenizer, BitsAndBytesConfig

from datasets import Dataset

import os
import pandas as pd

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
 )

In [4]:
def clear_gpu_memory():
    """Run Python garbage collection, then empty the CUDA cache.

    Prints a confirmation line once both steps have run and returns nothing.
    """
    # Order matters: collect unreachable objects first, then empty the cache.
    for release_step in (gc.collect, cuda.empty_cache):
        release_step()
    print("GPU memory has been freed.")


def report_memory_usage(cuda_device_id):
    """Print a short memory report for one CUDA device.

    Args:
        cuda_device_id: Device to inspect. It is forwarded unchanged to the
            ``torch.cuda`` query functions and echoed in the printed report.

    Every figure is printed in MiB. When CUDA is not available all figures
    are reported as 0 instead of raising.
    """
    bytes_per_mib = 1024 ** 2

    if torch.cuda.is_available():
        # (free, total) bytes on the device; only the free part is reported.
        free_bytes, _total_bytes = torch.cuda.mem_get_info(cuda_device_id)
        gpu_memory_info = torch.cuda.memory_stats(cuda_device_id)
        reserved_bytes = torch.cuda.memory_reserved(cuda_device_id)
    else:
        free_bytes, gpu_memory_info, reserved_bytes = 0, {}, 0

    # Print all keys in gpu_memory_info
    print("Keys in gpu_memory_info:", gpu_memory_info.keys())

    # Allocator statistics are keyed as "<stat>.<pool>.<metric>"; the "all"
    # pool aggregates the small and large pools.
    free_memory = free_bytes / bytes_per_mib
    peak_allocated = gpu_memory_info.get('allocated_bytes.all.peak', 0) / bytes_per_mib
    process_memory = gpu_memory_info.get('allocated_bytes.all.current', 0) / bytes_per_mib

    print(f"{free_memory:.2f} MiB is free.")
    print(f"Peak memory allocated by PyTorch: {peak_allocated:.2f} MiB.")
    print(f"Process on cuda device {cuda_device_id} has {process_memory:.2f} MiB memory in use.")

    print(f"Total GPU memory reserved by PyTorch: {reserved_bytes / bytes_per_mib:.2f} MiB")

In [5]:
def prepare_model(model_id, cuda_device='cuda:0', max_length=512):
    """Load a 4-bit quantized causal language model and its tokenizer.

    Args:
        model_id: Identifier handed to the ``from_pretrained`` loaders.
        cuda_device: Passed as ``device_map`` when the model is loaded.
        max_length: Accepted for compatibility; not used by this function.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # 4-bit NF4 weights with double quantization, computing in bfloat16.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=bfloat16,
    )

    load_kwargs = dict(
        trust_remote_code=True,
        config=transformers.AutoConfig.from_pretrained(model_id),
        quantization_config=quantization,
        device_map=cuda_device,
    )
    model = transformers.AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    return model, tokenizer

def prepare_pipeline(model, tokenizer, cuda_device='cuda:1', max_new_tokens=512, temperature=1e-4, repetition_penalty=1.1):
    """Wrap a loaded model and tokenizer as a LangChain text-generation LLM.

    Args:
        model: Model object handed to ``transformers.pipeline``.
        tokenizer: Tokenizer matching ``model``; supplies the EOS/pad token ids.
        cuda_device: Forwarded to the pipeline as ``device_map``.
        max_new_tokens: Upper bound on generated tokens per call.
        temperature: Sampling temperature (sampling is enabled).
        repetition_penalty: Repetition penalty forwarded to generation.

    Returns:
        A ``HuggingFacePipeline`` wrapping the configured pipeline.
    """
    # Some tokenizers define no pad token, which would forward pad_token_id=None.
    # Use EOS for padding in that case so generation gets an explicit id.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    query_pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        device_map=cuda_device,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=pad_token_id,
        repetition_penalty=repetition_penalty,
        return_full_text=True,
    )

    return HuggingFacePipeline(pipeline=query_pipeline)

def get_answers(qa, retriever, var_values, questions=None, ground_truth=None):
    """Run a QA chain over a question set and collect one result row per question.

    Args:
        qa: Object whose ``invoke(question)`` returns a mapping with a
            ``"result"`` entry (the generated answer).
        retriever: Object whose ``get_relevant_documents(question)`` returns
            documents exposing ``page_content`` and ``metadata``.
        var_values: Mapping with the experiment settings ``model_id``,
            ``emb_function``, ``chunk_size``, ``chunk_overlap``,
            ``textsplitter``, ``vectordb``, ``retriever_k`` and ``temp_prompt``;
            each value is repeated on every row.
        questions: Optional list of questions. When omitted together with
            ``ground_truth`` the built-in three-question benchmark is used.
        ground_truth: Optional reference answers, one per question.

    Returns:
        ``(df, dataset)``: the ``Dataset`` built from the collected columns and
        its pandas conversion.

    Raises:
        ValueError: If only one of ``questions`` / ``ground_truth`` is given or
            their lengths differ.
        KeyError: If ``var_values`` lacks one of the expected settings.

    NOTE(review): the retriever is queried separately from ``qa.invoke``, so the
    recorded contexts are presumably, but not provably, the ones the chain
    used. Confirm against how ``qa`` was built.
    """
    if questions is None and ground_truth is None:
        questions = ["What did the president say about Justice Breyer?",
                     "What did the president say about Intel's CEO?",
                     "What did the president say about gun violence?",
                     ]
        ground_truth = ["The president said that Justice Breyer has dedicated his life to serve the country and thanked him for his service.",
                        "The president said that Pat Gelsinger is ready to increase Intel's investment to $100 billion.",
                        "The president asked Congress to pass proven measures to reduce gun violence."]
    elif questions is None or ground_truth is None:
        raise ValueError("questions and ground_truth must be supplied together")

    questions = list(questions)
    ground_truth = list(ground_truth)
    if len(questions) != len(ground_truth):
        raise ValueError("questions and ground_truth must have the same length")

    # Settings copied onto every row; also the column order of the output.
    config_keys = (
        "model_id",
        "emb_function",
        "chunk_size",
        "chunk_overlap",
        "textsplitter",
        "vectordb",
        "retriever_k",
        "temp_prompt",
    )

    answers = []
    contexts = []
    sources = []

    for question in questions:
        answers.append(qa.invoke(question)["result"])
        # One retriever call per question, reused for both text and source.
        documents = retriever.get_relevant_documents(question)
        contexts.append([document.page_content for document in documents])
        # A document without a "source" entry yields None instead of a KeyError.
        sources.append([document.metadata.get("source") for document in documents])

    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": ground_truth,
        "sources": sources,
    }
    for key in config_keys:
        data[key] = [var_values[key]] * len(questions)

    dataset = Dataset.from_dict(data)
    df = dataset.to_pandas()

    return df, dataset


# merge_axis: 0: merge rows, 1: merge columns
def merge_datasets(dataset_list, merge_axis=0):
    """Concatenate several datasets into one frame and one ``Dataset``.

    Args:
        dataset_list: Objects exposing ``to_pandas()``.
        merge_axis: ``0`` stacks rows, ``1`` places the frames side by side.

    Returns:
        ``(merged_df, merged_dataset)`` with a fresh 0..n-1 row index, or
        ``(None, None)`` when ``dataset_list`` is empty.
    """
    if len(dataset_list) == 0:
        return None, None

    frames = [dataset.to_pandas() for dataset in dataset_list]
    # One concat over all frames; growing the result pairwise in a loop
    # re-copies the accumulated rows on every step.
    merged_df = pd.concat(frames, axis=merge_axis).reset_index(drop=True)
    return merged_df, Dataset.from_pandas(merged_df)
    

def save_df(df,dir,file_name):
    """Write ``df`` as CSV to ``dir/file_name``, creating ``dir`` when needed.

    Args:
        df: Object exposing ``to_csv(path)``.
        dir: Target directory; an empty string means the current directory.
        file_name: Name of the CSV file inside ``dir``.
    """
    # exist_ok avoids the check-then-create race. The guard skips makedirs for
    # an empty path, which it would reject.
    if dir:
        os.makedirs(dir, exist_ok=True)

    df.to_csv(os.path.join(dir, file_name))

In [6]:
# Directory and file names for CSV exports. In the cells shown here only a
# commented-out `save_df(...)` call refers to them.
save_dir = "results"
save_file_name = "results_multi_query.csv"
save_file_name_eval = "eval_results_multi_query.csv"


In [7]:
# Use the currently selected CUDA device when CUDA is available, else the CPU.
cuda_device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'
# The evaluator model is placed on the same device.
eval_device = cuda_device

# Identifier passed to prepare_model() for the evaluator LLM.
# NOTE(review): presumably a Hugging Face Hub id or a local path; confirm it resolves.
eval_model_id = "mistralai/Mixtral-8x7B"

# NOTE(review): not referenced in the cells visible here; presumably loader
# keyword arguments for a TextLoader/DirectoryLoader. Confirm or remove.
text_loader_kwargs = {'autodetect_encoding': True}

## Evaluation

In [8]:
# Load the evaluator model and wrap it as a LangChain LLM that may generate up
# to 4096 new tokens per call.
eval_model, eval_tokenizer = prepare_model(eval_model_id,eval_device)
eval_llm = prepare_pipeline(eval_model, eval_tokenizer, eval_device, max_new_tokens=4096)

# Embedding model passed to the ragas `evaluate` call later in the notebook.
embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

2024-04-13 08:09:06.591467: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  if not is_torch_available():


### Load dataset from file (if needed)

In [None]:
import pickle
import numpy as np
from datasets import Dataset, DatasetDict
semantic_chunking_both_gens = None
# SECURITY: pickle.load can execute arbitrary code while deserializing. Only
# open files produced by a trusted run of this project.
with open("multi_query_test_sk8x7B", 'rb') as fp:
    semantic_chunking_both_gens = pickle.load(fp)
test_log = semantic_chunking_both_gens
# Flatten each logged run into one evaluation record.
# NOTE(review): assumes every log entry is a mapping with the keys read below
# and that log["context"] holds 2-tuples whose first element has
# `.page_content` (presumably (document, score) pairs). Confirm against the
# code that wrote the pickle.
eval_list = [
    {
        "question": log["query"],
        "contexts": [doc.page_content for doc, _ in log["context"]],
        "ground_truth": log["kwargs"]["ground_truths"],
        "answer": log["inference"],
        "chunk_strategie": log["kwargs"]["chunk_strategie"],
        "multi_query": log["multi_query"],
        "documents_in_scope": log["documents_in_scope"],
        "retrieve_methods": log["retrieve_methods"]
    }
    for log in test_log#.to_dict("records")
]
# Despite its name, `dataset` is a pandas DataFrame at this point.
dataset = pd.DataFrame(data=eval_list)
dataset

In [21]:
# Export the assembled table to an Excel workbook. The path is relative, so the
# file is written to the current working directory.
dataset.to_excel("multi_query_test_sk8x7B.xlsx")

In [24]:
# Replace empty context lists with NaN so those rows can be dropped below.
dataset["contexts"] = dataset.apply(lambda x: x["contexts"] if len(x["contexts"]) > 0 else np.nan, axis = 1)
# NOTE(review): dropna() without `subset` removes rows that have a missing
# value in ANY column, not only in "contexts". Confirm this is intended.
dataset = dataset.dropna()
dataset.reset_index(inplace=True, drop=True)
# The same rows in two forms: `merged_df` (an alias of the DataFrame, not a
# copy) feeds the LangChain loop, `merged_dataset` feeds ragas.
merged_df = dataset
merged_dataset = Dataset.from_pandas(dataset)
merged_dataset

Dataset({
    features: ['question', 'contexts', 'ground_truth', 'answer', 'chunk_strategie', 'retrieve_methods'],
    num_rows: 328
})

### Ragas metrics

In [None]:
from langchain_core.callbacks import StdOutCallbackHandler
# Only referenced by the commented-out `callbacks` argument below.
handler = StdOutCallbackHandler()
# Run ragas with context_precision and context_recall only (faithfulness and
# answer_relevancy are commented out), using the local evaluator LLM and the
# embedding model loaded earlier.
ragas_result_dataset = evaluate(
     dataset = merged_dataset, 
     metrics=[
         context_precision,
         context_recall,
         #faithfulness,
         #answer_relevancy,
     ],
    #callbacks = [handler],
    llm=eval_llm,
    embeddings=embedding_function
)

# Pandas view of the ragas result for inspection in the next cell.
ragas_result_df = ragas_result_dataset.to_pandas()

In [None]:
ragas_result_df

### LangChain metrics

Criteria available in `langchain.evaluation.Criteria`:
[<Criteria.CONCISENESS: 'conciseness'>,
 <Criteria.RELEVANCE: 'relevance'>,
 <Criteria.CORRECTNESS: 'correctness'>,
 <Criteria.COHERENCE: 'coherence'>,
 <Criteria.HARMFULNESS: 'harmfulness'>,
 <Criteria.MALICIOUSNESS: 'maliciousness'>,
 <Criteria.HELPFULNESS: 'helpfulness'>,
 <Criteria.CONTROVERSIALITY: 'controversiality'>,
 <Criteria.MISOGYNY: 'misogyny'>,
 <Criteria.CRIMINALITY: 'criminality'>,
 <Criteria.INSENSITIVITY: 'insensitivity'>,
 <Criteria.DEPTH: 'depth'>,
 <Criteria.CREATIVITY: 'creativity'>,
 <Criteria.DETAIL: 'detail'>]

In [None]:
from tqdm import tqdm
from langchain.evaluation.criteria.eval_chain import (
    CriteriaEvalChain,
    LabeledCriteriaEvalChain,
)

# Criteria to score; this order is also the column order of the result frame.
langchain_eval_metrics = [
    "conciseness", # Is the submission concise and to the point?
    "relevance", # Is the submission referring to a real quote from the text?
    "correctness", # Is the submission correct, accurate, and factual?
    "helpfulness", # Is the submission helpful, insightful, and appropriate?
    "detail" # Does the submission demonstrate attention to detail?   
]

# criterias = [crit.value for crit in list(Criteria)]
eval_results = []

for eval_metric in tqdm(langchain_eval_metrics):
    # Every criterion uses the reference-aware evaluator: the string
    # "labeled_criteria" and EvaluatorType.LABELED_CRITERIA select the same
    # one, so a single call covers what used to be two identical branches.
    # (Swap in "criteria" here to score without the reference answer.)
    evaluator = load_evaluator(
        EvaluatorType.LABELED_CRITERIA,
        llm=eval_llm,
        criteria=eval_metric,
    )

    # One score per sample for the current criterion.
    eval_metric_results = []

    for index, row in merged_df.iterrows():
        eval_result = evaluator.evaluate_strings(
            input=row["question"],
            prediction=row["answer"],
            reference=row["ground_truth"],
        )
        eval_metric_results.append(eval_result["score"])

    eval_results.append(eval_metric_results)

print(eval_results)
# eval_results is metric-major; zip(*...) transposes it to one row per sample.
langchain_eval_df = pd.DataFrame(list(zip(*eval_results)), columns=langchain_eval_metrics)
langchain_eval_dataset = Dataset.from_pandas(langchain_eval_df)

# Place the ragas scores and the LangChain criteria scores side by side.
all_eval_results_df, all_eval_results_dataset = merge_datasets([ragas_result_dataset,langchain_eval_dataset],1)
#save_df(all_eval_results_df,save_dir,save_file_name_eval)

In [None]:
all_eval_results_df

In [None]:
ragas_result_df

In [18]:
# Persist the combined evaluation table with pickle.
# NOTE(review): the file name mentions Llama-2-13b while `eval_model_id` in this
# notebook is set to a Mixtral model. Confirm the name is still accurate.
with open("LC_Crit_Llama-2-13b", 'wb') as fp:
    pickle.dump(all_eval_results_df, fp)
    print("Done")

Done
