In [9]:
!pip install ragas



In [10]:
import os
from typing import List

import torch
import transformers
from transformers import AutoTokenizer, AutoModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig

from langchain.text_splitter import TokenTextSplitter, Tokenizer, TextSplitter
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_core.embeddings import Embeddings

from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

In [77]:
# Hugging Face Hub ids of the embedding model and the generator model.
EMBD_MODEL = "sentence-transformers/all-mpnet-base-v2"
#EMBD_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
#GEN_MODEL = "meta-llama/Llama-2-13b-chat-hf"
GEN_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
# Never commit access tokens to a notebook: read the token from the
# environment instead (export HF_TOKEN=... before starting the kernel).
# The value is None when the variable is not set.
AUTH_TOKEN = os.environ.get("HF_TOKEN")
# Run on the current CUDA device when one is available, otherwise on the CPU.
DEVICE = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'

In [78]:
class CustomHuggingfaceEmbeddings(Embeddings):
    """LangChain ``Embeddings`` implementation backed by a Hugging Face encoder.

    The tokenizer and model are loaded with ``AutoTokenizer`` / ``AutoModel``
    and each text is embedded individually, optionally mean-pooling the
    token embeddings with the attention mask.
    """

    def __init__(self, model_id, api_key, **model_kwargs):
        """Store the settings and load tokenizer and model.

        Args:
            model_id: Model identifier handed to ``from_pretrained``.
            api_key: Stored on the instance; it is not used when loading the
                model in this class.
            **model_kwargs: Extra options. A ``device`` entry selects the
                device the model runs on; it may be given directly
                (``device=...``) or nested (``model_kwargs={"device": ...}``).
                Without it the module-level ``DEVICE`` is used.
        """
        super().__init__()
        self.model_name = model_id
        self.api_key = api_key
        # **model_kwargs is always a dict, so it is stored unconditionally.
        self.model_config = dict(model_kwargs)

        # Accept both the flat and the nested spelling of the device option.
        nested_options = self.model_config.get("model_kwargs")
        options = nested_options if isinstance(nested_options, dict) else self.model_config
        self.device = options.get("device", DEVICE)

        self.InitializeModel()

    def InitializeModel(self):
        """Load tokenizer and model from the Hugging Face Hub and move the model to ``self.device``."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # The kwargs dict used to be passed as `config=`; that parameter
        # expects a PretrainedConfig, not a plain dict, so it is no longer
        # passed there.
        self.model = AutoModel.from_pretrained(self.model_name)
        self.model.to(self.device)
        print("Model on CUDA: ", str(next(self.model.parameters()).is_cuda))

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings, taking the attention mask into account.

        Args:
            model_output: Model output whose first element contains all token
                embeddings.
            attention_mask: Mask tensor from the tokenizer; it is expanded to
                the size of the token embeddings.

        Returns:
            Sum of the masked token embeddings over dimension 1 divided by the
            mask sum (clamped to at least 1e-9 to avoid division by zero).
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        masked_sum = torch.sum(token_embeddings * input_mask_expanded, 1)
        mask_total = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return masked_sum / mask_total

    def embed_query(self, text: str, mean_pooling: bool = True) -> List[float]:
        """Embed a single text by delegating to ``embed_documents``."""
        return self.embed_documents([text], mean_pooling)[0]

    def embed_documents(self, texts: List[str], mean_pooling: bool = True) -> List[List[float]]:
        """Embed every text in ``texts``, one forward pass per text.

        Pitfalls: each text must fit within the maximum sequence length of
        the model to avoid an index error; no truncation is applied here and
        no special tokens are added.

        Args:
            texts: Texts to embed.
            mean_pooling: When True, the token embeddings are mean-pooled.
                When False, the first element of the raw model output is
                returned for each text.
                NOTE(review): that un-pooled result has more nesting than the
                annotated ``List[float]`` per text -- confirm whether any
                caller relies on ``mean_pooling=False``.

        Returns:
            One entry per input text, converted to plain Python lists.
        """
        embeddings = []
        with torch.no_grad():
            for text in texts:
                text_tokens = self.tokenizer(
                    text, return_tensors='pt', add_special_tokens=False
                ).to(self.device)
                model_output = self.model(**text_tokens)

                if mean_pooling:
                    # mean_pooling only reads the output, so no copy is needed.
                    vector = self.mean_pooling(model_output, text_tokens['attention_mask'])[0]
                else:
                    vector = model_output[0]

                embeddings.append(vector.cpu().tolist())
        return embeddings

    def split_text_on_tokens(self, *, text: str, tokenizer: Tokenizer) -> List[str]:
        """Split incoming text and return chunks using tokenizer.

        Consecutive chunks hold up to ``tokenizer.tokens_per_chunk`` token ids
        and start ``tokens_per_chunk - chunk_overlap`` ids apart.

        Raises:
            ValueError: If more than one chunk is needed but
                ``tokens_per_chunk`` is not greater than ``chunk_overlap``;
                the window could never advance and the loop would not end.
        """
        splits: List[str] = []
        input_ids = tokenizer.encode(text)
        stride = tokenizer.tokens_per_chunk - tokenizer.chunk_overlap
        start_idx = 0
        while start_idx < len(input_ids):
            cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
            splits.append(tokenizer.decode(input_ids[start_idx:cur_idx]))
            if cur_idx == len(input_ids):
                break
            if stride <= 0:
                raise ValueError("tokens_per_chunk must be greater than chunk_overlap")
            start_idx += stride
        return splits

In [79]:
# Model id and access token both come from the configuration cell; the token
# must not be repeated here as a literal.
model_id = GEN_MODEL
auth_token = AUTH_TOKEN

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    output_hidden_states=True,
    use_auth_token=auth_token,
)

# Load tokenizer and model. The tokenizer takes no device argument; model
# placement is handled by device_map="auto" below.
gpt_tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=auth_token)
gpt_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map="auto",
    use_auth_token=auth_token,
)

generation_pipeline = transformers.pipeline(
    model=gpt_model,
    tokenizer=gpt_tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # NOTE(review): temperature only influences sampling; confirm whether
    # do_sample=True was intended alongside this near-zero value.
    temperature=1e-8,  # 'randomness' of outputs
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1,  # without this output begins repeating
)

# Expose the pipeline as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=generation_pipeline)



Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [65]:
# Build the embedding model used for the evaluation; the device option is
# handed over inside the extra keyword arguments.
embeddings_model = CustomHuggingfaceEmbeddings(
    EMBD_MODEL,
    AUTH_TOKEN,
    model_kwargs={"device": DEVICE},
)

  return self.fget.__get__(instance, owner)()


Model on CUDA:  True


In [66]:
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
# Wrap the LangChain embedding model and LLM in the ragas adapter classes.
embeddings_ragas = LangchainEmbeddingsWrapper(embeddings_model)
generator_llm = LangchainLLMWrapper(llm)

In [7]:
from datasets import load_dataset

# Load the "english_v2" configuration of the Amnesty QA dataset.
amnesty_qa = load_dataset(
    path="explodinggradients/amnesty_qa",
    name="english_v2",
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [67]:
import pandas as pd
import datasets
from datasets import Dataset, DatasetDict
import pickle
import numpy as np

# Read the pickled test log from the working directory.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
test_log = None
with open("test_log", mode="rb") as log_file:
    test_log = pickle.load(log_file)

In [68]:
# Show the query stored in the first log entry.
test_log[0]["query"]

'Wie hat Leverkusen am 2023-11-25 in der 1.Bundesliga gespielt?'

In [69]:
# Number of entries in the test log.
len(test_log)

8

In [70]:
def _as_eval_record(log):
    """Map one test-log entry to a question / contexts / answer record.

    Each item of ``log["context"]`` is unpacked as a pair; only the
    ``page_content`` of its first element is kept.
    """
    return {
        "question": log["query"],
        "contexts": [document.page_content for document, _unused in log["context"]],
        "answer": log["inference"],
    }


# One evaluation record per entry of the test log.
eval_list = [_as_eval_record(log) for log in test_log]

In [71]:
# Check that the records convert cleanly into a Hugging Face Dataset
# (the result is only displayed, not stored).
datasets.Dataset.from_pandas(pd.DataFrame(data=eval_list))

Dataset({
    features: ['question', 'contexts', 'answer'],
    num_rows: 8
})

In [72]:
# Tabular view of the evaluation records, one row per log entry.
dataset = pd.DataFrame(eval_list)
dataset

Unnamed: 0,question,contexts,answer
0,Wie hat Leverkusen am 2023-11-25 in der 1.Bund...,[],Thanks for asking! Leverkusen has played agai...
1,Wie hat Leverkusen am 2023-11-25 in der 1.Bund...,[leverkusen hat die tabellenfuhrung in der fuß...,Leverkusen hat am 2023-11-25 in der 1.Bundesl...
2,Wie hat Leverkusen am 2023-11-25 in der 1.Bund...,[und weil niklas sule wegen einer grippe fehlt...,Leverkusen hat am 2023-11-25 in der 1.Bundesl...
3,Wie hat Leverkusen am 2023-11-25 in der 1.Bund...,[],Thanks for asking! Leverkusen has played agai...
4,Wie hat Leverkusen am 2023-11-25 in der 1.Bund...,[leverkusen hat die tabellenfuhrung in der fuß...,Leverkusen hat am 2023-11-25 in der 1.Bundesl...
5,Wie hat Leverkusen am 2023-11-25 in der 1.Bund...,[leverkusen hat die tabellenfuhrung in der fuß...,Leverkusen hat am 2023-11-25 in der 1.Bundesl...
6,Wie hat Leverkusen am 2023-11-25 in der 1.Bund...,[und weil niklas sule wegen einer grippe fehlt...,Leverkusen hat am 2023-11-25 in der 1.Bundesl...
7,Wie hat Leverkusen am 2023-11-25 in der 1.Bund...,[leverkusen hat die tabellenfuhrung in der fuß...,Leverkusen hat am 2023-11-25 in der 1.Bundesl...


In [73]:
# Keep only rows with at least one context, then drop rows with missing
# values and renumber. A boolean mask builds a new frame instead of
# overwriting the "contexts" column of the frame displayed above.
has_context = dataset["contexts"].map(len) > 0
dataset = dataset.loc[has_context].dropna().reset_index(drop=True)



In [74]:
# Only the first row of the filtered frame is evaluated.
ev = datasets.Dataset.from_pandas(dataset.iloc[:1])

In [80]:
# Score the sample on answer relevancy and faithfulness, using the wrapped
# local LLM and embedding model.
results = evaluate(
    ev,
    metrics=[answer_relevancy, faithfulness],
    llm=generator_llm,
    embeddings=embeddings_ragas,
)

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

Invalid JSON response. Expected dictionary with key 'question'
  value = np.nanmean(self.scores[cn])


In [82]:
# Convert the evaluation result to a DataFrame and show its first rows.
df = results.to_pandas()
df.head()

Unnamed: 0,question,contexts,answer,answer_relevancy,faithfulness
0,Wie hat Leverkusen am 2023-11-25 in der 1.Bund...,[leverkusen hat die tabellenfuhrung in der fuß...,Leverkusen hat am 2023-11-25 in der 1.Bundesl...,,


In [37]:
# Remove the `results` name from the namespace; later cells can no longer use it.
del results