<a href="https://www.kaggle.com/code/tarekyahia/generative-question-answer-system-with-haystack?scriptVersionId=168285325" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

### besmi allah

## Libraries 

In [None]:
# Install the dependencies: latest pip, PEFT, evaluate, rouge_score and Haystack
!pip install --upgrade pip

!pip install \
peft \
evaluate==0.4.0 \
rouge_score==0.1.2 \

!pip install 'farm-haystack[all]' ## or 'all-gpu' for the GPU-enabled dependencies


In [None]:
import os
import numpy as np 
import pandas as pd
import re


from haystack.utils import convert_files_to_docs
from haystack.nodes import PreProcessor
from haystack.nodes import PromptNode, PromptTemplate, AnswerParser
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.pipelines import Pipeline
from haystack.nodes.sampler import TopPSampler
from haystack.nodes.ranker import LostInTheMiddleRanker

from evaluate import load
metric = load("rouge")

from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer,AutoTokenizer
import torch
from peft import PeftModel, PeftConfig


import logging

logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)



## Knowledge Base :

In [None]:

# Root folder of the dataset that contains the books.
BOOK_DIR = '/kaggle/input/mlbookspdfs'

# Walk the folder tree: print one separator line per directory visited,
# then the name of every file found in that directory.
for _folder, _subfolders, book_names in os.walk(BOOK_DIR):
    print('-------------------------------------------')
    for book_name in book_names:
        print(book_name)

## Preprocessing

- PDFs to TXT and cleaning

In [None]:
def clean_text(text: str) -> str:
    """
    Clean the raw text of one converted file and flatten it onto a single line.

    Note: The source code for the function is a combination of clean_wiki_text
    (available in haystack) and some custom cleanup.

    Steps, in order:
      1. collapse runs of consecutive newlines into one,
      2. drop lines of 30 characters or fewer, unless they look like a wiki
         section title (``==Some Title==``),
      3. separate section titles into paragraphs and drop empty ones,
      4. remove "Table of Contents", digit + form-feed markers, newline + digit
         markers, literal "/n" sequences and zero-width spaces,
      5. replace the remaining line breaks with a single space so that the last
         word of one line is not fused with the first word of the next.

    :param text: raw text to clean.
    :return: the cleaned text, without newline characters.
    """
    # get rid of multiple new lines
    text = re.sub(r"\n{2,}", "\n", text)

    # remove extremely short lines (wiki-style section titles are always kept)
    kept_lines = [
        line
        for line in text.split("\n")
        if len(line) > 30 or (line[:2] == "==" and line[-2:] == "==")
    ]
    text = "\n".join(kept_lines)

    # add paragraphs (identified by wiki section title which is always in format "==Some Title==")
    text = text.replace("\n==", "\n\n\n==")

    # remove empty paragraphs (a title immediately followed by the next paragraph break)
    text = re.sub(r"(==.*==\n\n\n)", "", text)

    # custom

    # Remove "Table of Contents" (any letter case)
    text = re.sub(r'Table of Contents', '', text, flags=re.IGNORECASE)

    # Remove patterns like 4\x0c: optional digits followed by a form feed.
    # NOTE(review): the \b anchors mean a form feed that is not adjacent to a
    # word character on the required side is left in place -- confirm this is intended.
    text = re.sub(r'\b\d*\x0c\b', '', text)

    # Remove a newline (real, or the two literal characters backslash + n) together
    # with the digits that follow it, and every literal "/n" sequence.
    text = re.sub(r'(\\n|\n)\d+|/n', '', text)

    # Replace the remaining line breaks with a space. Deleting them outright
    # would glue words together across lines ("assume\nthat" -> "assumethat").
    text = re.sub(r"\n+", " ", text)

    # remove zero-width spaces (\u200b)
    text = text.replace('\u200b', '')

    return text

In [None]:
# Convert the files under BOOK_DIR into documents; clean_text is passed as the
# cleaning callback (clean_func).
# Alternative input directory, kept for reference:
#all_docs = convert_files_to_docs(dir_path="/kaggle/input/ng-machinelearningyearning", clean_func=clean_text)

all_docs = convert_files_to_docs(dir_path=BOOK_DIR,clean_func=clean_text)

In [None]:
# Number of documents returned by convert_files_to_docs.
print(len(all_docs))

- Split into chunks and apply some more cleaning

In [None]:
# Configure the PreProcessor: the clean_* flags request extra cleanup, and the
# split_* arguments request word-based chunks of length 100 with an overlap of 4
# that respect sentence boundaries.
processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    remove_substrings=None,
    split_by="word",
    split_length=100, # suitable for the dense vector
    split_respect_sentence_boundary=True,
    split_overlap= 4,
    max_chars_check = 10_000,
    progress_bar = True
)
# Run the converted documents through the PreProcessor; `docs` holds the result.
docs = processor.process(all_docs)

In [None]:
# Compare the number of documents fed to the PreProcessor with the number it produced.
print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")

In [None]:
# look at example: display one processed chunk
sample = docs[5]
sample
# NOTE(review): the two strings below look like artifacts spotted in the chunk
# text (digit + form feed, and a line break inside a sentence) -- confirm
# against the displayed sample.
# 4\x0c
# assume\nthat

## Retrieval
- Using a FAISS document store to work with embeddings

In [None]:
# Create the FAISS document store with a "Flat" index factory string.
# NOTE(review): sql_url="sqlite:///" presumably selects an in-memory SQLite
# database, so nothing is persisted between sessions -- confirm.
document_store = FAISSDocumentStore(sql_url="sqlite:///", faiss_index_factory_str="Flat")

# write the docs to the DB
document_store.write_documents(docs)

In [None]:
# Embedding-based retriever over document_store, using a sentence-transformers
# model and configured with top_k = 10.
retriever = EmbeddingRetriever(
    document_store=document_store, 
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    max_seq_len = 256,
    batch_size = 32,
    top_k = 10
)
# Important:
# Now that we initialized the Retriever, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.
# While this can be a time consuming operation (depending on the corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it to the existing document embeddings, which is very fast.
document_store.update_embeddings(retriever)


## Generative

In [None]:
# Checkpoint used for the zero-shot answers (an alternative is left in the trailing comment).
model_name = 'google/flan-t5-base' #'facebook/bart-large-cnn'

# Load the seq2seq model, requesting bfloat16 weights.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name,torch_dtype = torch.bfloat16)

# Tokenizer for the same checkpoint; model_predict reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_name)


## RAG Integration

In [None]:
# pipeline


def model_predict(model,prompt):
    """
    Generate an answer for `prompt` with `model` and return it as text.

    The prompt is tokenized with the module-level `tokenizer`, passed to
    `model.generate` with its default generation settings, and the first
    generated sequence is decoded with special tokens stripped.
    """
    encoded_prompt = tokenizer(prompt, return_tensors="pt")
    # A custom GenerationConfig (e.g. max_new_tokens=200, num_beams=1) could be passed here.
    generated = model.generate(input_ids=encoded_prompt.input_ids)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


# Holds the pipeline built for the current global `retriever`, so the nodes are
# constructed once instead of once per question.
_RAG_PIPELINE_STATE = {"retriever": None, "pipeline": None}


def _build_rag_pipeline(active_retriever):
    """Assemble the Retriever -> Sampler -> LostInTheMiddleRanker pipeline."""
    pipeline = Pipeline()
    pipeline.add_node(component=active_retriever, name="Retriever", inputs=["Query"])
    pipeline.add_node(component=TopPSampler(top_p=0.90), name="Sampler", inputs=["Retriever"])
    pipeline.add_node(component=LostInTheMiddleRanker(1024), name="LostInTheMiddleRanker", inputs=["Sampler"])
    return pipeline


def RAG_pipeline(question):
    """
    Run the retrieval pipeline for `question` and return the pipeline output.

    The pipeline (retriever, top-p sampler, lost-in-the-middle ranker) is built
    on first use and reused afterwards; previously every call created a new
    Pipeline, a new TopPSampler and a new ranker.
    NOTE(review): TopPSampler is expected to load its scoring model when it is
    constructed, which is what makes rebuilding it per question expensive --
    confirm against the Haystack documentation.

    The cached pipeline is rebuilt whenever the module-level `retriever` is
    rebound to a different object (e.g. after re-running the retriever cell).

    :param question: the query string passed to `Pipeline.run(query=...)`.
    :return: whatever `Pipeline.run` returns; callers read its 'documents' entry.
    """
    state = _RAG_PIPELINE_STATE
    if state["pipeline"] is None or state["retriever"] is not retriever:
        state["pipeline"] = _build_rag_pipeline(retriever)
        state["retriever"] = retriever
    return state["pipeline"].run(query=question)
    
    
# Every prompt sent to a model is appended here so it can be inspected afterwards.
prompts = []


def RAG_Predict(model, question):
    """
    Answer `question` with `model`, using retrieved passages as context.

    The passages returned by RAG_pipeline are joined with a newline between
    them (they were previously concatenated with no separator, which fused the
    last word of one passage with the first word of the next), inserted into
    the prompt template, and the prompt is recorded in the module-level
    `prompts` list before generation.

    :param model: model forwarded to model_predict.
    :param question: the question to answer.
    :return: the text returned by model_predict for the built prompt.
    """
    result = RAG_pipeline(question)

    context = "\n".join(doc.content for doc in result['documents'])

    prompt = f"""Answer the following question based on the context: \n Question: {question} \n Context: \n {context}  \n Answer: """
    prompts.append(prompt)

    return model_predict(model,prompt)


## test zero shot

In [None]:
# look at the output before finetuning
# The same 10 questions are asked to the base model and, later, to the PEFT model.
questions = [
    "What is machine learning?",
    "What is the difference between supervised and unsupervised learning?",
    "What is a neural network?",
    "What is the purpose of activation functions in neural networks?",
    "What is overfitting in machine learning?",
    "What is gradient descent?",
    "What is the difference between classification and regression in machine learning?",
    "What is a convolutional neural network (CNN) used for?",
    "What is transfer learning in deep learning?",
    "What is the purpose of regularization techniques in machine learning?"
]

In [None]:
# Ask the base model each of the 10 questions and keep its answers in order.
answers = [RAG_Predict(model, asked) for asked in questions]

# Pair every question with its answer in a table.
base_answers = pd.DataFrame({'questions' : questions, 'answers' : answers})

# Lift the column-width limit so long answers are displayed in full.
pd.set_option('display.max_colwidth', None)
base_answers

## evaluate:

In [None]:
# ChatGPT answers -> I used the same prompt with the same context
# to evaluate with ROUGE and semantic similarity

chatGPT_answers = [
    "Machine learning is about designing algorithms that automatically extract valuable information from data. It emphasizes automatic extraction of meaningful information from data without much domain-specific expertise, aiming for general-purpose methodologies applicable to various datasets.",
    "Supervised learning involves observing several examples of a random vector x and an associated value or vector y, and learning to predict y from x. In contrast, unsupervised learning involves observing several examples of a random vector x without associated values and attempting to implicitly or explicitly learn the probability distribution p(x) or some interesting properties of that distribution.",
    "Neural networks, specifically feedforward networks, are composed of interconnected functions represented by a directed acyclic graph. They are organized into layers, with each layer being a function of the preceding one. Hidden unit design and determining the architecture are key considerations in neural network design.",
    "The activation function in neural networks, such as the rectified linear unit (ReLU), is applied element-wise to the output of linear transformations. It introduces nonlinearity to the network, enabling it to learn complex patterns and relationships in the data.",
    "Overfitting in machine learning occurs when a model memorizes the training data excessively, leading to poor performance on unseen data (test set). It happens when the model has high capacity and fits the noise in the training data rather than capturing the underlying patterns.",
    "Gradient descent is a first-order optimization algorithm used to find a local minimum of a function by taking steps proportional to the negative gradient of the function at the current point. Stochastic gradient descent is an extension that uses a subset of training examples in each iteration to approximate the gradient.",
    "Classification involves predicting discrete labels or categories for input data, while regression involves predicting continuous values. In classification, the labels are typically integers, whereas in regression, the labels are real-valued.",
    "Convolutional neural networks (CNNs) are specialized neural networks for processing data with a known grid-like topology, such as time-series or image data. They utilize convolution, a specialized kind of linear operation, to extract features from the input data efficiently.",
    "Transfer learning in deep learning involves leveraging knowledge learned from one task or domain to improve performance on another related task or domain. It can reduce the amount of labeled data needed for training and enhance generalization to new tasks.",
    "Regularization techniques in machine learning are designed to reduce generalization error (error on unseen data) without significantly increasing training error. They include adding constraints or penalties on model parameters to prevent overfitting and improve model performance on unseen data. Regularization is crucial for controlling the complexity of models and preventing overfitting."
]


# rouge function

def eval_rouge(refs,preds):
    """
    Score `preds` against `refs` with the module-level ROUGE `metric`.

    Stemming is enabled and aggregation is disabled, so the metric is asked
    for one score per prediction/reference pair.

    :return: a tuple (per-pair 'rouge1' scores, their mean).
    """
    scores = metric.compute(
        predictions=preds,
        references=refs,
        use_stemmer=True,
        use_aggregator=False,
    )
    rouge1_scores = scores['rouge1']
    return rouge1_scores, np.mean(rouge1_scores)


# ROUGE-1 of the base-model answers against the ChatGPT references.
print(eval_rouge(chatGPT_answers,answers))

In [None]:
# similarty 
from sentence_transformers import SentenceTransformer, util

# SentenceTransformer instances keyed by model name, so each model is loaded only once.
_SIMILARITY_MODELS = {}


def _get_similarity_model(model_name="all-MiniLM-L6-v2"):
    """Return the SentenceTransformer for `model_name`, loading it on first use only."""
    if model_name not in _SIMILARITY_MODELS:
        _SIMILARITY_MODELS[model_name] = SentenceTransformer(model_name)
    return _SIMILARITY_MODELS[model_name]


def cos_sim(refs,preds):
    """
    Compute the cosine similarity between all pairs of `refs` and `preds`.

    Both inputs are encoded with the "all-MiniLM-L6-v2" sentence-transformers
    model. The model is cached between calls; it was previously re-instantiated
    on every call, i.e. once per answer pair in the evaluation loops.

    :param refs: reference text(s) to encode.
    :param preds: predicted text(s) to encode.
    :return: the result of `util.cos_sim` on the two sets of embeddings.
    """
    encoder = _get_similarity_model()

    # Encode all sentences
    refs_embd = encoder.encode(refs)
    preds_embd = encoder.encode(preds)

    # Compute cosine similarity between all pairs
    return util.cos_sim(refs_embd, preds_embd)



In [None]:
# Evaluation of the instructed (base) model.

# Semantic similarity: one score per reference/answer pair, then the average.
sim = [
    float(cos_sim(reference, predicted)[0, 0])
    for reference, predicted in zip(chatGPT_answers, answers)
]
print(f" the average of the similarity =  {np.mean(sim)}")

# ROUGE-1: per-answer scores and their average.
rouges, ave = eval_rouge(chatGPT_answers, answers)

print(f" the average of the rouge scores is  =  {ave} \n the rouge scores = {rouges}")

## LoRA Fine-Tuning

[fine-tune notebook](https://www.kaggle.com/code/tarekyahia/fine-tune-flan-t5-question-answer-squad/notebook)

In [None]:
# load
# A fresh copy of the base model is loaded in bfloat16 and wrapped with the
# adapter stored in the checkpoint directory; is_trainable=False is passed
# because the model is only used for inference here.
# Note: this rebinds the global `tokenizer` that model_predict reads.

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

peft_model = PeftModel.from_pretrained(peft_model_base, 
                                       '/kaggle/input/peft-flan-t5/checkpoint-29982', 
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [None]:
# Ask the PEFT model the same 10 questions and keep its answers in order.
answers_fine = [RAG_Predict(peft_model, asked) for asked in questions]

# Pair every question with the PEFT model's answer in a table.
fine_answers = pd.DataFrame({'questions' : questions, 'answers' : answers_fine})

fine_answers

In [None]:
# ROUGE-1 scores (per answer, then their mean) of the PEFT model's answers against the ChatGPT references.
print(eval_rouge(chatGPT_answers,answers_fine))

In [None]:
# Evaluation of the fine-tuned (PEFT) model.

# Semantic similarity: one score per reference/answer pair, then the average.
sim = [
    float(cos_sim(reference, predicted)[0, 0])
    for reference, predicted in zip(chatGPT_answers, answers_fine)
]
print(f" the average of the similarity =  {np.mean(sim)}")

# ROUGE-1 must be computed on answers_fine. Scoring `answers` here would
# re-evaluate the base model and report its numbers as the fine-tuned ones.
rouges, ave = eval_rouge(chatGPT_answers, answers_fine)

print(f" the average of the rouge scores is  =  {ave} \n the rouge scores = {rouges}")

> The PEFT model and the zero-shot instructed model get the same results: the fine-tuning brought no improvement. I think that fine-tuning on a dataset closely related to this domain would bring some improvement.