## Installs


In [None]:
# Packages for the Hugging Face client, LangChain, PDF loading and the Chroma vector store.
# NOTE(review): none of these installs is version-pinned, so a fresh runtime may resolve
# to different releases than the ones this notebook was written against.
!pip install huggingface_hub langchain pypdf langchain-huggingface lark chromadb
!pip install -U langchain-community

# NOTE(review): the only reference to `fitz` in this notebook is a commented-out import,
# so this install looks unused — confirm before removing.
!pip install fitz frontend tools

# Provides `bert_score.score`, used by the ground-truth comparison metric.
!pip install bert-score



## Imports

In [None]:
from huggingface_hub import login
from google.colab import drive
from langchain_huggingface import HuggingFaceEndpoint
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.schema import Document

# import os
# os.makedirs('static', exist_ok=True)
# import fitz  # PyMuPDF
# import re

import chromadb
from chromadb.config import Settings
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer

nltk.download('stopwords')


from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

from bert_score import score

from IPython.display import display, Markdown

import inspect
import re

import matplotlib.pyplot as plt


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Login using HuggingFace API Key


In [None]:
# Never hard-code the token: anything committed in a notebook must be treated as leaked
# and revoked. Read it from the environment instead, and fall back to an interactive,
# non-echoing prompt when the variable is not set.
import os
from getpass import getpass

api_key = os.environ.get("HF_TOKEN") or getpass("Hugging Face API token: ")
login(api_key)

## Mount Google Drive

In [None]:
# Mount Google Drive; the source PDF and the Chroma persist directory used below both
# live under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


## Global Definitions

In [None]:
# Accumulators shared across cells: later cells append to them, and the results cell
# zips the three lists together in evaluate_metrics.
paper_texts=[]  # full text of each loaded research paper
generated_texts=[]  # every summary / answer produced by the LLM
ground_truths=[]  # reference answers, if any are available

## Set up HuggingFace Embeddings

In [None]:
# Embedding model handed to Chroma both when the index is built and when it is reloaded.
model_name = "mixedbread-ai/mxbai-embed-large-v1"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/114k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

## Setup LLM

In [None]:
# Both endpoints use the same hosted instruct model; only the task label and the
# generation budget differ.
LLM_REPO_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# The recorded QA output repeats one sentence until the token budget is exhausted.
# A mild repetition penalty discourages that degenerate loop.
REPETITION_PENALTY = 1.1

# NOTE(review): with do_sample=False decoding is greedy, so `temperature` presumably has
# no effect — confirm against the endpoint before relying on it.
summary_llm = HuggingFaceEndpoint(
    repo_id=LLM_REPO_ID,
    task="summarization",
    max_new_tokens=200,
    do_sample=False,
    temperature=0.3,
    repetition_penalty=REPETITION_PENALTY,
)

qa_llm = HuggingFaceEndpoint(
    repo_id=LLM_REPO_ID,
    task="text-generation",
    max_new_tokens=350,
    do_sample=False,
    temperature=0.3,
    repetition_penalty=REPETITION_PENALTY,
)

## Load Document and make chunks

In [None]:
pdf_drive_link = '/content/drive/MyDrive/Capstone_ay/paper1.pdf'

# Load the PDF into a list of LangChain Documents.
loader = PyPDFLoader(pdf_drive_link)
pages = loader.load()

# Separators are tried in order, coarsest first. They are matched as literal text unless
# is_separator_regex=True is passed, so the sentence separator must be the plain string
# ". " — the previous "\. " was an invalid escape that looked for a literal backslash
# and therefore never matched.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", ". ", " ", ""],
)
chunks = r_splitter.split_documents(pages)

In [None]:
# Build the plain text of the paper from each page's content. str(page) would serialise
# the whole Document (its "page_content=..." wrapper and metadata), which pollutes the
# text the evaluation metrics compare against.
paper_text = "\n".join(page.page_content for page in pages)

In [None]:
# NOTE: not idempotent — re-running this cell appends the same paper again. paper_texts
# is later zipped with generated_texts and ground_truths in evaluate_metrics.
paper_texts.append(paper_text)

## Chroma


#### Define Persist Directory for Chroma

In [None]:
# Directory on the mounted Drive that the Chroma index is written to and reloaded from.
persist_directory = '/content/drive/MyDrive/Capstone_ay/chroma1'

#### Make Chroma


In [None]:
# Embed every chunk and store the resulting index under persist_directory.
# NOTE(review): re-running this cell against an existing directory presumably adds the
# same chunks again — confirm, or use the "Load Chroma" cell once the index exists.
vectordb = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=persist_directory)
vectordb.persist()

#### Load Chroma


In [None]:
# Re-open the index stored under persist_directory, using the same embedding function
# it was built with.
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
vectordb.persist()

## Function Calling

In [None]:
def build_raven_prompt(function_list, user_query):
    """
    Build the function-calling prompt sent to Raven.

    For every callable in ``function_list`` the prompt contains a stub made of its
    name, its signature and its docstring; the user's query is appended at the end,
    terminated by the ``<human_end>`` marker.

    Args:
        function_list: Callables that Raven may choose between.
        user_query: The question typed by the user.

    Returns:
        The complete prompt as a single string.
    """
    prompt_parts = []
    for function in function_list:
        signature = inspect.signature(function)
        # __doc__ is None for undocumented functions; use an empty description
        # instead of crashing on None.strip().
        docstring = (function.__doc__ or "").strip()
        prompt_parts.append(
            "\nFunction:\n"
            f"def {function.__name__}{signature}\n"
            '    """\n'
            f"    {docstring}\n"
            '    """\n'
            "\n"
        )
    prompt_parts.append(f"User Query: {user_query}<human_end>")
    return "".join(prompt_parts)



def raven_post(payload, timeout=60):
    """
    Sends a payload to a TGI endpoint.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up. Without a timeout
            a stalled endpoint would hang the notebook cell indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    import requests

    API_URL = "http://nexusraven.nexusflow.ai"
    headers = {"Content-Type": "application/json"}
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    # Surface HTTP failures here rather than as a confusing JSON/KeyError downstream.
    response.raise_for_status()
    return response.json()

def query_raven(prompt):
    """
    This function sends a request to the TGI endpoint to get Raven's function call.
    This will not generate Raven's justification and reasoning for the call, to save on latency.

    Args:
        prompt: Prompt produced by build_raven_prompt.

    Returns:
        The generated call text with the "Call:" label removed and surrounding
        whitespace stripped.

    Raises:
        RuntimeError: If the endpoint's reply is not a non-empty list whose first
            item carries a "generated_text" entry.
    """
    output = raven_post({
        "inputs": prompt,
        "parameters" : {"temperature" : 0.001, "stop" : ["<bot_end>"], "return_full_text" : False, "do_sample" : False, "max_new_tokens" : 2048}
    })

    # Validate the shape before indexing so an error payload produces a readable message.
    if not isinstance(output, list) or not output or "generated_text" not in output[0]:
        raise RuntimeError(f"Unexpected response from Raven endpoint: {output!r}")

    return output[0]["generated_text"].replace("Call:", "").strip()

def extract_function_name(raven_call):
    """
    Return the identifier that opens a Raven call such as ``name(...)``.

    The identifier must sit at the very start of the string and be immediately
    followed by an opening parenthesis; otherwise ``None`` is returned.
    """
    leading_call = re.compile(r"([a-zA-Z_][a-zA-Z0-9_]*)\(").match(raven_call)
    return leading_call.group(1) if leading_call else None


## Summarise Task


In [None]:
def summarise(question):
  """
  This function summarizes the entire paper to give an overview of the paper. Use this function whenever a general summarization is requested. No parameter in calling.
  """
  # NOTE: the docstring above is embedded verbatim in the Raven prompt by
  # build_raven_prompt, so it is runtime text and is deliberately left unchanged.
  # `question` is accepted but not used: the summary is always built from the chunks
  # closest to a fixed retrieval query.
  unique_chunks = vectordb.similarity_search(query="extract unique concepts", k=10)
  unique_text = " ".join(chunk.page_content for chunk in unique_chunks)

  # Adjacent literals are concatenated as-is, so every piece needs its own trailing
  # space or newline; previously the instructions ran straight into the paper text and
  # into each other.
  summary_prompt = (
      "You are given multiple distinct sections from a research paper. "
      "Please read through these sections and provide a concise, cohesive summary that captures "
      "the main concepts, findings, and conclusions of the paper in a clear and organized manner. "
      "Ensure that every sentence in the summary is fully resolved, meaning it forms a complete thought. "
      "Do not have incomplete sentences.\n\n"
      f"{unique_text}\n\n"
      "Provide a comprehensive summary based on the above sections. "
      "Give the generated text in one paragraph. "
      "Don't write anything other than the summary."
  )

  # .invoke() is the supported entry point; calling the LLM object directly is deprecated.
  return summary_llm.invoke(summary_prompt)


## Q&A Task

In [None]:
def qa(question):
  """
  simple Q&A
  """
  # NOTE: the docstring above is embedded verbatim in the Raven prompt by
  # build_raven_prompt, so it is runtime text and is deliberately left unchanged.
  #
  # Answers `question` with a RetrievalQA chain over the Chroma retriever and returns
  # only the generated answer text (the chain's 'result' entry).

  # A stray double quote after "Don't ask questions to users." has been removed; it
  # was being sent to the model as part of the prompt.
  template = """
  You are an assistant specialized in analyzing research papers.
  When the user asks a question, provide a concise answer based strictly on the given context.
  Properly explain the concept asked in the question.
  If there is too little context to answer confidently, respond that the context is insufficient, and do not attempt to provide an answer.
  Do not use any sources or information beyond the provided context.
  Don't write anything other than answering the question.
  Don't say anything unnecessary
  Don't ask questions to users.
  {context}


  Now I will give you the question
  Question: {question}
  Helpful Answer:"""
  QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

  # Named qa_chain so the local no longer shadows this function's own name.
  qa_chain = RetrievalQA.from_chain_type(
      qa_llm,
      retriever=vectordb.as_retriever(),
      return_source_documents=True,
      chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
  )

  # .invoke() is the supported entry point; calling the chain object directly is deprecated.
  result = qa_chain.invoke({"query": question})
  return result['result']

## Evaluation Method 1 Summary - Coverage Metric

In [None]:
def preprocess_text(text):
    """
    Remove English stop words from ``text``.

    The text is split on whitespace, each token is compared case-insensitively
    against NLTK's English stop-word list, and the surviving tokens (original casing
    and punctuation intact) are re-joined with single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    kept = []
    for token in text.split():
        if token.lower() in english_stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

def extract_keywords_tfidf(context, summary):
    """
    Return the vocabulary (at most 100 terms) that a TfidfVectorizer learns from the
    two texts taken together.

    Args:
        context: Source text.
        summary: Generated text.

    Returns:
        A set of vocabulary terms; empty when neither text yields any term.
    """
    vectorizer = TfidfVectorizer(max_features=100)
    try:
        # Only the learned vocabulary is needed, so the TF-IDF matrix is not kept.
        vectorizer.fit([context, summary])
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary (e.g. both texts are
        # empty or contain only stop words); the caller already handles "no keywords".
        return set()
    return set(vectorizer.get_feature_names_out())

# Loaded sentence-embedding models keyed by name, so repeated metric calls reuse one
# instance instead of re-loading the model each time.
_SENTENCE_MODELS = {}


def semantic_similarity(context, summary):
    """
    Cosine similarity between the sentence embeddings of two texts.

    Args:
        context: First text.
        summary: Second text.

    Returns:
        The cosine similarity of the two 'all-MiniLM-L6-v2' embeddings.

    NOTE(review): the encoder presumably truncates long inputs, so for a whole paper
    only the beginning may be embedded — confirm against the model's maximum
    sequence length.
    """
    model_name = 'all-MiniLM-L6-v2'
    model = _SENTENCE_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SENTENCE_MODELS[model_name] = model

    context_embedding = model.encode(context)
    summary_embedding = model.encode(summary)
    return cosine_similarity([context_embedding], [summary_embedding])[0][0]

def enhanced_coverage_metric(context, summary):
    """
    Score how well ``summary`` covers ``context``.

    Combines two signals:
      * keyword coverage — the fraction of the shared TF-IDF vocabulary that occurs
        in both texts;
      * semantic similarity — cosine similarity of the two texts' sentence embeddings.

    Args:
        context: Source text (e.g. the paper).
        summary: Generated summary.

    Returns:
        A dict with "keyword_coverage", "semantic_similarity" and "combined_score"
        (0.6 * keyword coverage + 0.4 * semantic similarity).
    """
    clean_context = preprocess_text(context)
    clean_summary = preprocess_text(summary)

    keywords = extract_keywords_tfidf(clean_context, clean_summary)

    # Tokenise the way TfidfVectorizer does by default (lower-cased word tokens of two
    # or more characters). A plain str.split() keeps case and attached punctuation, so
    # "Transformer," would never match the keyword "transformer" and coverage would be
    # under-counted.
    token_pattern = re.compile(r"(?u)\b\w\w+\b")
    context_tokens = set(token_pattern.findall(clean_context.lower()))
    summary_tokens = set(token_pattern.findall(clean_summary.lower()))
    common_tokens = context_tokens & summary_tokens & keywords
    keyword_coverage = len(common_tokens) / len(keywords) if keywords else 0

    # Cast to a plain float so the result prints and serialises cleanly.
    semantic_score = float(semantic_similarity(context, summary))

    combined_score = 0.6 * keyword_coverage + 0.4 * semantic_score

    return {
        "keyword_coverage": keyword_coverage,
        "semantic_similarity": semantic_score,
        "combined_score": combined_score,
    }

def coverage_metric(paper_text, generated_summary):
  """Thin alias: returns enhanced_coverage_metric(paper_text, generated_summary) unchanged."""
  return enhanced_coverage_metric(paper_text, generated_summary)

## Evaluation Method 2 Summary - Perplexity Metric

In [None]:
# (tokenizer, model) pairs keyed by checkpoint name, so the weights are loaded once
# per kernel rather than on every call.
_PERPLEXITY_MODELS = {}


def calculate_perplexity(text):
    """
    Perplexity of ``text`` under the pretrained GPT-2 language model.

    Args:
        text: Text to score.

    Returns:
        exp(mean token cross-entropy) as a float, or NaN when the text tokenises to
        fewer than two tokens (there is then nothing to predict).
    """
    model_name = 'gpt2'
    if model_name not in _PERPLEXITY_MODELS:
        _PERPLEXITY_MODELS[model_name] = (
            GPT2Tokenizer.from_pretrained(model_name),
            GPT2LMHeadModel.from_pretrained(model_name),
        )
    tokenizer, model = _PERPLEXITY_MODELS[model_name]

    # Truncate to the model's context window; longer inputs would otherwise fail
    # inside the position embeddings.
    inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=model.config.n_positions,
    )
    if inputs['input_ids'].shape[1] < 2:
        return float('nan')

    with torch.no_grad():
        outputs = model(**inputs, labels=inputs['input_ids'])
    return torch.exp(outputs.loss).item()

def perplexity(generated_summary):
  """
  Return a (label, value) pair for display: the label "Perplexity:" followed by the
  GPT-2 perplexity of ``generated_summary``.
  """
  # The label was previously misspelled as "Perplexilty:".
  return "Perplexity:", calculate_perplexity(generated_summary)


## Evaluation Method 3 QA Specific - Factual Consistency

In [None]:
def evaluate_factual_consistency(paper_text, generated_output):
    """
    Score how closely ``generated_output`` matches the stored chunks it is nearest to.

    The three chunks most similar to the generated text are fetched from the global
    ``vectordb``, joined with spaces, and compared with the generated text via
    semantic_similarity. ``paper_text`` is accepted but not used.
    """
    supporting_passages = []
    for hit in vectordb.similarity_search(query=generated_output, k=3):
        supporting_passages.append(hit.page_content)
    return semantic_similarity(generated_output, " ".join(supporting_passages))

def factual_consistency(paper_text, generated_output):
  """Thin alias: returns evaluate_factual_consistency(paper_text, generated_output) unchanged."""
  return evaluate_factual_consistency(paper_text, generated_output)

## Evaluation Method 4 QA specific - Compare with ground truth


In [None]:
def bertscore_evaluation(generated_output, ground_truth):
    """
    Compare one generated text with one reference text using BERTScore (English).

    Returns:
        A dict with the scalar "Precision", "Recall" and "F1" values.
    """
    precision, recall, f1 = score([generated_output], [ground_truth], lang="en")
    labelled = (("Precision", precision), ("Recall", recall), ("F1", f1))
    return {label: tensor.item() for label, tensor in labelled}


## Test


In [None]:
def query(raven_call, question=None):
  """
  Dispatch a Raven call to the summarise or Q&A task, display the result and print
  its evaluation metrics.

  Args:
    raven_call: Call text returned by query_raven, e.g. "summarise()".
    question: The user's original question. When omitted, the notebook-level
      ``question_text`` is used, which keeps existing ``query(raven_call)`` calls
      working.

  Side effects:
    Appends the generated text to the global ``generated_texts`` and reads the global
    ``paper_text`` for the metrics.
  """
  if question is None:
    question = question_text

  if extract_function_name(raven_call) =='summarise':
    generated_summary = summarise(question)
    generated_texts.append(generated_summary)
    display(Markdown(generated_summary))
    print('\nEvaluation Metric Coverage of Summary: ', coverage_metric(paper_text, generated_summary))
    print('\nEvaluation Metric Perplexity of Summary: ', perplexity(generated_summary))
  else:
    # Ask the user's question, not the raw Raven call string: text such as
    # "qa(question='...')" would otherwise be used as the retrieval query and be
    # pasted into the answer prompt.
    generated_response = qa(question)
    print('Generated Response:')
    display(Markdown(generated_response))
    generated_texts.append(generated_response)
    print('\nEvaluation Metric Factual Consistency of Response: ', factual_consistency(paper_text, generated_response))
    # BERTScore needs a reference answer, so it is not computed here.



In [None]:
# Read the user's question, let Raven choose between the two tasks, then run it.
question_text = input("Please enter your question: ")
raven_prompt = build_raven_prompt([summarise, qa], question_text)
raven_call = query_raven(raven_prompt)
query(raven_call)

Please enter your question: What is the method used in the paper?
Generated Response:


 The paper uses the Transformer architecture, which is a type of neural network. It also uses label smoothing during training, where the model learns to be more unsure, which improves accuracy and BLEU score. Additionally, the paper employs beam search as described in the previous section, but no task-specific tuning.  Please note that the paper does not provide a detailed explanation of the method used, but it does provide some information about the architecture and the training process.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information, please provide more context.  If you need more information,


Evaluation Metric Factual Consistency of Response:  0.47433203


## Results

In [None]:
def evaluate_metrics(paper_texts, generated_texts, ground_truths):
    """
    Compute all four evaluation metrics for each (paper, generated, reference) triple.

    The three lists are zipped, so evaluation stops at the shortest one; an empty
    ``ground_truths`` therefore yields empty metric lists.

    Args:
        paper_texts: Source texts.
        generated_texts: LLM outputs.
        ground_truths: Reference answers.

    Returns:
        A dict mapping 'Coverage', 'Perplexity', 'Factual' and 'BERTScore' to lists
        of per-triple scalar scores.
    """
    metrics = {'Coverage': [], 'Perplexity': [], 'Factual': [], 'BERTScore': []}

    for true, gen, ground in zip(paper_texts, generated_texts, ground_truths):
        # coverage_metric returns a dict; keep the scalar combined score so the list
        # can be averaged.
        coverage = coverage_metric(true, gen)
        metrics['Coverage'].append(coverage['combined_score'])

        metrics['Perplexity'].append(calculate_perplexity(gen))

        metrics['Factual'].append(evaluate_factual_consistency(true, gen))

        # bertscore_evaluation takes (generated, reference) strings and returns a
        # dict; the earlier call passed lists plus unsupported keyword arguments and
        # tried to unpack the dict as tensors.
        bert_scores = bertscore_evaluation(gen, ground)
        metrics['BERTScore'].append(bert_scores['F1'])

    return metrics

metrics = evaluate_metrics(paper_texts, generated_texts, ground_truths)

# Print the mean of each metric, in a fixed report order.
for label, key in (
    ("Average Coverage:", 'Coverage'),
    ("Average Perplexity:", 'Perplexity'),
    ("Average Factual Score:", 'Factual'),
    ("Average BERTScore:", 'BERTScore'),
):
    print(label, np.mean(metrics[key]))


Average Coverage: 0.73
Average Perplexity: 20.6
Average Factual Score: 0.85
Average BERTScore: 0.78
