### Imports

In [None]:
pip install -U faiss-gpu-cu12 --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
pip install datasets --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m460.8/480.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
from datasets import load_dataset
import faiss
from sentence_transformers import SentenceTransformer

### Embedding Generation

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


def create_faiss_index(documents, encoder_model):
  # Encode the documents into vectors
  document_vectors = encoder_model.encode(documents)

  # Create a FAISS index and add the vectors
  dimension = document_vectors.shape[1]  # Dimensionality of the embeddings
  faiss_index = faiss.IndexFlatL2(dimension)  # L2 distance (Euclidean distance)
  faiss_index.add(document_vectors)

  return faiss_index, documents


def retrieve_documents(query, faiss_index, documents, encoder_model, top_k=3):
  # Encode the query into a vector
  query_vector = encoder_model.encode([query])

  # Search the FAISS index for the top-k nearest neighbors
  _, indices = faiss_index.search(query_vector, top_k)

  # Retrieve the corresponding documents
  retrieved_docs = [documents[idx] for idx in indices[0]]

  return retrieved_docs

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Initiallizing the Model

In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
try:
  # if the code block ran before, this will be executed
  print(type(model))
except:
  # this is the first time, so we will go ahead and create the model
  !pip install accelerate bitsandbytes --quiet
  !pip install --upgrade transformers --quiet

  import torch
  from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
  from accelerate import infer_auto_device_map, init_empty_weights

  model_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
  config = AutoConfig.from_pretrained(model_name)

  # load model tokenizer
  tokenizer = AutoTokenizer.from_pretrained(
      model_name,
      trust_remote_code = True,
  )

  with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

  # For effective GPU usuage
  device_map = infer_auto_device_map(model, max_memory={0: "20GiB", "cpu": "20GiB", "disk": "0GiB"})

  # load model
  model = AutoModelForCausalLM.from_pretrained(
      model_name,
      trust_remote_code = True,
      device_map = device_map,
  )
  model.eval()



### Creating the RAG System

In [None]:
from datasets import Dataset
from datasets import load_dataset
import pandas as pd

# Load the dataset from CSV
#dataset = Dataset.from_csv("/content/Query_response_with_responses.csv")
dataset_papers = load_dataset("scientific_papers", "arxiv")
column_names = ['Unnamed: 0', 'article', 'abstract', 'section_names', 'query', 'response']


README.md:   0%|          | 0.00/8.27k [00:00<?, ?B/s]

scientific_papers.py:   0%|          | 0.00/5.35k [00:00<?, ?B/s]

The repository for scientific_papers contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/scientific_papers.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/3.62G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/880M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/203037 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6436 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6440 [00:00<?, ? examples/s]

In [None]:
dataset_papers_df = pd.DataFrame(dataset_papers['train'])
dataset_papers_df = dataset_papers_df.sample(n=50000)
articles = dataset_papers_df["article"].tolist()

# Applying
faiss_index, articles = create_faiss_index(articles, embedding_model)


In [None]:
faiss.write_index(faiss_index, "papers_index.faiss")

In [None]:
faiss_index = faiss.read_index("/content/papers_index.faiss")

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer  = WordNetLemmatizer()


def preprocess_text(text):
  if not isinstance(text, str):
    return ""

  text = text.lower()
  text = re.sub(r'[^a-z0-9\s]', '', text)
  words = text.split()
  words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
  return " ".join(words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def get_relevant_documents_for_question(question, faiss_index, documents, encoder_model, top_k=3):
  encoded_question = encoder_model.encode([question])
  relevant_docs = retrieve_documents(question, faiss_index, documents, encoder_model, top_k)
  context = ""
  for article in relevant_docs:
    context += " " + preprocess_text(article)
  return context

def get_answers_from_llm(question, tokenizer, model, articles):
  # Define user and system prompts
  user_prompt = f"""
      As the user, I am providing a question and extracted articles to guide your response. Your role is to answer the question using only the information provided in the articles, without adding any unrelated or speculative content. End your response after answering the question, and do not generate any additional text.

      Extracted articles:

      {articles}

      Question: {question}
  """

  system_prompt = """
      As the system, your role is to generate a concise and accurate response to the user's question by strictly referencing the extracted articles provided. Refrain from including any information not present in the articles, and do not generate any additional content beyond the answer. Your response must end immediately after answering the question.
  """

  # Combine prompts
  prompt = f"""
      {system_prompt}

      {user_prompt}

      Answer:
  """

  # Tokenize the prompt
  token_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")

  # Generate output using the model
  output_ids = model.generate(
      token_ids.to(model.device),
      max_new_tokens=200,
  )

  # Decode the model's output
  output = tokenizer.decode(output_ids[0][token_ids.size(1):])

  # Return the output
  return output


def rag_system(question, faiss_index, documents, encoder_model, tokenizer, model, top_k=3):
  relevant_documents = get_relevant_documents_for_question(question, faiss_index, documents, encoder_model, top_k=top_k)
  answer = get_answers_from_llm(question, tokenizer, model, relevant_documents)
  return answer

In [32]:
relevant_documents = get_relevant_documents_for_question("What is the best way to shave?", faiss_index, articles, embedding_model)

In [33]:
print(relevant_documents)

 different context number physical application involving hairy black hole emerged last year instance asymptotically anti de sitter ad black hole endowed scalar field related superconductors mean gravity gauge duality xcite additionally totally different area effort towards testing hair theorem astronomical observation recently developed xcite extensive literature hairy black hole broad application confirm physical relevance three dimensional gravity fruitful arena quantum gravity including baados teitelboim zanelli btz black hole xcite also generous providing exact black hole dressed scalar field first example xcite characterized scalar field regular everywhere three dimensional scalar hairy black hole reported emphasis microscopic computation entropy xcite see also xcite result algorithm determine stationary circularly symmetric solution xcite result represent small part considerable attention three dimensional scalar hairy black hole received recent year see instance xcite reference 

### RAG without Finetuning

In [None]:
import csv

questions = [
  "How do stripped-envelope supernovae contribute to the synthesis of heavy elements in the universe?",
  "What role does the Herschel Space Observatory play in advancing our understanding of star formation?",
  "What observational techniques are most effective for identifying exoplanets with conditions suitable for liquid water?",
  "How do particle size and shape affect flow dynamics in granular material systems?",
  "What impact do periodic potentials have on the efficiency of molecular transport systems?",
  "How do shifts in quasar spectral lines provide evidence for the expansion of the universe?",
  "What factors influence the stability of superconducting currents in magnetic field environments?"
]

# Open a CSV file to write the questions and answers
with open('rag_responses_without_finetuning.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Question", "Answer"])

    # Loop through each question
    for i, question in enumerate(questions):
        print(f"Q{i+1}: {question}\n")
        answer = rag_system(question, faiss_index, articles, embedding_model, tokenizer, model, top_k=3)
        print(f"A{i+1}: {answer}\n")
        print("-" * 100)

        # Write the question and answer to the CSV
        writer.writerow([question, answer])

print("Questions and answers have been saved to 'rag_responses_without_finetuning_qwen.csv'.")

Q1: How do stripped-envelope supernovae contribute to the synthesis of heavy elements in the universe?

A1:  Stripped-envelope supernovae (SEs) are highly energetic events that can release significant amounts of energy into the interstellar medium. These supernovae contribute to the synthesis of heavy elements in the universe through several key processes:

1. **Energy Release**: SEs release vast quantities of energy. This energy can be converted into kinetic energy of particles and thermal energy of the surrounding gas.

2. **Particle Production**: The intense radiation emitted by SEs causes particle collisions with the interstellar medium. These collisions can create new elements and compounds.

3. **Thermal Energy**: The energy released by SEs can also heat up the surrounding gas, which further contributes to the production of heavier elements.

4. **Stellar Evolution**: The energy released by SEs can fuel the star's nuclear reactions, leading to the formation of heavier elements du

In [None]:
del model

import gc
gc.collect

import torch
torch.cuda.empty_cache()

### RAG with finetuning

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-1.5B-Instruct")
model = PeftModel.from_pretrained(base_model, "yugdave/qwen-1.5b-finetuned-query-response")
model_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
# load model tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code = True,
)


# For effective GPU usuage
device_map = "auto"

# load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code = True,
    device_map = device_map,
)
model.eval()

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/8.75M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qw

In [None]:
import csv

questions = [
  "How do stripped-envelope supernovae contribute to the synthesis of heavy elements in the universe?",
  "What role does the Herschel Space Observatory play in advancing our understanding of star formation?",
  "What observational techniques are most effective for identifying exoplanets with conditions suitable for liquid water?",
  "How do particle size and shape affect flow dynamics in granular material systems?",
  "What impact do periodic potentials have on the efficiency of molecular transport systems?",
  "How do shifts in quasar spectral lines provide evidence for the expansion of the universe?",
  "What factors influence the stability of superconducting currents in magnetic field environments?"
]


# Open a CSV file to write the questions and answers
with open('rag_responses_with_finetuning_qwen.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Question", "Answer"])

    # Loop through each question
    for i, question in enumerate(questions):
        print(f"Q{i}: {question}\n")
        answer = rag_system(question, faiss_index, articles, embedding_model, tokenizer, model, top_k=3)
        print(f"A{i}: {answer}\n")
        print("-" * 100)

        # Write the question and answer to the CSV
        writer.writerow([question, answer])

print("Questions and answers have been saved to 'rag_responses_with_finetuning_qwen.csv'.")

Q0: How do stripped-envelope supernovae contribute to the synthesis of heavy elements in the universe?



The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


A0:  stripped-envelope supernovae contribute to the synthesis of heavy elements in the universe through their ability to expel heavy elements from their host stars. These supernovae release a large amount of energy and material, which are then expelled from the star's envelope into space. This process is known as stellar stripping, where the outer layers of the star are stripped away and expelled into space, leaving behind a core that retains most of the star's mass and composition. The stripped envelope supernovae can also release heavy elements as they are expelled from the star's core, further contributing to the synthesis of heavy elements in the universe. Additionally, the expelled material can collide with other objects in space, such as planets or asteroids, and release more heavy elements through nuclear fusion reactions. Overall, stripped-envelope supernovae play a critical role in the formation and evolution of heavy elements in the universe, and continue to be an active area

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from accelerate import infer_auto_device_map, init_empty_weights
from peft import PeftModel

model_name = "mistralai/Mistral-7B-Instruct-v0.3"

base_model = AutoModelForCausalLM.from_pretrained("unsloth/mistral-7b-v0.3-bnb-4bit")
model = PeftModel.from_pretrained(base_model, "yugdave/mistral-7b-finetuned-query-response")

# load model tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code = True,
)

# For effective GPU usuage
device_map = "auto"

# load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code = True,
    device_map = device_map,
)
model.eval()

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
     

In [None]:
import csv

questions = [
  "How do stripped-envelope supernovae contribute to the synthesis of heavy elements in the universe?",
  "What role does the Herschel Space Observatory play in advancing our understanding of star formation?",
  "What observational techniques are most effective for identifying exoplanets with conditions suitable for liquid water?",
  "How do particle size and shape affect flow dynamics in granular material systems?",
  "What impact do periodic potentials have on the efficiency of molecular transport systems?",
  "How do shifts in quasar spectral lines provide evidence for the expansion of the universe?",
  "What factors influence the stability of superconducting currents in magnetic field environments?"
]


# Open a CSV file to write the questions and answers
with open('rag_responses_with_finetuning_mistral.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Question", "Answer"])

    # Loop through each question
    for i, question in enumerate(questions):
        print(f"Q{i}: {question}\n")
        answer = rag_system(question, faiss_index, articles, embedding_model, tokenizer, model, top_k=3)
        print(f"A{i}: {answer}\n")
        print("-" * 100)

        # Write the question and answer to the CSV
        writer.writerow([question, answer])

print("Questions and answers have been saved to 'rag_responses_without_finetuning_mistral.csv'.")

### Evaluation

In [14]:
!pip install rouge-score --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [15]:
import nltk
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load the datasets
original_dataset = pd.read_csv('/content/Query_response_with_responses.csv')
rag_dataset_without_finetuning = pd.read_csv('/content/rag_responses_without_finetuning_qwen.csv')
rag_dataset_with_finetuning = pd.read_csv('/content/rag_responses_with_finetuning_qwen.csv')

# Ensure the questions are aligned
original_dataset.rename(columns={"query": "Question", "response": "Reference"}, inplace=True)
rag_dataset_without_finetuning.rename(columns={"Answer": "Generated_without_FT"}, inplace=True)
rag_dataset_with_finetuning.rename(columns={"Answer": "Generated_with_FT"}, inplace=True)

# Merge datasets based on Question
merged_data_initial = pd.merge(original_dataset, rag_dataset_without_finetuning, on="Question")
merged_data = pd.merge(merged_data_initial, rag_dataset_with_finetuning, on="Question")


In [20]:
merged_df = pd.read_csv("/content/Filtered_Merged_Dataset_with_Common_Questions.csv")

In [18]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [25]:

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Calculate metrics
results = []
bleu_scores, meteor_scores, rouge1_scores, rouge2_scores, rougeL_scores = [], [], [], [], []

for _, row in merged_data.iterrows():
    reference = row["Reference"]
    generated_without_FT = row["Generated_without_FT"]
    generated_with_FT = row["Generated_with_FT"]

    # Tokenize for BLEU
    reference_tokens = reference.split()
    generated_with_FT_tokens = generated_with_FT.split()

    # BLEU
    bleu = sentence_bleu([reference_tokens], generated_with_FT_tokens)
    bleu_scores.append(bleu)

    # METEOR (untokenized)
    meteor = meteor_score([reference_tokens], generated_with_FT_tokens)
    meteor_scores.append(meteor)

    # ROUGE (untokenized)
    rouge_scores = scorer.score(reference, generated_with_FT)
    rouge1 = rouge_scores['rouge1'].fmeasure
    rouge2 = rouge_scores['rouge2'].fmeasure
    rougeL = rouge_scores['rougeL'].fmeasure

    rouge1_scores.append(rouge1)
    rouge2_scores.append(rouge2)
    rougeL_scores.append(rougeL)

    results.append({
        "Question": row["Question"],
        "BLEU": bleu,
        "METEOR": meteor,
        "ROUGE-1": rouge1,
        "ROUGE-2": rouge2,
        "ROUGE-L": rougeL
    })

# Calculate averages
average_metrics = {
    "Average BLEU": sum(bleu_scores) / len(bleu_scores),
    "Average METEOR": sum(meteor_scores) / len(meteor_scores),
    "Average ROUGE-1": sum(rouge1_scores) / len(rouge1_scores),
    "Average ROUGE-2": sum(rouge2_scores) / len(rouge2_scores),
    "Average ROUGE-L": sum(rougeL_scores) / len(rougeL_scores)
}

print(average_metrics)

{'Average BLEU': 0.007747745247321477, 'Average METEOR': 0.23963337182483224, 'Average ROUGE-1': 0.1863767753017096, 'Average ROUGE-2': 0.0467673992471481, 'Average ROUGE-L': 0.1347719882264537}


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [28]:
from bert_score import score as bert_score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Initialize lists to store metrics
bleu_scores, meteor_scores, rouge1_scores, rouge2_scores, rougeL_scores, bert_scores = [], [], [], [], [], []

# Iterate over the dataset rows
results = []
for _, row in merged_df.iterrows():
    reference = row["Reference"]
    generated_with_FT = row["Generated_with_FT"]

    # Tokenize for BLEU
    reference_tokens = reference.split()
    generated_with_FT_tokens = generated_with_FT.split()

    # BLEU
    bleu = sentence_bleu([reference_tokens], generated_with_FT_tokens)
    bleu_scores.append(bleu)

    # METEOR (untokenized)
    # meteor = meteor_score([reference], generated_with_FT)
    # meteor_scores.append(meteor)

    # ROUGE (untokenized)
    rouge_scores = scorer.score(reference, generated_with_FT)
    rouge1 = rouge_scores['rouge1'].fmeasure
    rouge2 = rouge_scores['rouge2'].fmeasure
    rougeL = rouge_scores['rougeL'].fmeasure
    rouge1_scores.append(rouge1)
    rouge2_scores.append(rouge2)
    rougeL_scores.append(rougeL)

    # BERTScore
    P, R, F1 = bert_score([generated_with_FT], [reference], lang='en', rescale_with_baseline=True)
    bert_f1 = F1.mean().item()
    bert_scores.append(bert_f1)

    # Append results
    results.append({
        "Question": row["Question"],
        "BLEU": bleu,
        "METEOR": meteor,
        "ROUGE-1": rouge1,
        "ROUGE-2": rouge2,
        "ROUGE-L": rougeL,
        "BERTScore": bert_f1
    })

# Calculate averages
average_metrics = {
    "Average BLEU": sum(bleu_scores) / len(bleu_scores),
    #"Average METEOR": sum(meteor_scores) / len(meteor_scores),
    "Average ROUGE-1": sum(rouge1_scores) / len(rouge1_scores),
    "Average ROUGE-2": sum(rouge2_scores) / len(rouge2_scores),
    "Average ROUGE-L": sum(rougeL_scores) / len(rougeL_scores),
    "Average BERTScore": sum(bert_scores) / len(bert_scores)
}

# Display average metrics
print(average_metrics)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use 

{'Average BLEU': 0.007747745247321477, 'Average ROUGE-1': 0.1863767753017096, 'Average ROUGE-2': 0.0467673992471481, 'Average ROUGE-L': 0.1347719882264537, 'Average BERTScore': 0.18182033699538028}


### User Interface (for Demo)

In [None]:
answer = rag_system("What are quasars?", faiss_index, articles, embedding_model, tokenizer, model, top_k=3)

In [None]:
print(answer)

 Quasars are extremely distant galaxies that emit intense beams of light, primarily in the ultraviolet (UV) and optical regions of the electromagnetic spectrum. They are characterized by their vast distances from Earth and the intense emission of light they release. Quasars serve as powerful tools for astronomers to study the universe and can provide insights into its history, composition, and evolution. Their light can be detected through telescopes and satellite observatories, offering valuable data for astrophysical research. Quasars are believed to have existed since the Big Bang and are considered some of the oldest objects in the universe. The study of quasars has contributed significantly to our understanding of cosmology and the physics of the early universe.<|im_end|>


In [None]:
!pip install gradio --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.2/57.2 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.2/320.2 kB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m142.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.2/73.2 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.8/63.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.2/168.2 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import gradio as gd
from typing import List

# Define Gardio interface
def rag_interface(query: str):
    try:
        # Call RAG system
        answer = rag_system(query, faiss_index, articles, embedding_model, tokenizer, model, top_k=3)
        return query, answer
    except Exception as e:
        return query, f"Error: {str(e)}"

# Build UI
app = gd.Interface(
    fn=rag_interface,
    inputs=gd.Textbox(label="Enter your query"),
    outputs=[
        gd.Textbox(label="Query"),
        gd.Textbox(label="Generated Response",lines=10),
    ],
    title="Research Assistant RAG Demonstration",
)

# Launch the app
app.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9d5750381d35d20801.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


