<a href="https://colab.research.google.com/github/women-in-ai-ireland/June-2024-Group-002/blob/main/test_QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install pytorch torchvision torchaudio
!pip install transformers==4.30
!pip install langchain sentence_transformers huggingface-hub
!pip install pypdf
!pip install -U langchain-community
!pip install bitsandbytes
!pip install faiss-cpu langchain-openai tiktoken unstructured selenium newspaper3k textstat
!pip install accelerate

!pip install langchain-huggingface
!pip install sentence-transformers==2.2.2
!pip install InstructorEmbedding

In [2]:
from google.colab import drive, userdata
import os
import pickle
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFacePipeline
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [3]:
# Project folder inside Google Drive; the Drive itself is (re)mounted at /content/gdrive
root_dir = "/content/gdrive/MyDrive/WAI_project/"
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [4]:
# Select the compute device: use CUDA when torch reports it, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

Using device: cpu


In [5]:
# Read the Hugging Face access token stored under the 'HF_TOKEN' key in Colab userdata
hf_token = userdata.get("HF_TOKEN")

In [None]:
# Load the Instructor embeddings model on the detected `device`.
# Hard-coding "cuda" here fails on CPU-only runtimes.
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
                                                      model_kwargs={"device": device})

In [7]:
# set local path to store embeddings ***replace with SingleStore, AWS or similar
# os.path.join avoids the doubled separator that plain string formatting
# produces when root_dir already ends with "/".
embedding_store_path = os.path.join(root_dir, "embedding_store")

In [8]:
# Recursive character splitter: chunks of up to 500 characters (measured with
# len) with a 10-character overlap between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=10, length_function=len)

In [9]:
def ingest_and_chunk_pdfs():
    """
    Loads the PDFs matching ``*.pdf`` in ``root_dir`` and splits them into chunks.

    Returns:
    - The result of ``text_splitter.split_documents`` applied to the loaded documents.
    """
    pdf_loader = DirectoryLoader(f"{root_dir}", glob="*.pdf", loader_cls=PyPDFLoader)
    return text_splitter.split_documents(pdf_loader.load())

In [10]:
# Function to store embeddings
def store_embeddings(docs, embeddings, store_name, path):
    """
    Builds a FAISS vector store from the documents and pickles it to disk.

    The store is written to ``<path>/faiss_<store_name>.pkl``. The target
    directory is created first when it does not exist yet.

    Args:
    - docs (list): List of documents.
    - embeddings: Embedding model.
    - store_name (str): Name of the embedding store.
    - path (str): Path to the directory where embeddings will be stored.
    """
    # open(..., "wb") does not create missing parent directories, so make sure
    # the folder exists before spending time on computing the embeddings.
    os.makedirs(path, exist_ok=True)
    vector_store = FAISS.from_documents(docs, embeddings)
    with open(os.path.join(path, f"faiss_{store_name}.pkl"), "wb") as f:
        pickle.dump(vector_store, f)

In [11]:
# Function to load embeddings
def load_embeddings(store_name, path):
    """
    Reads a pickled vector store back from ``<path>/faiss_<store_name>.pkl``.

    Args:
    - store_name (str): Name of the embedding store.
    - path (str): Path to the directory where embeddings are stored.

    Returns:
    - vector_store: The object unpickled from the file.
    """
    pickle_path = os.path.join(path, f"faiss_{store_name}.pkl")
    # Unpickling can execute arbitrary code; only load files from a trusted source.
    with open(pickle_path, "rb") as handle:
        return pickle.load(handle)

In [12]:
# Function to initialize Hugging Face Instruct Embeddings
def initialize_huggingface_embeddings(model_name="hkunlp/instructor-xl", device="cuda"):
    """
    Initializes Hugging Face Instruct Embeddings model.

    When a CUDA device is requested but torch reports that CUDA is not
    available, the model is placed on the CPU instead of failing.

    Args:
    - model_name (str): Name of the Hugging Face model.
    - device (str): Device to run the model on.

    Returns:
    - embeddings: Initialized Hugging Face Instruct Embeddings model.
    """
    # The default is "cuda"; on a CPU-only runtime that would raise, so fall back.
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        device = "cpu"
    return HuggingFaceInstructEmbeddings(model_name=model_name, model_kwargs={"device": device})


In [13]:
def retrieve_relevant_chunks(question, vector_store, num_chunks=1):
    """
    Runs a similarity search for the question against the vector store.

    Args:
    - question (str): Query text passed to the search.
    - vector_store: Object exposing ``similarity_search(query, k=...)``.
    - num_chunks (int): Number of results requested (forwarded as ``k``).

    Returns:
    - Whatever ``vector_store.similarity_search`` returns for this query.
    """
    return vector_store.similarity_search(question, k=num_chunks)

In [14]:
def format_prompt(question, chunks):
    """
    Builds the LLM prompt from the question and the retrieved chunks.

    The prompt contains the instruction, the question, the fallback sentence
    the model should use when the context is insufficient, and finally the
    ``page_content`` of every chunk joined by newlines.

    Args:
    - question (str): The user's question, with or without a trailing "?".
    - chunks (list): Objects exposing a ``page_content`` string attribute.

    Returns:
    - str: The formatted prompt.
    """
    context = "\n".join(chunk.page_content for chunk in chunks)
    # The template adds the question mark itself; drop any the caller already
    # typed so the prompt does not end up with "??".
    question_text = question.strip().rstrip("?").rstrip()
    prompt = f"Provide an answer to the following question using only the context provided: {question_text}? " \
             f"If you cannot answer this question from the information provided, respond with 'There is insufficient information to answer this question.'\n\n{context}"
    return prompt

def gen_answer(prompt, max_length=100, temperature=0.7):
    """
    Generates an answer for the prompt with the module-level model and tokenizer.

    Args:
    - prompt (str): Full prompt text sent to the model.
    - max_length (int): Maximum number of newly generated tokens
      (forwarded to ``generate`` as ``max_new_tokens``).
    - temperature (float): Sampling temperature. A positive value enables
      sampling; ``0`` or ``None`` leaves the model's default decoding in place.

    Returns:
    - str: The generated continuation only (prompt tokens removed), stripped.
    """
    # Keep the whole encoding (input_ids + attention_mask) and place it on the
    # device the model actually lives on.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {"max_new_tokens": max_length}
    # temperature only has an effect when sampling is switched on.
    if temperature is not None and temperature > 0:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = temperature

    response = model.generate(**encoded, **generation_kwargs)

    # A causal LM returns prompt + continuation; decode only the new tokens so
    # the answer does not echo the prompt back.
    prompt_length = encoded["input_ids"].shape[-1]
    answer = tokenizer.decode(response[0][prompt_length:], skip_special_tokens=True)
    return answer.strip()

In [15]:
def main(question):
    """
    Answers a question from the project's PDFs using retrieval plus an LLM.

    Steps: initialise the embedding model, load and chunk the PDFs, build and
    persist the vector store, reload it, load the quantized Gemma model and
    its tokenizer, retrieve the most similar chunk(s), build the prompt and
    generate the answer.

    NOTE: the embeddings are recomputed and re-stored on every call.

    Args:
    - question (str): The question to answer.

    Returns:
    - str: The answer produced by ``gen_answer``.
    """
    # gen_answer() resolves `model` and `tokenizer` as module-level names, so
    # they have to be published as globals; as plain locals of this function
    # they would be invisible there and raise NameError.
    global model, tokenizer

    # initialize embedding model on the detected device (the helper defaults to "cuda")
    instructor_embeddings = initialize_huggingface_embeddings(device=device)

    # read and chunk data
    texts = ingest_and_chunk_pdfs()

    # create and store embeddings
    store_embeddings(texts,
                     instructor_embeddings,
                     store_name='instructEmbeddings',
                     path=embedding_store_path)

    # Load the vector store
    vector_store = load_embeddings(store_name='instructEmbeddings',
                                   path=embedding_store_path)

    # Load the model with 4-bit quantization. google/gemma-2b is a gated
    # repository, so the Hugging Face token is passed explicitly.
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    model = AutoModelForCausalLM.from_pretrained("google/gemma-2b",
                                                 quantization_config=quantization_config,
                                                 token=hf_token)
    # Quantization applies to model weights only; the tokenizer takes no such config.
    tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b",
                                              model_max_length=256,
                                              token=hf_token)

    # Retrieve relevant chunks based on the question
    relevant_chunks = retrieve_relevant_chunks(question, vector_store)

    # Format the prompt for the LLM
    prompt = format_prompt(question, relevant_chunks)

    # Generate the answer using the LLM
    answer = gen_answer(prompt)
    return answer


In [None]:
# Example usage: run the full pipeline for one sample question and show the answer
question = "What are the main causes of climate change?"
answer = main(question=question)
print(answer)