<a href="https://colab.research.google.com/github/women-in-ai-ireland/June-2024-Group-002/blob/main/Test3_updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Changes:

- Replace the model with google/gemma-2b, as it is smaller
- Set the AutoTokenizer `model_max_length` to 256
- Remove the printing of document chunks (in case it is using up memory)
- Ingest-and-chunk function: set the path to "{root_dir}/data" (I think that's your path?)
- Reduce the text splitter chunk size to 500 with an overlap of 50 (just checking whether this helps with the memory issue)
- Reduce the generation limit (`max_length` in `gen_answer`) to 100
- Retrieve-relevant-chunks function: reduce the number of retrieved chunks to 3

In [None]:
# Install required libraries
!pip install pytorch torchvision torchaudio
!pip install transformers==4.30
!pip install langchain sentence_transformers huggingface-hub
!pip install pypdf
!pip install -U langchain-community
!pip install bitsandbytes
!pip install faiss-cpu langchain-openai tiktoken unstructured selenium newspaper3k textstat
!pip install accelerate

In [2]:
import os
import pickle

import torch
from google.colab import drive, userdata
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
# Mount Google Drive
# force_remount=True is passed so the mount is requested again on every run of this cell.
drive.mount('/content/gdrive', force_remount=True)
# Base project folder on the mounted drive. Later cells read PDFs from it and
# write/read the pickled FAISS store in it. Note the trailing slash.
root_dir = "/content/gdrive/MyDrive/WAI_project/"

In [4]:
# Load documents: parse every file matching *.pdf under root_dir with PyPDFLoader.
# NOTE(review): `documents` does not appear to be read by any later cell in this
# notebook (ingest_and_chunk_pdfs loads from the "data" sub-folder instead), so
# this cell may only be costing memory -- confirm before removing.
loader = DirectoryLoader(root_dir, loader_cls=PyPDFLoader, glob="*.pdf")
documents = loader.load()

In [5]:
# Set device: use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

Using device: cuda


In [6]:
# Set HF token: read it from the Colab secret store and export it through the
# process environment under the HF_TOKEN variable.
hf_token = userdata.get('HF_TOKEN')
os.environ.update({"HF_TOKEN": hf_token})

In [None]:
# Load model and tokenizer with quantization
# Single source of truth for the checkpoint so model and tokenizer cannot drift apart.
MODEL_ID = "google/gemma-2b"

# 4-bit weights (bitsandbytes) to keep the model within the Colab GPU memory budget.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, quantization_config=quantization_config)

# Quantization applies to model weights only, so the tokenizer is not given a
# quantization_config. model_max_length=256 caps sequences tokenized with truncation=True.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=256)



In [None]:
# Text splitter: chunk length is measured with len() on the text, with a target
# size of 500 and an overlap of 50 between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(length_function=len, chunk_overlap=50, chunk_size=500)
def ingest_and_chunk_pdfs(data_dir=None):
    """Load every PDF in the data folder and split the pages into text chunks.

    Args:
        data_dir: Folder to scan for ``*.pdf`` files. Defaults to the ``data``
            sub-folder of the module-level ``root_dir``.

    Returns:
        The list of chunk documents produced by the module-level ``text_splitter``.
        The list is empty (and a warning is emitted) when no PDF could be loaded.
    """
    import warnings

    if data_dir is None:
        # os.path.join avoids the "//data" that plain concatenation produces
        # when root_dir already ends with a slash.
        data_dir = os.path.join(root_dir, "data")

    loader = DirectoryLoader(data_dir, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()

    if not documents:
        # Surface the problem here instead of failing later while building the index.
        warnings.warn(f"No PDF documents were loaded from {data_dir!r}; check the path.")

    return text_splitter.split_documents(documents)

# Build the chunk list once when this cell runs; main() reads the module-level `texts`.
texts = ingest_and_chunk_pdfs()

In [9]:
def generate_embeddings(texts, batch_size=8):
    """Embed texts by mean-pooling the model's final hidden layer.

    Args:
        texts: A string or a list of strings to embed.
        batch_size: Number of texts sent through the model per forward pass.
            Batching keeps peak GPU memory bounded instead of encoding every
            chunk in one call.

    Returns:
        A float32 CPU tensor of shape ``(len(texts), hidden_size)``; one row per
        input text, in input order.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return torch.empty((0, model.config.hidden_size))

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        # The inputs must live on the same device as the model weights.
        encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

        with torch.no_grad():
            # A causal-LM output exposes logits, not `last_hidden_state`; the
            # per-layer hidden states have to be requested explicitly.
            outputs = model(**encoded, output_hidden_states=True)

        hidden = outputs.hidden_states[-1].float()
        # Average over real tokens only, so padding does not dilute the embedding.
        mask = encoded["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled = (hidden * mask).sum(dim=1) / token_counts
        # Move each batch off the GPU right away to free memory for the next one.
        pooled_batches.append(pooled.cpu())

    return torch.cat(pooled_batches, dim=0)

In [10]:
class _GemmaEmbeddings(Embeddings):
    """Adapter exposing generate_embeddings through LangChain's Embeddings interface.

    The vector store needs an embedding object so that it can embed the query
    text at search time with the same model that embedded the documents.
    """

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per input text."""
        return generate_embeddings(list(texts)).tolist()

    def embed_query(self, text):
        """Return the embedding (list of floats) of a single query string."""
        return generate_embeddings([text])[0].tolist()


def store_embeddings(docs, store_name, path):
    """Embed the documents, build a FAISS index and pickle it to disk.

    Args:
        docs: Documents exposing ``page_content`` and ``metadata``.
        store_name: Suffix of the output file ``faiss_<store_name>.pkl``.
        path: Directory in which the pickle file is written.
    """
    contents = [doc.page_content for doc in docs]
    vectors = generate_embeddings(contents).tolist()

    # FAISS.from_documents expects an Embeddings object as its second argument,
    # not a tensor. from_embeddings takes the precomputed (text, vector) pairs
    # together with the embedder that will be used for queries.
    vector_store = FAISS.from_embeddings(
        text_embeddings=list(zip(contents, vectors)),
        embedding=_GemmaEmbeddings(),
        metadatas=[doc.metadata for doc in docs],
    )

    with open(os.path.join(path, f"faiss_{store_name}.pkl"), "wb") as f:
        pickle.dump(vector_store, f)

def load_embeddings(store_name, path):
    """Unpickle and return the object stored in ``<path>/faiss_<store_name>.pkl``.

    NOTE: pickle.load executes code embedded in the file, so only load
    pickles that this notebook wrote itself.
    """
    pickle_path = os.path.join(path, f"faiss_{store_name}.pkl")
    with open(pickle_path, "rb") as handle:
        return pickle.load(handle)


In [11]:
def retrieve_relevant_chunks(question, vector_store, num_chunks=3):
    """Return the result of a similarity search for the question.

    Args:
        question: Query text passed to the vector store.
        vector_store: Object exposing ``similarity_search(query, k=...)``.
        num_chunks: Number of results requested from the store (default 3).
    """
    return vector_store.similarity_search(question, k=num_chunks)

In [12]:
def format_prompt(question, chunks):
    """Build the LLM prompt from the question and the retrieved chunks.

    Args:
        question: The user question, with or without a trailing "?".
        chunks: Objects exposing ``page_content``; their texts are joined with
            newlines and appended after the instructions as context.

    Returns:
        The prompt string: instruction, question, fallback instruction, a blank
        line, then the context.
    """
    # The template adds its own "?", so drop any the caller already supplied
    # to avoid producing "...??" in the prompt.
    question_text = question.strip().rstrip("?").rstrip()
    context = "\n".join(chunk.page_content for chunk in chunks)
    return (
        "Provide an answer to the following question using only the context provided: "
        f"{question_text}? "
        "If you cannot answer this question from the information provided, respond with "
        "'There is insufficient information to answer this question.'\n\n"
        f"{context}"
    )

def gen_answer(prompt, max_length=100, temperature=0.7):
    """Generate the model's answer to a prompt.

    Args:
        prompt: Full prompt text sent to the model.
        max_length: Maximum number of newly generated tokens.
        temperature: Sampling temperature. Values greater than 0 enable
            sampling; 0 or None selects greedy decoding.

    Returns:
        The generated continuation only (the prompt is not echoed back), with
        surrounding whitespace stripped.
    """
    # Keep the attention mask alongside input_ids so generate() receives both.
    encoded = tokenizer(prompt, return_tensors="pt").to(device)

    # temperature only takes effect when sampling is switched on.
    sampling = temperature is not None and temperature > 0
    generate_kwargs = {"max_new_tokens": max_length, "do_sample": sampling}
    if sampling:
        generate_kwargs["temperature"] = temperature

    response = model.generate(**encoded, **generate_kwargs)

    # generate() returns prompt tokens followed by new tokens; decode only the
    # new ones so the answer does not start with a copy of the prompt.
    prompt_len = encoded["input_ids"].shape[1]
    answer = tokenizer.decode(response[0][prompt_len:], skip_special_tokens=True)
    return answer.strip()

In [13]:
def main(question):
    """Answer a question from the ingested PDF chunks.

    Steps: write the FAISS store for the module-level ``texts`` under
    ``root_dir``, read it back, retrieve the chunks most similar to the
    question, build the prompt and return the generated answer.

    NOTE(review): the store is rebuilt on every call, which re-embeds all
    chunks for each question.
    """
    store_name = "embedding_store"

    # Persist the embeddings, then read the store back from disk
    store_embeddings(texts, store_name, root_dir)
    vector_store = load_embeddings(store_name, root_dir)

    # Retrieval -> prompt -> generation
    relevant_chunks = retrieve_relevant_chunks(question, vector_store)
    return gen_answer(format_prompt(question, relevant_chunks))

In [None]:
# Example usage: run the whole pipeline (store, retrieve, prompt, generate) for
# one question and print the returned answer text.
question = "What are the main causes of climate change?"
answer = main(question)
print(answer)