In [1]:
# Install required libraries
!pip install pytorch torchvision torchaudio
!pip install transformers==4.30
!pip install langchain sentence_transformers huggingface-hub
!pip install pypdf
!pip install -U langchain-community
!pip install bitsandbytes
!pip install faiss-cpu langchain-openai tiktoken unstructured selenium newspaper3k textstat
!pip install accelerate

Collecting pytorch
  Downloading pytorch-1.0.2.tar.gz (689 bytes)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.1->torchvision)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.1->torchvision)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.1->torchvision)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.3.1->torchvision)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.3.1->torchvision)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==

In [2]:
from google.colab import drive, userdata
import os
import pickle
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [3]:
# Mount Google Drive
drive.mount('/content/gdrive', force_remount=True)
root_dir = "/content/gdrive/MyDrive/WAI_project/"

Mounted at /content/gdrive


In [4]:
# Load documents
loader = DirectoryLoader(f"{root_dir}", glob="*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [5]:
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Using device: cuda


In [6]:
# Set HF token
hf_token = userdata.get('HF_TOKEN')
os.environ["HF_TOKEN"] = hf_token

In [7]:
# Load model and tokenizer with quantization
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained("google/gemma-1.1-7b-it", quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-1.1-7b-it", quantization_config=quantization_config)

config.json:   0%|          | 0.00/620 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [8]:
# Text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)
def ingest_and_chunk_pdfs():
    loader = DirectoryLoader(f"{root_dir}", glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()

    # Debug: Print the number of documents loaded
    print(f"Number of documents loaded: {len(documents)}")
    for i, doc in enumerate(documents[:3]):
        print(f"Document {i+1}:")
        print(doc.page_content[:500])

    texts = text_splitter.split_documents(documents)

    # Debug: Print the number of text chunks created
    print(f"Number of text chunks: {len(texts)}")
    for i, text in enumerate(texts[:3]):
        print(f"Chunk {i+1}:")
        print(text.page_content[:500])
    return texts

texts = ingest_and_chunk_pdfs()

Number of documents loaded: 645
Document 1:
Centre Number Candidate NumberWrite your name here
Surname Other names
Total MarksPaper Reference
Turn over     Pearson 
Edexcel GCSE
*P44728A0112* P44728A
©2015 Pearson Education Ltd.
4/1/1/1Instructions
• Use black  ink or ball-point pen.• Fill in the boxes  at the top of this page with your name,  
 centre number and candidate number.• Answer all questions.•  Answer the questions in the spaces provided  
– there may be more space than you need .
Information
• The total mark for this paper is 
Document 2:
2*P44728A0212*Answer ALL questions.
1 Study Section 1 (pages 2, 3 and 4) of the Resource Booklet and answer the following 
questions. 
 (a) (i) Define the term aquifer .
(2)
............................................................................................................................... ............................................................................................................................... .............

In [9]:
def generate_embeddings(texts):
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

In [10]:
def store_embeddings(docs, store_name, path):
    embeddings = generate_embeddings([doc.page_content for doc in docs])
    vector_store = FAISS.from_documents(docs, embeddings)
    with open(os.path.join(path, f"faiss_{store_name}.pkl"), "wb") as f:
        pickle.dump(vector_store, f)

def load_embeddings(store_name, path):
    with open(os.path.join(path, f"faiss_{store_name}.pkl"), "rb") as f:
        vector_store = pickle.load(f)
    return vector_store


In [11]:
def retrieve_relevant_chunks(question, vector_store, num_chunks=5):
    docs = vector_store.similarity_search(question, k=num_chunks)
    return docs

In [12]:
def format_prompt(question, chunks):
    context = "\n".join([chunk.page_content for chunk in chunks])
    prompt = f"Provide an answer to the following question using only the context provided: {question}? " \
             f"If you cannot answer this question from the information provided, respond with 'There is insufficient information to answer this question.'\n\n{context}"
    return prompt

def gen_answer(prompt, max_length=200, temperature=0.7):
    inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    response = model.generate(inputs, max_new_tokens=max_length, temperature=temperature)
    answer = tokenizer.decode(response[0], skip_special_tokens=True)
    return answer.strip()

In [13]:
def main(question):
    # Ensure embeddings are stored
    store_embeddings(texts, "embedding_store", root_dir)

    # Load the vector store
    vector_store = load_embeddings("embedding_store", root_dir)

    # Retrieve relevant chunks based on the question
    relevant_chunks = retrieve_relevant_chunks(question, vector_store)

    # Format the prompt for the LLM
    prompt = format_prompt(question, relevant_chunks)

    # Generate the answer using the LLM
    answer = gen_answer(prompt)
    return answer

In [14]:
# Example usage
question = "What are the main causes of climate change?"
answer = main(question)
print(answer)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.01 GiB. GPU 