In [1]:
# Install required libraries
!pip install pytorch torchvision torchaudio
!pip install transformers==4.30
!pip install langchain sentence_transformers huggingface-hub
!pip install -U langchain-community
!pip install bitsandbytes
!pip install faiss-cpu langchain-openai tiktoken unstructured selenium newspaper3k textstat
!pip install accelerate

!pip install langchain-huggingface
!pip install sentence-transformers==2.2.2
!pip install InstructorEmbedding


Collecting pytorch
  Downloading pytorch-1.0.2.tar.gz (689 bytes)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.1->torchvision)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.1->torchvision)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.1->torchvision)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.3.1->torchvision)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.3.1->torchvision)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==

In [2]:
from google.colab import drive, userdata
import os
import pickle
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from torch.cuda.amp import GradScaler, autocast
from langchain.schema import Document

In [3]:
# Mount Google Drive
drive.mount('/content/gdrive', force_remount=True)
root_dir = "/content/gdrive/MyDrive/WAI_project/"

Mounted at /content/gdrive


In [4]:
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Using device: cuda


In [5]:
# Set HF token
hf_token = userdata.get('HF_TOKEN')

In [7]:
# load instructor embeddings model
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
                                                      model_kwargs={"device": "cuda"})



load INSTRUCTOR_Transformer
max_seq_length  512


In [8]:
# set local path to store embeddings ***replace with SingleStore, AWS or similar
embedding_store_path = f"{root_dir}/embedding_store"

In [9]:
# defines the parameters to use the recursive text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap  = 10,
    length_function = len,
)

In [10]:
from langchain.schema import Document

def ingest_and_chunk_text():
    text = """
    Climate change
    Adaptation
    Actions taken to adjust to natural events such as climate change, to reduce potential damage,
    limit the impacts, take advantage of opportunities, or cope with the consequences.
    Climate change
    A long-term change in the Earth's average temperature and weather patterns.
    Mitigation
    Action taken to reduce or eliminate the long-term risk to human life and property from
    natural hazards, such as building earthquake-proof buildings or making international
    agreements about carbon reduction targets.
    Orbital changes
    Changes in the pathway of the Earth around the Sun.
    Quaternary period
    The period of geological time from about 2.6 million years ago to the present. It is
    characterised by the appearance and development of humans and includes the Pleistocene
    and Holocene Epochs.
    """
    # Create a Document object with the provided text
    document = Document(page_content=text)

    # Split text into chunks
    texts = text_splitter.split_documents([document])

    return texts


In [11]:
# Function to store embeddings
def store_embeddings(docs, embeddings, store_name, path):
    """
    Stores embeddings in FAISS format and saves to a pickle file.

    Args:
    - docs (list): List of documents.
    - embeddings: Embedding model.
    - store_name (str): Name of the embedding store.
    - path (str): Path to the directory where embeddings will be stored.
    """
    vector_store = FAISS.from_documents(docs, embeddings)
    with open(os.path.join(path, f"faiss_{store_name}.pkl"), "wb") as f:
        pickle.dump(vector_store, f)

In [12]:
# Function to load embeddings
def load_embeddings(store_name, path):
    """
    Loads embeddings from a pickle file.

    Args:
    - store_name (str): Name of the embedding store.
    - path (str): Path to the directory where embeddings are stored.

    Returns:
    - vector_store: Loaded FAISS vector store.
    """
    with open(os.path.join(path, f"faiss_{store_name}.pkl"), "rb") as f:
        vector_store = pickle.load(f)
    return vector_store

In [13]:
# Function to initialize Hugging Face Instruct Embeddings
def initialize_huggingface_embeddings(model_name="hkunlp/instructor-xl", device="cuda"):
    """
    Initializes Hugging Face Instruct Embeddings model.

    Args:
    - model_name (str): Name of the Hugging Face model.
    - device (str): Device to run the model on.

    Returns:
    - embeddings: Initialized Hugging Face Instruct Embeddings model.
    """
    return HuggingFaceInstructEmbeddings(model_name=model_name, model_kwargs={"device": device})
instructor_embeddings = initialize_huggingface_embeddings()

load INSTRUCTOR_Transformer
max_seq_length  512


In [14]:
def retrieve_relevant_chunks(question, vector_store, num_chunks=1):
    docs = vector_store.similarity_search(question, k=num_chunks)
    return docs

In [15]:
def format_prompt(question, chunks):
    context = "\n".join([chunk.page_content for chunk in chunks])
    prompt = f"Provide an answer to the following question using only the context provided: {question}? " \
             f"If you cannot answer this question from the information provided, respond with 'There is insufficient information to answer this question.'\n\n{context}"
    return prompt

def initialize_model_and_tokenizer():
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", quantization_config=quantization_config)
    tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", quantization_config=quantization_config, model_max_length=256)
    return model, tokenizer
"""
def gen_answer(prompt, tokenizer, model, max_length=100, temperature=0.7):
    inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    response = model.generate(inputs, max_new_tokens=max_length, temperature=temperature)
    answer = tokenizer.decode(response[0], skip_special_tokens=True)
    return answer.strip()
    """
def gen_answer(prompt, tokenizer, model, max_length=100, temperature=0.7):
    inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        response = model.generate(inputs, max_new_tokens=max_length, temperature=temperature)
    answer = tokenizer.decode(response[0], skip_special_tokens=True)
    return answer.strip()

In [16]:
"""
def main(question):
    # Initialize embedding model
    instructor_embeddings = initialize_huggingface_embeddings()

    torch.cuda.empty_cache()

    # Read and chunk data
    texts = ingest_and_chunk_text()

    # Create and store embeddings
    store_embeddings(texts, instructor_embeddings, store_name='instructEmbeddings', path=embedding_store_path)

    # Load the vector store
    vector_store = load_embeddings(store_name='instructEmbeddings', path=embedding_store_path)

    # Load model and tokenizer with quantization
    model, tokenizer = initialize_model_and_tokenizer()

    # Retrieve relevant chunks based on the question
    relevant_chunks = retrieve_relevant_chunks(question, vector_store)

    # Format the prompt for the LLM
    prompt = format_prompt(question, relevant_chunks)

    # Generate the answer using the LLM
    answer = gen_answer(prompt, tokenizer, model)

    torch.cuda.empty_cache()
    return answer
"""
def main(question):
    torch.cuda.empty_cache()
    texts = ingest_and_chunk_text()
    store_embeddings(texts, instructor_embeddings, store_name='instructEmbeddings', path=embedding_store_path)
    vector_store = load_embeddings(store_name='instructEmbeddings', path=embedding_store_path)
    model, tokenizer = initialize_model_and_tokenizer()
    relevant_chunks = retrieve_relevant_chunks(question, vector_store)
    prompt = format_prompt(question, relevant_chunks)
    answer = gen_answer(prompt, tokenizer, model)
    torch.cuda.empty_cache()
    return answer

In [17]:
# Example usage
question = "What is climate change?"
answer = main(question)
print(answer)

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]



Provide an answer to the following question using only the context provided: What is climate change?? If you cannot answer this question from the information provided, respond with 'There is insufficient information to answer this question.'

Climate change
    Adaptation
    Actions taken to adjust to natural events such as climate change, to reduce potential damage,
    limit the impacts, take advantage of opportunities, or cope with the consequences.
    Climate change
    A long-term change in the Earth's average temperature and weather patterns.
    Mitigation
    Action taken to reduce or eliminate the long-term risk to human life and property from the
    potential effects of climate change.
    Adaptation
    The process of adjusting to the effects of climate change.
    Mitigation
    The process of reducing the causes of climate change.
    Adaptation
    The process of adjusting to the effects of climate change.
    Mitigation
    The process of reducing the causes of climat