### Dependencies and minsearch

In [1]:
# Install dependencies
!pip install -U transformers accelerate bitsandbytes sentencepiece



In [2]:
# Download minsearch.py; remove any stale copy first so wget does not save the new file as minsearch.py.1
!rm -f minsearch.py
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-07-09 08:34:38--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-07-09 08:34:38 (57.1 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [3]:
import requests
import minsearch

# URL of the JSON file containing course documents
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

# Fetch the JSON data; the timeout keeps the cell from hanging forever on a stalled connection
docs_response = requests.get(docs_url, timeout=30)

# Fail loudly on an HTTP error instead of trying to parse an error page as JSON
docs_response.raise_for_status()

# Parse the response as JSON: a list of entries, each with a 'course' name and a 'documents' list
documents_raw = docs_response.json()

# Flatten the per-course structure into a single list of documents,
# tagging every document with the name of the course it belongs to
documents = []
for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

# Create an index for searching, specifying text and keyword fields
index = minsearch.Index(
    text_fields=["question", "text", "section"],  # Fields to search through for text content
    keyword_fields=["course"]  # Fields to search through for keywords
)

# Fit the index with the processed documents (kept as the last expression so the cell displays it)
index.fit(documents)


<minsearch.Index at 0x797ccd105db0>

### For reference: Standard pipeline functions using minsearch and OpenAI

In [4]:
def search(query):
    """Look up FAQ entries matching `query` in the module-level `index`.

    The search is filtered to the 'data-engineering-zoomcamp' course, boosts
    the 'question' field by 3.0 and the 'section' field by 0.5, and asks the
    index for 5 results.

    Returns:
        Whatever `index.search` returns for those arguments.
    """
    return index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
    )

def build_prompt(query, search_results):
    """Build the LLM prompt from a question and the retrieved FAQ entries.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts with 'section', 'question' and
            'text' keys; each is rendered as one context entry.

    Returns:
        The filled-in prompt with leading/trailing whitespace stripped.
    """
    # One entry per document, each terminated by a blank line
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    return template.format(question=query, context=context).strip()

def llm(prompt):
    """Send `prompt` as a single user message to the 'gpt-4o' chat model.

    Relies on a module-level `client` that is not defined in the visible part
    of this notebook (NOTE(review): presumably an OpenAI client - confirm
    before running this reference version).

    Returns:
        The message content of the first returned choice.
    """
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(model='gpt-4o', messages=messages)
    first_choice = response.choices[0]
    return first_choice.message.content

def rag(query):
    """Answer `query` end to end: search, build the prompt, call the LLM.

    Returns:
        The value returned by `llm` for the prompt built from the search results.
    """
    return llm(build_prompt(query, search(query)))

In [5]:
# Check disk usage
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         202G   42G  160G  21% /
tmpfs            64M     0   64M   0% /dev
shm             5.7G     0  5.7G   0% /dev/shm
/dev/root       2.0G  1.2G  820M  59% /usr/sbin/docker-init
tmpfs           6.4G  4.1M  6.4G   1% /var/colab
/dev/sda1       242G   87G  156G  36% /kaggle/input
tmpfs           6.4G     0  6.4G   0% /proc/acpi
tmpfs           6.4G     0  6.4G   0% /proc/scsi
tmpfs           6.4G     0  6.4G   0% /sys/firmware


### Google FLAN T5

In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

In [7]:
# Use the GPU if available; otherwise fall back to the CPU so that `device`
# is always defined for the cells below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"{device} will be used for computation.")

cuda will be used for computation.


In [8]:
# Load the FLAN-T5 XL tokenizer and model from the "google/flan-t5-xl" checkpoint.
# (The `device_map="auto"` option is left disabled; the model is moved to
# `device` explicitly instead.)
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl")
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 2048)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 2048)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=2048, out_features=2048, bias=False)
              (k): Linear(in_features=2048, out_features=2048, bias=False)
              (v): Linear(in_features=2048, out_features=2048, bias=False)
              (o): Linear(in_features=2048, out_features=2048, bias=False)
              (relative_attention_bias): Embedding(32, 32)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=2048, out_features=5120, bias=False)
              (wi_1): Linear(in_features=2048, out_features=5120, bias=False)
       

In [9]:
# Smoke-test prompt: the task instruction is part of the input text itself
input_text = "translate English to German: How old are you?"
# Tokenize to PyTorch tensors and move the token IDs to the device the model was moved to
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

In [12]:
# Generate with no extra generation arguments, only the input IDs
outputs = model.generate(input_ids)
# Decode the first returned sequence; special tokens are not skipped here
result = tokenizer.decode(outputs[0])



In [13]:
result

'<pad> Wie alt sind Sie?</s>'

In [27]:
# Updated pipeline functions: llm() now generates with the local FLAN-T5 model instead of OpenAI

def build_prompt(query, search_results):
    """Assemble the teaching-assistant prompt for `query`.

    Args:
        query: The user's question, placed after ``QUESTION:``.
        search_results: Iterable of dicts providing 'section', 'question'
            and 'text'; each becomes one entry under ``CONTEXT:``.

    Returns:
        The formatted prompt, stripped of leading/trailing whitespace.
    """
    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]
    context = "".join(entries)

    template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    filled = template.format(question=query, context=context)
    return filled.strip()

def llm(prompt, generate_params=None):
    """Generate an answer for `prompt` with the locally loaded FLAN-T5 model.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        prompt: Full prompt text (e.g. the output of `build_prompt`).
        generate_params: Optional dict of keyword arguments for
            `model.generate`. Entries override the defaults below, e.g.
            ``{"do_sample": True, "top_p": 0.95}`` to switch to sampling.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
    """
    # Deterministic beam search. The sampling-only settings (temperature,
    # top_k, top_p) are deliberately not passed: with do_sample=False they
    # have no effect and only trigger a warning from `generate`.
    params = {
        "max_length": 512,
        "num_beams": 5,
        "do_sample": False,
        "early_stopping": True,
    }
    if generate_params:
        params.update(generate_params)

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(input_ids, **params)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def rag(query):
    """Run the retrieval-augmented pipeline for `query`.

    Searches the index, builds the prompt from the hits, and passes it to `llm`.

    Returns:
        The answer returned by `llm`.
    """
    hits = search(query)
    return llm(build_prompt(query, hits))

In [28]:
# End-to-end check of the pipeline with a sample question
query = "I just discovered the course, can I still join it?"
rag(query)



"Yes, even if you don't register, you're still eligible to submit the homeworks."