### Dependencies and minsearch

In [2]:
# Install dependencies
!pip install -U transformers accelerate bitsandbytes sentencepiece



In [3]:
# Get self-built minsearch
!rm -f minsearch.py
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-07-09 12:31:35--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-07-09 12:31:36 (59.2 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [4]:
import requests
import minsearch

# URL of the JSON file containing course documents
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

# Make a GET request to the provided URL to retrieve the JSON data
docs_response = requests.get(docs_url)

# Parse the response as JSON
documents_raw = docs_response.json()

# Initialize an empty list to hold the processed documents
documents = []

# Loop through each course in the raw documents data
for course in documents_raw:
    course_name = course['course']  # Extract the course name

    # Loop through each document in the course
    for doc in course['documents']:
        doc['course'] = course_name  # Add the course name to the document
        documents.append(doc)  # Append the document to the documents list

# Create an index for searching, specifying text and keyword fields
index = minsearch.Index(
    text_fields=["question", "text", "section"],  # Fields to search through for text content
    keyword_fields=["course"]  # Fields to search through for keywords
)

# Fit the index with the processed documents
index.fit(documents)


<minsearch.Index at 0x7f29e7428490>

### For reference: Standard pipeline functions using minsearch and OpenAI

In [5]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [5]:
# check  disk usage
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         202G   31G  171G  16% /
tmpfs            64M     0   64M   0% /dev
shm             5.7G     0  5.7G   0% /dev/shm
/dev/root       2.0G  1.2G  820M  59% /usr/sbin/docker-init
/dev/sda1       242G   84G  159G  35% /kaggle/input
tmpfs           6.4G   20M  6.4G   1% /var/colab
tmpfs           6.4G     0  6.4G   0% /proc/acpi
tmpfs           6.4G     0  6.4G   0% /proc/scsi
tmpfs           6.4G     0  6.4G   0% /sys/firmware


### Mistral 7B Instruct

In [8]:
import torch, os
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
torch.random.manual_seed(0)

<torch._C.Generator at 0x7f29e5f7a810>

In [6]:
# Create token on Huggingface and store it in Colab secrets, then load token and login
from google.colab import userdata
token = userdata.get('huggingface_token_llm_zoomcamp')
from huggingface_hub import login
login(token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [10]:
# # this is not necessary if device_map="auto" is set in model
# # Use GPU if available
# if torch.cuda.is_available():
#     device = torch.device("cuda")
# print(f"{device} will be used for computation.")

# model.to(device)

In [9]:
# Load model and tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # ensures the model is moved to your GPU(s)
    load_in_4bit=True   # applies 4-bit dynamic quantization to massively reduce the resource requirements
)
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [18]:
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [24]:
def build_prompt(query, search_results):
    prompt_template = """
    QUESTION: {question}

    CONTEXT:
    {context}

    ANSWER:
    """.strip()

    context = ""

    for doc in search_results:
        context = context + f"{doc['question']}\n{doc['text']}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

def llm(prompt):
    response = generator(prompt, max_length=1024, temperature=0.7, top_p=0.95, num_return_sequences=1)
    response_final = response[0]['generated_text']
    return response_final[len(prompt):].strip()

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [25]:
query = "I just discovered the course, can I still join it?"
print(rag(query))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Yes, you can still join the course after the start date.
    The exact day and hour of the course will be 15th Jan 2024 at 17h00.
    You can start by installing and setting up all the dependencies and requirements:
    Google cloud account
    Google Cloud SDK
    Python 3 (installed with Anaconda)
    Terraform
    Git
    Look over the prerequisites and syllabus to see if you are comfortable with these subjects.
    You can contribute to the course by starring the repo, sharing it with friends, and creating a PR if you see you can improve the text or the structure of the repository.
