In [1]:
!rm -rf minsearch.py
!wget https://raw.githubusercontent.com/yakhyo/llm-practice/main/01-intro/minsearch.py

--2024-07-02 09:29:01--  https://raw.githubusercontent.com/yakhyo/llm-practice/main/01-intro/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3889 (3.8K) [text/plain]
Saving to: ‘minsearch.py’


2024-07-02 09:29:02 (26.2 MB/s) - ‘minsearch.py’ saved [3889/3889]



In [2]:
import requests
import minsearch

docs_url = (
    "https://github.com/yakhyo/llm-practice/blob/main/01-intro/documents.json?raw=1"
)
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []
for course in documents_raw:
    course_name = course["course"]

    for doc in course["documents"]:
        doc["course"] = course_name
        documents.append(doc)

In [3]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

## Create an search index and parse all the documents to it

In [4]:
index = minsearch.Index(
    text_fields=["question", "text", "section"], keyword_fields=["course"]
)

index.fit(documents)

<minsearch.Index at 0x7377a945cfe0>

In [5]:
def search_query(query):
    boost = {"question": 3.0, "section": 0.5}

    results = index.search(
        query=query,
        filter_dict={"course": "data-engineering-zoomcamp"},
        boost_dict=boost,
        num_results=5,
    )

    return results

In [6]:
def build_prompt(query, search_results):
    prompt_template = """
    QUESTION: {question}

    CONTEXT: {context}
    """.strip()

    context = ""
    for doc in search_results:
        context = context + f"{doc['question']}\n{doc['text']}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()

    return prompt


def llm(prompt):
    response = generator(
        prompt, max_length=500, temperature=0.7, top_p=0.95, num_return_sequences=1
    )
    response_final = response[0]["generated_text"]
    return response_final[len(prompt) :].strip()

In [7]:
def rag_system(query):
    search_results = search_query(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)

    return answer

In [8]:
! nvidia-smi

Tue Jul  2 09:29:05 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   53C    P8              17W /  72W |      0MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [9]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

load_dotenv()
HF_KEY = os.getenv("HF_KEY")

login(token=HF_KEY)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/yakhyo/.cache/huggingface/token
Login successful


In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer


model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True
)
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-v0.1", padding_side="left"
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
from transformers import pipeline

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [12]:
rag_system("I just discovered the course. Can I still join?")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'Course - What is the difference between the course and the bootcamp?\nThe course is a free version of the bootcamp.\nThe bootcamp is a 10-weeks program with a lot of live sessions, mentorship, and a lot of support.\nThe course is a self-paced version of the bootcamp.'