In [1]:
# Start from a clean slate: delete any existing local minsearch.py, then
# download the module again so the `import minsearch` in the next cell uses it.
!rm -rf minsearch.py
!wget https://raw.githubusercontent.com/yakhyo/llm-practice/main/01-intro/minsearch.py

--2024-07-02 08:26:42--  https://raw.githubusercontent.com/yakhyo/llm-practice/main/01-intro/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3833 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-07-02 08:26:42 (58.2 MB/s) - ‘minsearch.py’ saved [3833/3833]



In [2]:
import requests
import minsearch

# Location of the FAQ dump: a JSON list with one entry per course, each holding
# a "course" name and a list of "documents".
docs_url = (
    "https://github.com/yakhyo/llm-practice/blob/main/01-intro/documents.json?raw=1"
)
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail right here with a clear HTTP error instead of a confusing JSON decode
# error on an error page further down.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course nesting into a single list of FAQ entries, tagging
# each entry with the course it came from so it can be filtered on later.
documents = []
for course in documents_raw:
    course_name = course["course"]

    for doc in course["documents"]:
        doc["course"] = course_name
        documents.append(doc)

In [3]:
# Spot-check one flattened entry to confirm the "course" key was attached.
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

## Create a search index and add all the documents to it

In [4]:
# Build the search index over the FAQ entries. "course" is registered as a
# keyword field (it is what search_query filters on); "question", "text" and
# "section" are registered as text fields.
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

# Feed every flattened FAQ entry into the index. Kept as the cell's last
# expression so the notebook still displays the fitted index object.
index.fit(documents)

<minsearch.Index at 0x71008ad1caa0>

In [5]:
def search_query(query, course="data-engineering-zoomcamp", num_results=5):
    """Look up the FAQ entries that best match ``query`` in the global ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field must equal. Defaults to
            ``"data-engineering-zoomcamp"``, which was previously hard-coded.
        num_results: Maximum number of entries to ask the index for.
            Defaults to 5, which was previously hard-coded.

    Returns:
        Whatever ``index.search`` returns; ``build_prompt`` iterates over it
        and reads the ``section``, ``question`` and ``text`` keys of each item.
    """
    # Matches in "question" get a 3.0 boost and matches in "section" a 0.5
    # boost; "text" is left at whatever weight minsearch applies by default.
    boost = {"question": 3.0, "section": 0.5}

    return index.search(
        query=query,
        filter_dict={"course": course},
        boost_dict=boost,
        num_results=num_results,
    )

In [6]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user's question and retrieved FAQ entries.

    Args:
        query: The user's question; inserted after ``QUESTION:``.
        search_results: Iterable of dict-like entries, each providing the
            ``section``, ``question`` and ``text`` keys.

    Returns:
        The prompt string, with leading and trailing whitespace stripped.

    Raises:
        KeyError: If an entry lacks ``section``, ``question`` or ``text``.
    """
    # The template is written flush-left on purpose: a triple-quoted string
    # indented with the function body would carry that indentation into the
    # prompt and indent only the first line of the multi-line context.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    # One block per entry, separated by a blank line.
    context = "\n\n".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}"
        for doc in search_results
    )

    # strip() also drops the newline left after "CONTEXT:" when there are no results.
    return prompt_template.format(question=query, context=context).strip()


def llm(prompt):
    """Generate a completion for ``prompt`` with the notebook's loaded model.

    A text-generation pipeline is built from the global ``model`` and
    ``tokenizer`` on every call, so both must already be defined when this
    function runs.

    Args:
        prompt: Full prompt text; sent as the only chat message, with the
            "system" role.

    Returns:
        The ``generated_text`` of the first pipeline output, stripped of
        surrounding whitespace.
    """
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

    # NOTE(review): the whole prompt goes in as a "system" message with no
    # user turn - confirm that the model's chat template handles this as intended.
    chat = [{"role": "system", "content": prompt}]

    outputs = generator(
        chat,
        max_new_tokens=500,
        return_full_text=False,
        # Sampling is switched off; temperature is passed along unchanged.
        temperature=0.0,
        do_sample=False,
    )

    first_output = outputs[0]
    return first_output["generated_text"].strip()

In [7]:
def rag_system(query):
    """Answer ``query`` end to end: retrieve, build the prompt, then generate.

    Args:
        query: The user's question.

    Returns:
        The value returned by ``llm`` for the prompt built from the search
        results for ``query``.
    """
    retrieved = search_query(query)
    return llm(build_prompt(query, retrieved))

In [8]:
# Show the GPU visible to this runtime and its current memory usage.
! nvidia-smi

Tue Jul  2 08:26:45 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   49C    P0              20W /  72W |      0MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

import warnings

# Silence every Python warning for the rest of the session. This filter is
# global (not limited to one module); it is here to keep cell output readable.
warnings.filterwarnings("ignore")

# Single source of truth for the checkpoint, so the model and the tokenizer
# cannot end up being loaded from different repositories.
MODEL_ID = "microsoft/Phi-3-mini-128k-instruct"

# Seed torch's random number generator before loading the model.
torch.random.manual_seed(0)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    torch_dtype="auto",
    # NOTE(review): this allows code shipped in the checkpoint repository to
    # run locally - confirm the repository is trusted before re-running.
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# Run the full retrieve -> prompt -> generate flow on a sample question.
rag_system("I just discovered the course. Can I still join?")

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


'You can still join the course even if you discover it after the start date. You are eligible to submit the homeworks, but remember to meet the deadlines for the final projects. The course will start on 15th Jan 2024 at 17h00. You can register before the course starts using the provided link and join the course Telegram channel for announcements.'