## Base code for module-2
Basically a condensed version of what we learned in Module 1:

Index the documents for RAG in a minsearch index.

In [None]:
# !rm -f minsearch.py
# !wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [None]:
import os
import urllib.request

# minsearch is distributed as a single module file; fetch it next to the notebook
# once so that `import minsearch` in the following cell can find it.
minsearch_url = "https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py"

if not os.path.isfile('minsearch.py'):
    # Standard-library download: works on every OS and does not need the
    # third-party `wget` package or an IPython shell escape.
    # Write to a temporary name first so an interrupted transfer never leaves a
    # truncated minsearch.py that the isfile() check would accept on the next run.
    urllib.request.urlretrieve(minsearch_url, 'minsearch.py.part')
    os.replace('minsearch.py.part', 'minsearch.py')
else:
    print("minsearch.py already exists")

In [None]:
import requests 
import minsearch

In [None]:
# Download the FAQ documents (a JSON list with one entry per course).
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
# Fail loudly on an HTTP error (404, rate limit, ...) instead of letting the
# error page reach .json() and surface as a confusing decode error.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course nesting into one list of documents, tagging each
# document with the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

# "question", "text" and "section" are searched as free text; "course" is a
# keyword field that search() filters on.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

In [None]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up the FAQ entries most relevant to ``query`` in the global ``index``.

    Args:
        query: The user's question as free text.
        course: Value the ``course`` keyword field must match. Defaults to
            ``'data-engineering-zoomcamp'``, the course this notebook originally
            hard-coded.
        num_results: Maximum number of documents to return. Defaults to 5.

    Returns:
        Whatever ``index.search`` returns for these arguments (the documents
        later consumed by ``build_prompt``).
    """
    # Matches in "question" count 3x and matches in "section" 0.5x; fields not
    # listed here keep the index's default weight (presumably 1.0 - confirm in
    # minsearch).
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [None]:
def rag(query):
    """Answer ``query`` with the retrieve -> prompt -> generate pipeline.

    The documents returned by ``search`` are turned into a prompt by
    ``build_prompt``, and the text produced by ``llm`` for that prompt is
    returned.
    """
    retrieved_docs = search(query)
    return llm(build_prompt(query, retrieved_docs))

## Replace the OpenAI LLM with HuggingFace open-source model Microsoft Phi3

### Important Note: If you're not running in Saturn Cloud 

You need to install these libraries:

Make sure you use the latest versions

```pip install -U transformers accelerate bitsandbytes```

By default, Hugging Face models and tokenizers are downloaded into the cache location given by the `HF_HOME` environment variable, which usually defaults to `~/.cache/huggingface`, i.e. inside your home directory (/home/\<your username\>).

However, on Saturn Cloud you may not have enough space in your home directory. To check how much space is available, run ```!df -h```.

As per Module 2.3, we will switch the HF_HOME env variable to "/run/cache" as there is more space there.

In [None]:
import os

# Saturn Cloud has more free space under /run/cache than in the home directory,
# so the Hugging Face cache is redirected there. On other machines that path is
# usually missing or not writable; in that case leave HF_HOME untouched so the
# library falls back to its default cache instead of failing on download.
# Run this cell before the cell that imports `transformers`.
hf_cache_dir = '/run/cache/'

try:
    os.makedirs(hf_cache_dir, exist_ok=True)
    cache_is_usable = os.access(hf_cache_dir, os.W_OK)
except OSError:
    cache_is_usable = False

if cache_is_usable:
    os.environ['HF_HOME'] = hf_cache_dir
    # equivalent to this terminal cmd: "export HF_HOME='/run/cache' "
else:
    print(f"{hf_cache_dir} is not writable; keeping the default Hugging Face cache location")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Seed PyTorch's global random number generator so repeated runs are reproducible.
torch.manual_seed(0)

### Tokenizer and LLM

The tokenizer takes in text and turns it into a numeric representation (token IDs), which is then fed into the language model.

In [None]:
# Hub checkpoint shared by the model weights and the tokenizer; naming it once
# keeps the two from_pretrained calls pointing at the same repository.
phi3_checkpoint = "microsoft/Phi-3-mini-128k-instruct"

# NOTE(review): trust_remote_code=True allows Python code shipped in the Hub
# repository to run locally - confirm it is still needed with the installed
# transformers version.
model = AutoModelForCausalLM.from_pretrained(
    phi3_checkpoint,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)
tokenizer = AutoTokenizer.from_pretrained(phi3_checkpoint)

In [None]:
# Bundle the loaded model and its tokenizer into one text-generation callable;
# llm() sends chat messages through this object.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

### Putting everything together

Replace the original prompt and llm functions with the Phi3 model

In [None]:
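# Original Module 1 versions (build_prompt plus the OpenAI-backed llm), kept
# commented out for reference only; the active replacements are in the next cell.
# def build_prompt(query, search_results):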
#     prompt_template = """
# You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
# Use only the facts from the CONTEXT when answering the QUESTION.

# QUESTION: {question}

# CONTEXT: 
# {context}
# """.strip()

#     context = ""
    
#     for doc in search_results:
#         context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
#     prompt = prompt_template.format(question=query, context=context).strip()
#     return prompt

# def llm(prompt):
#     response = client.chat.completions.create(
#         model='gpt-4o',
#         messages=[{"role": "user", "content": prompt}]
#     )
    
#     return response.choices[0].message.content

In [None]:
def build_prompt(query, search_results):
    """Render the teaching-assistant prompt for one question.

    Args:
        query: The user's question; inserted after ``QUESTION:``.
        search_results: Iterable of documents, each providing the keys
            ``'section'``, ``'question'`` and ``'text'``.

    Returns:
        The filled-in prompt with leading and trailing whitespace removed.
    """
    template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    # One "section / question / answer" entry per retrieved document, each
    # followed by a blank line.
    faq_entries = "".join(
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )

    return template.format(question=query, context=faq_entries).strip()

def llm(prompt):
    """Generate a reply to ``prompt`` with the global text-generation ``pipe``.

    The prompt is sent as a single user chat message. Sampling is switched off
    and at most 500 new tokens are requested; only the newly generated text is
    asked for (``return_full_text=False``).

    Returns:
        The ``'generated_text'`` of the first result, stripped of surrounding
        whitespace.
    """
    chat = [dict(role="user", content=prompt)]

    generations = pipe(
        chat,
        max_new_tokens=500,
        return_full_text=False,
        temperature=0.0,
        do_sample=False,
    )

    first_generation = generations[0]
    return first_generation['generated_text'].strip()

In [None]:
# End-to-end check: the cell output is the model's answer to a sample FAQ question.
rag(query="I just discovered the course. Can I still join it?")