In [None]:
# Fetch a fresh copy of the minsearch module: remove any stale local file first,
# then download minsearch.py from the upstream GitHub repository.
!rm -f minsearch.py
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [None]:
!pip install transformers accelerate bitsandbytes

In [None]:
# Show the GPU(s) visible to this runtime and their current memory usage.
!nvidia-smi

In [None]:
import requests
import minsearch

# Download the course FAQ as JSON: a list of courses, each carrying its own
# list of FAQ documents.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten into a single list of documents, tagging each one with the course it
# belongs to so the course can be used as a filter at search time.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

# "question", "text" and "section" are searched as text; "course" is a keyword filter.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

In [None]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Return the FAQ documents that best match ``query`` from the minsearch index.

    Args:
        query: Free-text user question.
        course: Value of the ``course`` keyword field to restrict results to.
            Defaults to the course this notebook originally hard-coded.
        num_results: Maximum number of documents to request from the index.

    Returns:
        Whatever ``index.search`` returns for these arguments (the notebook
        iterates over it as a sequence of document dicts).
    """
    # Weight matches in the question field above the others, and section matches below.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [None]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves matching FAQ entries via ``search``, turns them into a prompt
    with ``build_prompt`` and returns whatever ``llm`` produces for that prompt.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

# video 2.4 - phi 3 mini

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Seed PyTorch's random number generator with a fixed value.
torch.random.manual_seed(0)

In [None]:
# Load the Phi-3 mini instruct checkpoint onto the GPU. torch_dtype="auto"
# leaves the choice of weight precision to the checkpoint's configuration.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)
# Tokenizer from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
)

In [None]:
# Wrap the already-loaded Phi-3 model and tokenizer in a text-generation pipeline.
pipe = pipeline("text-generation", tokenizer=tokenizer, model=model)

In [None]:
def build_prompt(query, search_results):
    """Assemble the teaching-assistant prompt for ``query``.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys; each becomes one entry of the ``CONTEXT`` block.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    # One entry per retrieved document, each terminated by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    filled = template.format(question=query, context="".join(entries))
    return filled.strip()

def llm(prompt):
    """Generate a reply to ``prompt`` with the module-level ``pipe`` pipeline.

    The prompt is sent as a single user chat message. Generation is capped at
    500 new tokens with sampling disabled, and only the newly generated text
    (not the prompt) is requested.

    Returns:
        The generated text of the first result, stripped of surrounding whitespace.
    """
    chat = [{"role": "user", "content": prompt}]
    generated = pipe(
        chat,
        max_new_tokens=500,
        return_full_text=False,
        temperature=0.0,
        do_sample=False,
    )
    first = generated[0]
    return first['generated_text'].strip()

In [None]:
# End-to-end run: minsearch retrieval -> prompt -> Phi-3 answer.
rag("I just discovered the course. Can I still join it?")

# video 2.5 - mistral-7B

In [None]:
import os
from google.colab import userdata
from huggingface_hub import login

In [None]:
import torch
from transformers import pipeline

# Seed PyTorch's random number generator with a fixed value.
torch.random.manual_seed(0)

In [None]:
# Log in to the Hugging Face Hub with the HF_TOKEN value read from Colab's
# userdata store, so the token itself never appears in the notebook.
login(token=userdata.get('HF_TOKEN'))

In [None]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

In [None]:
# Load Mistral-7B with 4-bit quantized weights; device_map="auto" lets the
# library decide where the layers are placed.
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    load_in_4bit=True,
    device_map="auto",
)

# Tokenizer from the same checkpoint, configured to pad on the left.
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    padding_side="left",
)

In [None]:
# Build the text-generation pipeline from the 4-bit `model` and `tokenizer`
# objects created earlier in this notebook. Passing the model id string here
# instead makes pipeline() load a second, unquantized copy of the weights,
# which is what exhausts GPU memory.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

### ran out of GPU ram :((((

In [None]:
def build_prompt(query, search_results):
    """Build a bare QUESTION / CONTEXT / ANSWER completion prompt.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts with ``question`` and ``text`` keys;
            each contributes its question and text to the ``CONTEXT`` block.

    Returns:
        The filled-in prompt, ending with ``ANSWER:``, with surrounding
        whitespace stripped.
    """
    template = (
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}\n"
        "\n"
        "ANSWER:"
    )

    # Each retrieved entry is "<question>\n<text>" followed by a blank line.
    pieces = [f"{hit['question']}\n{hit['text']}\n\n" for hit in search_results]

    filled = template.format(question=query, context="".join(pieces))
    return filled.strip()

def llm(prompt):
    """Complete ``prompt`` with the module-level ``pipe`` and return only the new text.

    Differences from a naive call that matter for RAG prompts:
      * ``max_new_tokens`` bounds the generated text only. ``max_length`` also
        counts the prompt tokens, so a long context would leave little or no
        room for an answer.
      * ``do_sample=True`` is needed for ``temperature`` / ``top_p`` to take effect.
      * ``return_full_text=False`` asks the pipeline to drop the prompt itself,
        rather than slicing the output by ``len(prompt)``, which is only correct
        if the prompt is echoed back character for character.

    Args:
        prompt: The full prompt string to complete.

    Returns:
        The generated continuation, stripped of surrounding whitespace.
    """
    response = pipe(
        prompt,
        max_new_tokens=500,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        num_return_sequences=1,
        return_full_text=False,
    )
    return response[0]['generated_text'].strip()

# video 2.7 - ollama, running llms on a CPU

First, install Ollama (the command below uses Homebrew):

```sh
brew install --cask ollama
```

then start server

```sh
ollama start
```

then pull a model

```sh
ollama pull phi3
```

In [1]:
from openai import OpenAI

In [2]:
# Point the OpenAI client at the local Ollama server's /v1 endpoint.
# The client needs some api_key value; 'ollama' is only a placeholder.
client = OpenAI(api_key='ollama', base_url='http://localhost:11434/v1/')

In [5]:
def llm(prompt):
    """Send ``prompt`` as a single user message to the ``phi3`` model via ``client``.

    Returns:
        The message content of the first choice in the completion response.
    """
    chat = [{'role': 'user', 'content': prompt}]
    completion = client.chat.completions.create(model='phi3', messages=chat)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [6]:
# Quick check that the local Ollama server answers through the OpenAI client.
llm('write that this is a test')

' This is a test.\n\n\nTo execute and verify the functionality of your system, run an automated process which includes: \n\n- Initiating with "This is a test." to simulate typical operations.\n\n- Monitoring for expected outcomes such as correct text display without errors.\n\n- Logging results into a testing report that details execution and observations, ensuring transparency in the automated process.'

Let's now run it in docker

```sh
docker run -it \
    -v ollama:/root/.ollama \
    -p 11434:11434 \
    --name ollama \
    ollama/ollama
```

Remember to pull the model again inside the container; otherwise it will have no model to use!
(This is why we mount a volume: to persist the downloads.)

In [8]:
# Re-create the client for Ollama running in Docker; the container publishes
# the same port 11434, so the endpoint URL is unchanged.
# The client needs some api_key value; 'ollama' is only a placeholder.
client = OpenAI(api_key='ollama', base_url='http://localhost:11434/v1/')

In [12]:
# Same check against the containerized server, with wording that asks for the sentence only.
llm('write the following sentence and nothing else: "this is a test"')

' This is a test.\n'

# video 2.8 - ollama and phi3 + elasticsearch in docker-compose

First, we create a docker-compose file that runs both Ollama and Elasticsearch:

```yaml
services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.4.3
    container_name: elasticsearch
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
    ports:
      - '9200:9200'
      - '9300:9300'

  ollama:
    image: ollama/ollama
    container_name: ollama
    volumes:
      - ollama:/root/.ollama
    ports:
      - '11434:11434'    

volumes:
  ollama:
```

Now we run the same RAG flow as in module 1, but with Ollama and Elasticsearch.

In [14]:
# OpenAI client pointed at the Ollama service on port 11434.
# The client needs some api_key value; 'ollama' is only a placeholder.
client = OpenAI(api_key='ollama', base_url='http://localhost:11434/v1/')

In [15]:
from elasticsearch import Elasticsearch

In [16]:
# Connect to the Elasticsearch instance listening on local port 9200.
es_client = Elasticsearch('http://localhost:9200') 

In [17]:
# One primary shard and no replicas, plus explicit field mappings: the three
# free-text fields are analyzed as text, while "course" is a keyword so it can
# be matched exactly in a term filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run so this cell can be re-executed:
# create() fails if the index already exists, and re-indexing into an old index
# would duplicate every document. ignore_unavailable makes this a no-op on a
# fresh cluster. NOTE: this deletes existing data in `index_name`.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)


{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'course-questions'}

In [18]:
import requests 

# Download the course FAQ as JSON: a list of courses, each carrying its own
# list of FAQ documents.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten into a single list of documents, tagging each one with the course it
# belongs to so the course can be used as a filter at search time.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)


In [19]:
from tqdm import tqdm

In [21]:
# Send each FAQ document to the index in its own request; tqdm shows progress.
for doc in tqdm(documents):
    es_client.index(index=index_name, body=doc)

100%|██████████████████████████████████████████████████████| 948/948 [00:08<00:00, 117.23it/s]


In [22]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Retrieve the FAQ documents that best match ``query`` from Elasticsearch.

    Args:
        query: Free-text user question.
        course: Exact value of the ``course`` field to filter on. Defaults to
            the course this notebook originally hard-coded.
        num_results: Maximum number of hits to request.

    Returns:
        A list with the ``_source`` dict of every hit, in the order returned
        by Elasticsearch.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                # Match across the text fields, boosting the question field 3x.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict the results to a single course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [23]:
def build_prompt(query, search_results):
    """Assemble the teaching-assistant prompt for ``query``.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys; each becomes one entry of the ``CONTEXT`` block.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One entry per retrieved document, each terminated by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    filled = template.format(question=query, context="".join(entries))
    return filled.strip()

def llm(prompt):
    """Send ``prompt`` as a single user message to the ``phi3`` model via ``client``.

    Returns:
        The message content of the first choice in the completion response.
    """
    chat = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model='phi3', messages=chat)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [24]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation backed by Elasticsearch.

    Retrieves matching FAQ entries via ``elastic_search``, turns them into a
    prompt with ``build_prompt`` and returns whatever ``llm`` produces for it.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved))

In [26]:
# End-to-end run: Elasticsearch retrieval -> prompt -> phi3 served by Ollama.
query = 'I just disovered the course. Can I still join it?'
rag(query)

" Yes, you are still eligible to join after discovering your course of choice in our Data Engineering Bootcamp even if it has already started since there will be a deadline for final projects submission and we keep all materials available post-course completion as stated in question 2 answer section from the FAQ database. However, consider reviewing prerequisites, syllabus (question3), understanding course structure beforehand can greatly improve your learning experience which might help you get a head start with homework submissions even without receiving an initial confirmation email(Question1). While registration isn't compulsory and doesn't serve as formal acceptance it may provide certain benefits like direct access to FAQ (question4) and other resources, making the learning process smoother. You can begin your journey by installing necessary software dependencies immediately upon discovering about this course(Question2). Lastly note that you still have time before starting date o

# video 2.9 UI for RAG

In [27]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Downloading altair-5.3.0-py3-none-any.whl.metadata (9.2 kB)
Collecting blinker<2,>=1.0.0 (from streamlit)
  Downloading blinker-1.8.2-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Downloading cachetools-5.3.3-py3-none-any.whl.metadata (5.3 kB)
Collecting click<9,>=7.0 (from streamlit)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting pillow<11,>=7.1.0 (from streamlit)
  Downloading pillow-10.4.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (9.2 kB)
Collecting protobuf<6,>=3.20 (from streamlit)
  Downloading protobuf-5.27.2-cp38-abi3-macosx_10_9_universal2.whl.metadata (592 bytes)
Collecting pyarrow>=7.0 (from streamlit)
  Downloading pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (3.0 kB)
Collecting rich<14,>=10.14.0 (from streamlit)
  Downloading rich-13.7.1-py3-none-any.whl.metadata

The code for the UI is in `streamlit_app.py`.