### code from https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/rag-intro.ipynb

In [1]:
# !pip install openai

In [2]:
from openai import OpenAI
# OpenAI SDK client pointed at a local endpoint (port 11434) instead of the hosted API.
# NOTE(review): `client` is rebound to an AI21Client in a later cell, so this
# instance is not the one `single_message_instruct` ends up calling.
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')

In [3]:
# !pip install elasticsearch

In [4]:
from elasticsearch import Elasticsearch

In [5]:
# Client for the Elasticsearch node at localhost:9200; only the URL is passed, no credentials.
es_client = Elasticsearch('http://localhost:9200')

In [6]:
# Connectivity check: fetch the server's name/cluster/version details.
es_client.info()

ObjectApiResponse({'name': '795b86a922a7', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'JekZ29cgTtSE2y1VOxOvCg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [22]:
# Index configuration: one primary shard, no replicas.
index_setting = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # `course` is a keyword field; the search cell filters on it with an exact `term` query.
            "course": {"type": "keyword"}
        }
    }
}
index_name = "course-questions"

# Drop any existing index first so re-running this cell rebuilds it from scratch.
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)
    print(f"Index {index_name} deleted.")

# Create the index and report success only after the call has returned;
# printing beforehand would claim success even when creation raises.
create_response = es_client.indices.create(index=index_name, body=index_setting)
print(f"Index {index_name} created.")

# Keep the API response as the cell's displayed value.
create_response


Index course-questions created.


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [23]:
# Echo the client object; its repr lists the host(s) it is configured for.
es_client

<Elasticsearch(['http://localhost:9200'])>

In [24]:
import requests 

# Download the course documents JSON.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course grouping into one list, tagging each document with its
# course name. Entries are copied so `documents_raw` is left unmodified.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]


In [25]:
from tqdm.auto import tqdm

In [26]:
# Send every document to the index, one request per document, with a progress bar.
for doc in tqdm(documents):
    es_client.index(
        index=index_name,
        document=doc,
    )

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:27<00:00, 34.99it/s]


In [27]:
# rag: augmentation 
def build_prompt(query, search_results):
    """Assemble the LLM prompt for a question and its retrieved documents.

    Args:
        query: The user's question; inserted after ``QUESTION:``.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys. May be empty, in which case the context is blank.

    Returns:
        The filled-in prompt with leading/trailing whitespace stripped.

    Raises:
        KeyError: If a document lacks one of the three expected keys.
    """
    # Spelled out piece by piece so the trailing space after "CONTEXT," and the
    # blank lines are explicit rather than hidden inside a triple-quoted literal.
    prompt_template = (
        "You are a teaching assistant, Please answer the QUESTION based on facts from the CONTEXT, \n"
        "If CONTEXT doesnot have the facts, Please answer with NONE.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: {context}"
    )
    # Subscripts inside the f-string use single quotes: reusing the enclosing
    # double quote is a SyntaxError on Python versions before 3.12.
    context = "".join(
        f"section: {doc['section']} \nquestion: {doc['question']}\n answer: {doc['text']} \n\n"
        for doc in search_results
    )
    return prompt_template.format(question=query, context=context).strip()


In [28]:
# !pip install -U "ai21>=2.13.0"

In [29]:
# rag: generation 
from ai21 import AI21Client
from ai21.models.chat import UserMessage

# One way of passing your key to the client.
import os
# Read the key from the environment so it never appears in the notebook;
# a missing variable raises KeyError on this line.
AI21_API_KEY = os.environ["AI21_API_KEY"]
# NOTE: this rebinds `client`, replacing the OpenAI client created in an earlier cell.
client = AI21Client(api_key=AI21_API_KEY)

def single_message_instruct(content):
    """Send ``content`` as a single user message and return the serialized reply.

    Args:
        content: Text of the user message.

    Returns:
        The value of ``to_json()`` on the chat-completion response; ``llm``
        passes it to ``json.loads``.
    """
    completion = client.chat.completions.create(
        model="jamba-1.5-large",
        messages=[UserMessage(content=content)],
        top_p=1.0,  # Setting to 1 encourages different responses each call.
    )
    return completion.to_json()

def llm(prompt):
    """Send ``prompt`` to the model and return the text of the first choice.

    Args:
        prompt: Full prompt text to submit.

    Returns:
        The ``content`` string found at ``choices[0].message`` in the decoded
        response.

    Raises:
        KeyError, IndexError: If the decoded response lacks that structure.
    """
    # Imported locally so the function works regardless of whether a later
    # cell's `import json` has been executed yet.
    import json

    payload = json.loads(single_message_instruct(prompt))
    return payload["choices"][0]["message"]["content"]

In [30]:
import json
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Retrieve the best-matching documents for ``query`` from the index.

    Args:
        query: Free-text question to match against the indexed documents.
        course: Exact value the ``course`` field must have. Defaults to the
            course this notebook was originally hard-wired to.
        size: Maximum number of hits to return. Defaults to 5.

    Returns:
        A list of the ``_source`` dicts of the hits, in the order the server
        returned them. Empty when nothing matches.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # `question^3` is Elasticsearch's field-boost syntax.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                # Restrict results to a single course via an exact term match.
                "filter": {"term": {"course": course}},
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)
    return [hit["_source"] for hit in response["hits"]["hits"]]
# Exercise the retrieval step on its own with a sample question.
query = "The course has already started can i still enroll?"

elastic_search(query)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at

In [32]:
def rag(query):
    """Answer ``query`` end to end: retrieve, build the prompt, then generate.

    Args:
        query: The user's question.

    Returns:
        Whatever ``llm`` returns for the prompt built from the search results.
    """
    retrieved_docs = elastic_search(query)
    return llm(build_prompt(query, retrieved_docs))
    
# Run the full retrieve -> prompt -> generate pipeline on a sample question.
query ="I just discovered the course. Can i still join it?"
rag(query)

  return response.to_json()


"Yes, you can still join the course after the start date. You can submit the homeworks even if you haven't registered. However, be mindful of the deadlines for the final projects."