### code from https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/rag-intro.ipynb

In [1]:
# !wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-09-19 13:47:34--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py.2’


2024-09-19 13:47:34 (5.43 MB/s) - ‘minsearch.py.2’ saved [3832/3832]



In [1]:
import minsearch

In [2]:
import json

In [3]:
with open('documents.json','rt') as f_in:
    docs_raw = json.load(f_in)

In [4]:
documents =[]
for docs in docs_raw:
    for doc in  docs["documents"]: 
        doc["course"] =docs["course"]
        documents.append(doc)

In [5]:
documents[0].keys()

dict_keys(['text', 'section', 'question', 'course'])

In [6]:
from minsearch import Index

index = Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [7]:
index.fit(documents)

<minsearch.Index at 0x76ce4ee834d0>

In [8]:
query = "The course has already started can i still enroll?"

filter_dict = {"course": "data-engineering-zoomcamp"}
boost_dict = {"question": 3, "section": 0.5}

results = index.search(
    query,
    filter_dict, 
    boost_dict
)

for result in results:
    print(result)

{'text': "Yes, even if you don't register, you're still eligible to submit.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'}
{'text': 'Yes, we will keep all the materials after the course /finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.', 'section': 'General course-related questions', 'question': 'Course - Can I follow the course after it finishes?', 'course': 'data-engineering-zoomcamp'}
{'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nGit\nLook

In [9]:
prompt_template="""
You are a teaching assistant, Please answer the QUESTION based on facts from the CONTEXT, 
If CONTEXT doesnot have the facts, Please answer with NONE.

QUESTION: {question}

CONTEXT: {context}
""".strip()

In [10]:
from ai21 import AI21Client
from ai21.models.chat import UserMessage

# One way of passing your key to the client.
import os
AI21_API_KEY = os.environ["AI21_API_KEY"]
client = AI21Client(api_key=AI21_API_KEY)

def single_message_instruct(content):
    messages = [UserMessage(content=content)]
    response = client.chat.completions.create(
        model="jamba-1.5-large",
        messages=messages,
        top_p=1.0 # Setting to 1 encourages different responses each call.
    )
    return response.to_json()


In [11]:
response = single_message_instruct(query)

  return response.to_json()


In [12]:
import json
json_response = json.loads(response)
json_response["choices"][0]["message"]["content"]

'Yes, you can still enroll in the course. The enrollment period is still open, and you can register for the course at any time.'

In [13]:
def something_new(content):
    response = single_message_instruct(content)
    json_response = json.loads(response)
    content = json_response["choices"][0]["message"]["content"]
    return content

In [14]:
print(something_new(query))

Whether you can still enroll in a course that has already started depends on the policies of the institution or organization offering the course. Here are some general steps you can take to find out:

1. **Check the Course Website**: Look for information on enrollment deadlines and policies on the course's official website.


2. **Contact the Registrar or Admissions Office**: If it's an academic institution, the registrar or admissions office can provide specific details about late enrollment.


3. **Reach Out to the Course Instructor**: Sometimes, the course instructor can give permission for late enrollment, especially if the course is not full.


4. **Look for Online Forums or Groups**: If it's a popular course, there might be online forums or social media groups where you can ask current or past students about their experiences with late enrollment.


5. **Visit the Institution**: If possible, visit the institution in person to inquire about enrollment options.


If you provide mor

  return response.to_json()


In [15]:
context =""
for doc in results:
    context =context+f"section: {doc['section']} \nquestion: {doc["question"]}\n answer: {doc["text"]} \n\n"
    

In [16]:
prompt = prompt_template.format(question=query , context=context).strip()

In [17]:
print(prompt)

You are a teaching assistant, Please answer the QUESTION based on facts from the CONTEXT, 
If CONTEXT doesnot have the facts, Please answer with NONE.

QUESTION: The course has already started can i still enroll?

CONTEXT: section: General course-related questions 
question: Course - Can I still join the course after the start date?
 answer: Yes, even if you don't register, you're still eligible to submit.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute. 

section: General course-related questions 
question: Course - Can I follow the course after it finishes?
 answer: Yes, we will keep all the materials after the course /finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project. 

section: General course-related questions 
question: Co

In [18]:
print(something_new(prompt))

Yes, even if you don't register, you're still eligible to submit.
Be aware, however, that there will be deadlines for turning in the final projects.


  return response.to_json()


# modularization

In [19]:
# rag: retrieval 
def search(query):    
    filter_dict = {"course": "data-engineering-zoomcamp"}
    boost_dict = {"question": 3, "section": 0.5}
    
    results = index.search(
        query,
        filter_dict, 
        boost_dict
    )
    return results
search_results = search("The course has already started can i still enroll?")

In [20]:
# rag: augmentation 
def build_prompt(query,search_results):
    prompt_template="""
You are a teaching assistant, Please answer the QUESTION based on facts from the CONTEXT, 
If CONTEXT doesnot have the facts, Please answer with NONE.

QUESTION: {question}

CONTEXT: {context}
""".strip()
    context =""
    for doc in search_results:
        context =context+f"section: {doc['section']} \nquestion: {doc["question"]}\n answer: {doc["text"]} \n\n"
    prompt = prompt_template.format(question=query , context=context).strip()
    return prompt 
prompt = build_prompt(query,search_results)

In [21]:
# rag: generation 
from ai21 import AI21Client
from ai21.models.chat import UserMessage

# One way of passing your key to the client.
import os
AI21_API_KEY = os.environ["AI21_API_KEY"]
client = AI21Client(api_key=AI21_API_KEY)

def single_message_instruct(content):
    messages = [UserMessage(content=content)]
    response = client.chat.completions.create(
        model="jamba-1.5-large",
        messages=messages,
        top_p=1.0 # Setting to 1 encourages different responses each call.
    )
    return response.to_json()

def llm(prompt):
    response = single_message_instruct(prompt)
    json_response = json.loads(response)
    content = json_response["choices"][0]["message"]["content"]
    return content
llm(prompt)


  return response.to_json()


"Yes, even if you don't register, you're still eligible to submit. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

In [22]:
query = "How to run kafka?"
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query,search_results)
    answer = llm(prompt)
    return answer
rag(query)

  return response.to_json()


"To run Kafka, follow these steps:\n\n1. **Create a Virtual Environment**:\n\n\n  * Run `python -m venv env` to create a virtual environment.\n  * Activate the virtual environment using `source env/bin/activate` on MacOS and Linux or `env/Scripts/activate` on Windows.\n  * Install the necessary packages using `pip install -r ../requirements.txt`.\n\t\n2. **Run Kafka in Docker**:\n\n\n  * Ensure all Docker images are up and running before creating the virtual environment.\n\t\n3. **Run Java Kafka**:\n\n\n  * In the project directory, run `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`.\n\t\n4. **Fix ModuleNotFoundError**:\n\n\n  * Use `pip install kafka-python-ng` instead of `kafka-python`.\n\t\n5. **Run Python as a Startup Script**:\n\n\n  * Redefine the python environment variable to that of your user account.\n\t\n6. **Fix BigQuery Error in dbt run**:\n\n\n  * Convert columns to the correct data type using `df[col] = df[col].astype('I

In [23]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked\ntechnical question\nThe next cohort starts in Jan 2025. More inFo at DTC Article.\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

# elastic search

In [24]:
from elasticsearch import Elasticsearch

In [25]:
es_client = Elasticsearch('http://localhost:9200')

In [26]:
es_client.info()

ObjectApiResponse({'name': '5d6af4ae929f', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'JdHYfsE_RdqUN_ILnu18sA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [27]:
index_setting = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}
index_name= "course-questions"
es_client.indices.create(index=index_name,body=index_setting)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [28]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
for doc in tqdm(documents):
    es_client.index(index=index_name,document=doc)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031/1031 [00:26<00:00, 38.60it/s]


In [38]:
query = "The course has already started can i still enroll?"

In [39]:
search_query= {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}


In [40]:
response= es_client.search(index=index_name,body=search_query)

In [41]:
results_docs =[]
for hit in response["hits"]["hits"]:
    results_docs.append(hit["_source"])


In [42]:
filter_results = []
temp={}
for result in results_docs:
    temp["text"] =result["text"]
    temp["question"] =result["question"] 
    filter_results+=[temp]

In [43]:
filter_results

[{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
  'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?'},
 {'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
  'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?'},
 {'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
  'question': 'Course - I 

In [44]:
def elastic_search(query):
    search_query= {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
    }

    response= es_client.search(index=index_name,body=search_query)
    results_docs =[]
    for hit in response["hits"]["hits"]:
        results_docs.append(hit["_source"])
    return results_docs

elastic_search(query)

[{'text': "Yes, even if you don't register, you're still eligible to submit.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course /finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and conti

In [45]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query,search_results)
    answer = llm(prompt)
    return answer
    
query = "The course has already started can i still enroll?"
rag(query)

  return response.to_json()


"Yes, even if you don't register, you're still eligible to submit.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."