In [1]:
import os
import requests

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [2]:
documents_raw[0]['documents']

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines 

In [3]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
import hashlib

def generate_document_id(doc):
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [5]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [12]:
# from collections import defaultdict

# hashes = defaultdict(list)

# for doc in documents:
#     doc_id = doc['id']
#     hashes[doc_id].append(doc)

In [14]:
# len(hashes), len(documents)

(947, 948)

In [15]:
# for k, values in hashes.items():
#     if len(values) > 1:
#         print(k, len(values))

593f7569 2


In [16]:
# hashes['593f7569']

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

In [17]:
# import json

In [18]:
# with open('documents-with-ids.json', 'wt') as f_out:
#     json.dump(documents, f_out, indent=2)

In [19]:
!head documents-with-ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


In [6]:
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [9]:
from dotenv import load_dotenv
import openai
import os

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

client = openai.OpenAI()

In [10]:
def generate_question(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [11]:
from tqdm.auto import tqdm

In [12]:
results = {}

for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_question(doc)
    results[doc_id] = questions

  0%|          | 0/948 [00:00<?, ?it/s]

In [13]:
import pickle

In [17]:
with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)

EOFError: Ran out of input

In [26]:
results

{'c02e79ef': '{\n    "question1": "When does the course begin?",\n    "question2": "At what time will the course start on 15th Jan 2024?",\n    "question3": "How can I subscribe to the course\'s Google Calendar?",\n    "question4": "Where can I register for the course?",\n    "question5": "How do I join the course\'s Telegram channel?"\n}',
 '1f6520ca': '["What are the prerequisites for the course?", "Where can I find the course prerequisites?", "Can you tell me the requirements to enroll?", "Is there a list of prerequisites for this course?", "Where should I check for prerequisite information?"]',
 '7842b56a': '[\n    "Can I still join the course once it has already started?",\n    "Am I eligible to submit homework if I don\'t register before the start date?",\n    "Are there deadlines for final project submissions?",\n    "Is it possible to join the course after it begins and still meet deadlines?",\n    "What happens if I leave everything for the last minute?"\n]',
 '0bbf41ec': '[\n

In [22]:
import json

In [27]:
parsed_results = {}

for doc_id, json_questions in results.items():
    print(json_questions)
#     parsed_results[doc_id] = json.loads(json_questions)

{
    "question1": "When does the course begin?",
    "question2": "At what time will the course start on 15th Jan 2024?",
    "question3": "How can I subscribe to the course's Google Calendar?",
    "question4": "Where can I register for the course?",
    "question5": "How do I join the course's Telegram channel?"
}
["What are the prerequisites for the course?", "Where can I find the course prerequisites?", "Can you tell me the requirements to enroll?", "Is there a list of prerequisites for this course?", "Where should I check for prerequisite information?"]
[
    "Can I still join the course once it has already started?",
    "Am I eligible to submit homework if I don't register before the start date?",
    "Are there deadlines for final project submissions?",
    "Is it possible to join the course after it begins and still meet deadlines?",
    "What happens if I leave everything for the last minute?"
]
[
  "When can I expect a confirmation email after registering for the Data Engin

In [28]:
with open('response.json', 'w', encoding='utf-8') as f_in:
    json.dump(results, f_in, ensure_ascii=False, indent=4)