In [1]:
# use documents.json (also located in notebooks folder)

import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

# Generate unique IDs for documents

In [5]:
# Option 1 
n = len(documents)

for i in range(n): 
    documents[i]['id'] = i 
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': 1}

In [7]:
# Note, this is not stable as numbers will change once documents are updated !

In [8]:
# Option 2 ; generate ID based on content 

import hashlib
# library is used to generate a hash (a fixed-size string of characters) from the content of the document.

def generate_document_id(doc):
    """
    function combines specific fields from the document to form a string:
    combines the course, the question, and the first 10 characters of the text field

    combined string is encoded and passed to hashlib.md5() to generate an MD5 hash object
    The MD5 hash function produces a 128-bit hash value, which is a 32-character hexadecimal number    
    """
    
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    # converts the hash object to a hexadecimal string
    hash_hex = hash_object.hexdigest()
    # The first 8 characters of this hash are taken as the unique document ID
    document_id = hash_hex[:8]
    return document_id

In [9]:
# apply function 
for doc in documents:
    doc['id'] = generate_document_id(doc)
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': '1f6520ca'}

In [10]:
# verify and check for duplicates 

from collections import defaultdict

hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)
    
len(hashes), len(documents)

# note: if the lenghts differ, there might be duplcicates

(947, 948)

In [13]:
# filter for these specific duplicates 

for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

593f7569 2


In [15]:
# inspect and verify duplicates
# ideally, these shall be removed 

hashes['593f7569']

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

In [18]:
# export documents with ids to json for further processing 

import json

with open('documents-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)
    
!head documents-with-ids.json


!head documents-with-id.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",
head: documents-with-id.json: No such file or directory


# Use LLM to generate potential user questions 

In [24]:
prompt_template =     """
    You emulate a student who's taking our course.
    Formulate 5 questions this student might ask based on a FAQ record. The record
    should contain the answer to the questions, and the questions should be complete and not too short.
    If possible, use as fewer words as possible from the record. 
    
    The record:
    
    section: {section}
    question: {question}
    answer: {text}
    
    Provide the output in parsable JSON without using code blocks:
    
    ["question1", "question2", ..., "question5"]
    """.strip()

In [22]:
# set up openai api 

from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Access the API key
api_key = os.getenv('OPENAI_API_KEY')

from openai import OpenAI
client = OpenAI()

In [26]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [29]:
# use openai api to generate possible user questions for all the documents

from tqdm.auto import tqdm # this wont work in jupyter lab 
from tqdm import tqdm

results = {}

for doc in tqdm(documents, desc="Processing documents"): 
    doc_id = doc['id']

    # to handle duplicates and potential kernel downtime 
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

Processing documents: 100%|██████████████████████████████████████████████████████████████████████████████| 948/948 [29:28<00:00,  1.87s/it]


In [31]:
results

{'c02e79ef': '[\n    "What is the start date and time for the course?",\n    "How can I access the live \'Office Hours\' session at the beginning of the course?",\n    "Is there a specific platform I need to subscribe to for course updates?",\n    "What is the registration process before the course begins?",\n    "Where can I find the Telegram channel for course announcements?"\n]',
 '1f6520ca': '[\n    "What should I have experience with before enrolling in this course?",\n    "Are there any specific skills or tools I need to know for this course?",\n    "Can you list the required background knowledge for this course?",\n    "What foundational knowledge is expected from students taking this course?",\n    "Is there a reference link for understanding the necessary prerequisites for this course?"\n]',
 '7842b56a': '[\n    "Is it possible to join the course after it has already started?",\n    "What happens if I miss the registration deadline for the course?",\n    "Am I allowed to submi

# Parse results

In [32]:
# load generated user questions ie results 

import pickle

with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)

FileNotFoundError: [Errno 2] No such file or directory: 'results.bin'

In [None]:
# verify and inspect an example 
results['1f6520ca']

In [None]:
# parse results to json 

parsed_results = {}

for doc_id, json_questions in results.items():
    parsed_results[doc_id] = json.loads(json_questions)

parsed_results

In [None]:
# add the index 
doc_index = {d['id']: d for d in documents}

In [None]:
final_results = []

for doc_id, questions in parsed_results.items():
    course = doc_index[doc_id]['course']
    for q in questions:
        final_results.append((q, course, doc_id))

In [None]:
# safe to csv 
import pandas as pd 

df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])
df.to_csv('ground-truth-data.csv', index=False)
!head ground-truth-data.csv


In [None]:
# inspect "ground truth" dataset 
pd.read_csv('ground-truth-data.csv').head()