In [9]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [10]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [13]:
documents[1]['course']

'data-engineering-zoomcamp'

In [14]:
f"{documents[1]['course']}-{documents[1]['question']}-{documents[1]['text'][:10]}"

'data-engineering-zoomcamp-Course - What are the prerequisites for this course?-GitHub - D'

In [15]:
import hashlib

def generate_document_id(doc):
    # combined = f"{doc['course']}-{doc['question']}"
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [16]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [17]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': '1f6520ca'}

In [20]:
from collections import defaultdict

In [21]:
hashes =  defaultdict(list)

for doc in documents:
    h = generate_document_id(doc)
    doc['id'] = h
    hashes[h].append(doc)

len(hashes), len(documents)

(947, 948)

In [22]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': '1f6520ca'}

In [23]:
for k, values in hashes.items():
    if len(values) > 1:
        print(k)

593f7569


In [11]:
hashes['593f7569']

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

In [29]:
import json

In [30]:
 with open('documents-with-ids.json', 'wt') as f_out:
     json.dump(documents, f_out, indent=2)

In [31]:
!head documents-with-ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


In [42]:
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]

""".strip()



In [43]:
from openai import OpenAI
client = OpenAI()

In [44]:
doc = documents[2]
prompt = prompt_template.format(**doc)

In [45]:
print(prompt)

You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]


In [47]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)
    
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    json_response = response.choices[0].message.content
    return json_response

In [52]:
json_questions = generate_questions(documents[2])

In [53]:
json.loads(json_questions)

['Is it possible to enroll in the course after it has already started?',
 'What happens if I miss the registration deadline for the course?',
 'Can I participate in homework submissions if I join late?',
 'Are there any deadlines I need to consider for final projects?',
 'Should I avoid procrastinating on my course assignments?']

In [54]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [55]:
results = {}

In [56]:
# for doc in tqdm(documents):
#     doc_id = doc['id']
#     if doc_id in results:
#         continue
#     questions = generate_questions(doc)
#     results[doc_id] = questions

100%|███████████████████████████████████| 948/948 [39:40<00:00,  2.51s/it]


In [None]:
results

In [58]:
import pickle

In [60]:
import pickle

# After your loop that fills results:
with open('results.bin', 'wb') as f_out:
    pickle.dump(results, f_out)

In [61]:
with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)

In [87]:
results

{'c02e79ef': '[\n    "What is the specific date and time when the course is set to begin?",\n    "How can I stay updated with course announcements and important dates?",\n    "Is there a registration process required before the course starts?",\n    "What platform should I use to access the course\'s public calendar?",\n    "Where can I find the link to register for the course?"\n]',
 '1f6520ca': '[\n    "What specific skills or knowledge do I need before enrolling in this course?",\n    "Can you point me to where I can find the requirements for this course?",\n    "Are there any prior courses or experiences necessary for joining this program?",\n    "Is there a resource that outlines the prerequisites for this course?",\n    "What should I have completed before I start this course?"\n]',
 '7842b56a': '[\n    "Is it possible to enroll in the course after it has begun?",\n    "If I miss the registration, can I still participate in homework assignments?",\n    "Are there any specific dea

In [91]:
#print(results['c02e79ef'])

In [92]:
#results['f476a606']

In [93]:
#print(results['e41b100c'])

In [94]:
import json
import re

parsed_results = {}

for doc_id, json_questions in results.items():
    print(doc_id)
    cleaned = re.sub(r',\s*\]', ']', json_questions)  # remove ", ]"
    parsed_results[doc_id] = json.loads(cleaned)
    print("parsed")


c02e79ef
parsed
1f6520ca
parsed
7842b56a
parsed
0bbf41ec
parsed
63394d91
parsed
2ed9b986
parsed
93e2c8ed
parsed
a482086d
parsed
eb56ae98
parsed
4292531b
parsed
ea739c65
parsed
cb257ee5
parsed
04aa4897
parsed
9681be3b
parsed
a1daf537
parsed
be5bfee4
parsed
0e424a44
parsed
29865466
parsed
016d46a1
parsed
47972cb1
parsed
ddf6c1b3
parsed
ac25d3af
parsed
251218fc
parsed
3c0114ce
parsed
f43f5fe7
parsed
d061525d
parsed
1cd01b2c
parsed
e4a7c3b0
parsed
7cd1912e
parsed
52393fb3
parsed
10515af5
parsed
cdb86a97
parsed
3e0114ad
parsed
b2799574
parsed
2f19301f
parsed
7c700adb
parsed
44b14808
parsed
76e4baf6
parsed
48b533a8
parsed
954044d1
parsed
a820b9b3
parsed
f2945cd2
parsed
eb9d376f
parsed
72f25f6d
parsed
a1e59afc
parsed
71c10610
parsed
17a5aea1
parsed
5a275db7
parsed
7ec0f9b0
parsed
bb1ba786
parsed
2f83dbe7
parsed
543ff080
parsed
d407d65b
parsed
c9375c56
parsed
e866156b
parsed
16370470
parsed
316df755
parsed
f3aa9252
parsed
a4abe7a5
parsed
fb930700
parsed
aa187680
parsed
b000e899
parsed
9c66759f

In [101]:
#results['c02e79ef']

In [102]:
#parsed_results['e41b100c']

In [103]:
#print(json_questions)

In [104]:
#json_questions

In [98]:
#!pwd


In [99]:
#!ls -lh results.bin

In [100]:
# from IPython.display import FileLink
# FileLink('results.bin')

In [105]:
parsed_results

{'c02e79ef': ['What is the specific date and time when the course is set to begin?',
  'How can I stay updated with course announcements and important dates?',
  'Is there a registration process required before the course starts?',
  "What platform should I use to access the course's public calendar?",
  'Where can I find the link to register for the course?'],
 '1f6520ca': ['What specific skills or knowledge do I need before enrolling in this course?',
  'Can you point me to where I can find the requirements for this course?',
  'Are there any prior courses or experiences necessary for joining this program?',
  'Is there a resource that outlines the prerequisites for this course?',
  'What should I have completed before I start this course?'],
 '7842b56a': ['Is it possible to enroll in the course after it has begun?',
  'If I miss the registration, can I still participate in homework assignments?',
  'Are there any specific deadlines I should know about for the final project?',
  'Wha

In [106]:
doc_index = {d['id']: d for d in documents}

In [110]:
print(doc_index)



In [112]:
course = doc_index['c02e79ef']['course']
print(course)

data-engineering-zoomcamp


In [113]:
final_results = []

for doc_id, questions in parsed_results.items():
    course = doc_index[doc_id]['course']
    for q in questions:
        final_results.append((q, course, doc_id))

In [114]:
final_results

[('What is the specific date and time when the course is set to begin?',
  'data-engineering-zoomcamp',
  'c02e79ef'),
 ('How can I stay updated with course announcements and important dates?',
  'data-engineering-zoomcamp',
  'c02e79ef'),
 ('Is there a registration process required before the course starts?',
  'data-engineering-zoomcamp',
  'c02e79ef'),
 ("What platform should I use to access the course's public calendar?",
  'data-engineering-zoomcamp',
  'c02e79ef'),
 ('Where can I find the link to register for the course?',
  'data-engineering-zoomcamp',
  'c02e79ef'),
 ('What specific skills or knowledge do I need before enrolling in this course?',
  'data-engineering-zoomcamp',
  '1f6520ca'),
 ('Can you point me to where I can find the requirements for this course?',
  'data-engineering-zoomcamp',
  '1f6520ca'),
 ('Are there any prior courses or experiences necessary for joining this program?',
  'data-engineering-zoomcamp',
  '1f6520ca'),
 ('Is there a resource that outlines th

In [115]:
import pandas as pd

In [117]:
df = pd.DataFrame(final_results, columns=['questions', 'course', 'document'])

In [119]:
df.to_csv('ground-truth-data.csv', index =  False)

In [124]:
!head ground-truth-data.csv

questions,course,document
What is the specific date and time when the course is set to begin?,data-engineering-zoomcamp,c02e79ef
How can I stay updated with course announcements and important dates?,data-engineering-zoomcamp,c02e79ef
Is there a registration process required before the course starts?,data-engineering-zoomcamp,c02e79ef
What platform should I use to access the course's public calendar?,data-engineering-zoomcamp,c02e79ef
Where can I find the link to register for the course?,data-engineering-zoomcamp,c02e79ef
What specific skills or knowledge do I need before enrolling in this course?,data-engineering-zoomcamp,1f6520ca
Can you point me to where I can find the requirements for this course?,data-engineering-zoomcamp,1f6520ca
Are there any prior courses or experiences necessary for joining this program?,data-engineering-zoomcamp,1f6520ca
Is there a resource that outlines the prerequisites for this course?,data-engineering-zoomcamp,1f6520ca


In [125]:
!pwd

/workspaces/project_llm1/03_evaluation


In [126]:
!ls -lh ground-truth-data.csv

-rw-rw-rw- 1 codespace codespace 542K Oct 11 19:07 ground-truth-data.csv


In [127]:
from IPython.display import FileLink
FileLink('ground-truth-data.csv')