In [147]:
!pip install python-docx

Defaulting to user installation because normal site-packages is not writeable


In [148]:
file_id = "19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw"



In [149]:
import requests
import io
import docx

In [150]:
def clean_line(line):
    line = line.strip()
    line = line.strip('\uFEFF')
    return line  # <-- important!

def read_faq(file_id):
    url = f"https://docs.google.com/document/d/{file_id}/export?format=docx"

    response = requests.get(url)
    response.raise_for_status()

    with io.BytesIO(response.content) as f_in:
        doc = docx.Document(f_in)

    questions = []

    section_heading_style = 'Heading 1'
    question_heading_style = 'Heading 2'
    

    section_title = ''
    question_title = ''
    answer_text_so_far = ''

    for p in doc.paragraphs:
        style = p.style.name
        p_text = clean_line(p.text)

        if len(p_text) == 0:
            continue

        if style == section_heading_style:
            section_title = p_text
            continue

        if style == question_heading_style:
            answer_text_so_far = answer_text_so_far.strip()
            if answer_text_so_far != '' and section_title != '' and question_title != '':
                questions.append({
                    'text': answer_text_so_far,
                    'section': section_title,
                    'question': question_title,
                })
            answer_text_so_far = ''
            question_title = p_text
            continue

        answer_text_so_far += '\n' + p_text

    # Add the last question after the loop ends
    answer_text_so_far = answer_text_so_far.strip()
    if answer_text_so_far != '' and section_title != '' and question_title != '':
        questions.append({
            'text': answer_text_so_far,
            'section': section_title,
            'question': question_title,
        })

    return questions




In [151]:
p = doc.paragraphs[2]

In [152]:
documents = []

current_answer = []
current_section = ''
current_question = ''

section_heading_style = 'heading 1'
question_heading_style = 'heading 2'

for p in doc.paragraphs:
    p_text = p.text.strip()
    p_style = p.style.name.lower()

    if p_text == '':
        continue

    if p_style == section_heading_style:
        current_section = p_text
        current_answer = []
        continue

    if p_style == question_heading_style:
        # Sauvegarder l'ancienne question
        if current_section and current_question and current_answer:
            answer = '\n'.join(current_answer)
            documents.append({
                'section': current_section, 
                'question': current_question,
                'text': answer
            })
        # Nouvelle question
        current_question = p_text
        current_answer = []
        continue

    # Accumuler les lignes de réponse
    current_answer.append(p_text)

# Ajouter la dernière question
if current_section and current_question and current_answer:
    answer = '\n'.join(current_answer)
    documents.append({
        'section': current_section, 
        'question': current_question,
        'text': answer
    })


In [153]:
len(documents)

439

In [154]:
documents[1]

{'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'text': 'See DE zoomcamp 2025 pre-course Q&A\nTo get the most out of this course, you should have:\nBasic coding experience\nFamiliarity with SQL\nExperience with Python (helpful but not required)\nNo prior data engineering experience is necessary. See Readme on GitHub'}

In [155]:
for p in doc.paragraphs:
    print(f"'{p.text.strip()}' — style: {p.style.name}")

'Data Engineering Zoomcamp FAQ' — style: Title
'Data Engineering Zoomcamp FAQ' — style: Title
'The purpose of this document is to capture Frequently asked technical questions' — style: normal
'Editing guidelines:' — style: normal
'When adding a new FAQ entry, make sure the question is “Heading 2”' — style: normal
'Feel free to improve if you see something is off' — style: normal
'Don’t change the formatting in the Data document or add any visual “improvements” (make a copy for yourself first if you need to do it for whatever reason)' — style: normal
'Don’t change the pages format (it should be “pageless”)' — style: normal
'Add name and date for reference, if possible' — style: normal
'General course-related questions' — style: Heading 1
'Course - When does the course start?' — style: Heading 2
'The next cohort starts January 13th 2025. More info at DTC.' — style: normal
'Register before the course starts using this link.' — style: normal
'Joint the course Telegram channel with announce

In [156]:
print(f"Total questions parsed: {len(documents)}")

for item in documents[:3]:  # just first 3 entries
    print(f"\n[SECTION] {item['section']}")
    print(f"[QUESTION] {item['question']}")
    print(f"[ANSWER]\n{item['text']}")

Total questions parsed: 439

[SECTION] General course-related questions
[QUESTION] Course - When does the course start?
[ANSWER]
The next cohort starts January 13th 2025. More info at DTC.
Register before the course starts using this link.
Joint the course Telegram channel with announcements.
Don’t forget to register in DataTalks.Club's Slack and join the channel.

[SECTION] General course-related questions
[QUESTION] Course - What are the prerequisites for this course?
[ANSWER]
See DE zoomcamp 2025 pre-course Q&A
To get the most out of this course, you should have:
Basic coding experience
Familiarity with SQL
Experience with Python (helpful but not required)
No prior data engineering experience is necessary. See Readme on GitHub

[SECTION] General course-related questions
[QUESTION] Course - Can I still join the course after the start date?
[ANSWER]
Yes, even if you don't register, you're still eligible to submit the homework.
Be aware, however, that there will be deadlines for turnin

In [157]:
file_id = "19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw"
faq_data = read_faq(file_id)
print(f"✅ Total questions: {len(faq_data)}")
print(faq_data[0])

✅ Total questions: 446
{'text': "The next cohort starts January 13th 2025. More info at DTC.\nRegister before the course starts using this link.\nJoint the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", 'section': 'General course-related questions', 'question': 'Course - When does the course start?'}
