In [1]:
import io

import requests
import docx

In [3]:
url_de_zoomcamp = 'https://docs.google.com/document/d/19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw/edit?tab=t.0#heading=h.edeyusfgl4b7'
url_ml_zoomcamp = 'https://docs.google.com/document/d/1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8/edit?tab=t.0#heading=h.s7drv4piz29d'
url_mlops_zoomcamp = 'https://docs.google.com/document/d/12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit?tab=t.0#heading=h.sh0bgh8fj5rw'

In [17]:
file_id = '19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw'

url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'

In [18]:
response = requests.get(url)

response.raise_for_status()

In [25]:
doc = io.BytesIO(response.content)


doc = docx.Document(doc)

In [69]:
a = doc.paragraphs[0]

a

<docx.text.paragraph.Paragraph at 0x73bfa83e96a0>

In [74]:
a.text

'Data Engineering Zoomcamp FAQ'

In [79]:
p = doc.paragraphs[0:20]

In [67]:
type(p)

list

In [80]:
for i in p:
    print(i.style.name)
    print(i.text)
    print('\n')


Title
Data Engineering Zoomcamp FAQ


Title
    Data Engineering Zoomcamp FAQ


normal
The purpose of this document is to capture Frequently asked technical questions


normal
Editing guidelines:


normal
When adding a new FAQ entry, make sure the question is “Heading 2”


normal
Feel free to improve if you see something is off


normal
Don’t change the formatting in the Data document or add any visual “improvements” (make a copy for yourself first if you need to do it for whatever reason)


normal
Don’t change the pages format (it should be “pageless”)


normal
Add name and date for reference, if possible


Heading 1
General course-related questions


Heading 2
Course - When does the course start?


normal
The next cohort starts January 13th 2025. More info at DTC.


normal
Register before the course starts using this link.


normal
Joint the course Telegram channel with announcements.


normal
Don’t forget to register in DataTalks.Club's Slack and join the channel.


Heading 2
Course

In [56]:
p.style.name.lower()

'heading 1'

In [57]:
p.text, p.style

('General course-related questions',
 _ParagraphStyle('Heading 1') id: 127266995478576)

In [22]:
response.headers

{'Content-Type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'X-Robots-Tag': 'noindex, nofollow, nosnippet', 'Cache-Control': 'no-cache, no-store, max-age=0, must-revalidate', 'Pragma': 'no-cache', 'Expires': 'Mon, 01 Jan 1990 00:00:00 GMT', 'Date': 'Fri, 13 Jun 2025 20:27:46 GMT', 'Content-Disposition': 'attachment; filename="DataEngineeringZoomcampFAQ.docx"; filename*=UTF-8\'\'Data%20Engineering%20Zoomcamp%20FAQ.docx', 'Transfer-Encoding': 'chunked', 'Access-Control-Allow-Origin': '*', 'Access-Control-Expose-Headers': 'Cache-Control,Content-Disposition,Content-Length,Content-Type,Date,Expires,Pragma,Server,Transfer-Encoding,X-Google-GFE-Backend-Request-Cost', 'Content-Security-Policy': "require-trusted-types-for 'script';report-uri https://csp.withgoogle.com/csp/docs-tt, frame-ancestors 'self' https://docs.google.com, base-uri 'self';object-src 'none';report-uri https://doc-08-10-docstext.googleusercontent.com/document/cspreport;script-src 'nonce-wn9z2f

In [4]:
def clean_line(line):
    line = line.strip()
    line = line.strip('\uFEFF')
    return line

def read_faq(file_id):
    url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'
    
    response = requests.get(url)
    response.raise_for_status()
    
    with io.BytesIO(response.content) as f_in:
        doc = docx.Document(f_in)

    questions = []

    question_heading_style = 'heading 2'
    section_heading_style = 'heading 1'
    
    heading_id = ''
    section_title = ''
    question_title = ''
    answer_text_so_far = ''
     
    for p in doc.paragraphs:
        style = p.style.name.lower()
        p_text = clean_line(p.text)
    
        if len(p_text) == 0:
            continue
    
        if style == section_heading_style:
            section_title = p_text
            continue
    
        if style == question_heading_style:
            answer_text_so_far = answer_text_so_far.strip()
            if answer_text_so_far != '' and section_title != '' and question_title != '':
                questions.append({
                    'text': answer_text_so_far,
                    'section': section_title,
                    'question': question_title,
                })
                answer_text_so_far = ''
    
            question_title = p_text
            continue
        
        answer_text_so_far += '\n' + p_text
    
    answer_text_so_far = answer_text_so_far.strip()
    if answer_text_so_far != '' and section_title != '' and question_title != '':
        questions.append({
            'text': answer_text_so_far,
            'section': section_title,
            'question': question_title,
        })

    return questions

In [5]:
faq_documents = {
    'data-engineering-zoomcamp': '19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw',
    'machine-learning-zoomcamp': '1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8',
    'mlops-zoomcamp': '12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0',
}

In [6]:
documents = []

for course, file_id in faq_documents.items():
    print(course)
    course_documents = read_faq(file_id)
    documents.append({'course': course, 'documents': course_documents})

data-engineering-zoomcamp
machine-learning-zoomcamp
mlops-zoomcamp


In [7]:
import json

In [8]:
with open('documents.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [9]:
!head documents.json


[
  {
    "course": "data-engineering-zoomcamp",
    "documents": [
      {
        "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
        "section": "General course-related questions",
        "question": "Course - When will the course start?"
      },
      {
