In [8]:
import io

import requests
import docx

In [None]:
def clean_line(line):
    """Return ``line`` without surrounding whitespace or byte-order marks.

    Args:
        line: The text to clean.

    Returns:
        ``line`` with every leading and trailing run of whitespace and
        U+FEFF (BOM / zero-width no-break space) characters removed, in
        whatever order they are interleaved. Characters in the interior of
        the string are left untouched.
    """
    cleaned = line.strip()
    # str.strip() does not treat U+FEFF as whitespace, so a BOM can shield
    # whitespace sitting right next to it. Alternate between the two kinds of
    # stripping until neither end carries a BOM any more.
    while cleaned.startswith("\ufeff") or cleaned.endswith("\ufeff"):
        cleaned = cleaned.strip("\ufeff").strip()
    return cleaned


def download_gdrive_file(file_id, timeout=30):
    """Download a Google Docs document exported in DOCX format.

    Args:
        file_id: The document ID, i.e. the part of the document URL that
            follows ``https://docs.google.com/document/d/``.
        timeout: Value forwarded to ``requests.get(timeout=...)``. Without
            a timeout, requests waits forever on a stalled connection.

    Returns:
        The body of the HTTP response as ``bytes``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (raised by ``response.raise_for_status()``).
        requests.RequestException: For other request failures, including
            timeouts.
    """
    url = f"https://docs.google.com/document/d/{file_id}/export?format=docx"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content


def save_file(file_id, file_name):
    """Download a Google Docs document and store it on disk as DOCX.

    Args:
        file_id: The Google Docs document ID to download.
        file_name: Path of the file to write.

    Returns:
        ``True`` if content was downloaded and written, ``False`` if the
        download returned no content (in which case nothing is written).

    Raises:
        Whatever ``download_gdrive_file`` raises for a failed download, and
        ``OSError`` if the target file cannot be written.
    """
    # Download before opening the target: opening with "wb" truncates the
    # file immediately, so a failed or empty download would otherwise destroy
    # an existing file or leave an empty one behind.
    content = download_gdrive_file(file_id)
    if not content:
        return False

    with open(file_name, "wb") as f_out:
        f_out.write(content)
    return True


def read_docx(docx_path=None, stream=None):
    """Load a DOCX document from a file path or from in-memory bytes.

    Args:
        docx_path: Path of a DOCX file to open in binary mode. Takes
            precedence over ``stream`` when both are given.
        stream: Raw bytes of a DOCX file, wrapped in ``io.BytesIO``.

    Returns:
        The object produced by ``docx.Document`` for the chosen source, or
        ``None`` when neither argument is truthy.
    """
    if not docx_path and not stream:
        return None

    # Pick the binary source once, then parse it under a single `with` so the
    # handle is closed as soon as the document has been read.
    if docx_path:
        source = open(docx_path, "rb")
    else:
        source = io.BytesIO(stream)

    with source as f_in:
        return docx.Document(f_in)


def read_faq(file_id):
    """Download a FAQ Google Doc and parse it into question records.

    The document is walked paragraph by paragraph. Paragraphs whose style
    name is "heading 1" (case-insensitive) start a new section, paragraphs
    styled "heading 2" start a new question, and every other non-empty
    paragraph is treated as a line of the current question's answer.

    Args:
        file_id: The Google Docs document ID of the FAQ.

    Returns:
        A list of dicts with the keys ``"section"``, ``"question"`` and
        ``"text"``. ``"text"`` is the answer's paragraphs joined by newlines.
        A record is only produced when the section title, the question title
        and the answer are all non-empty.

    Raises:
        Exception: If the document could not be loaded.
    """
    # The download is raw bytes, so it has to be passed as ``stream``; passed
    # positionally it would land in ``docx_path`` and be opened as a file name.
    doc = read_docx(stream=download_gdrive_file(file_id))
    if not doc:
        raise Exception("Failed to load FAQ file")

    question_heading_style = "heading 2"
    section_heading_style = "heading 1"

    questions = []
    section_title = ""
    question_title = ""
    answer_lines = []

    def flush_question():
        """Record the question collected so far, if it is complete."""
        if section_title and question_title and answer_lines:
            questions.append(
                {
                    "section": section_title,
                    "question": question_title,
                    "text": "\n".join(answer_lines),
                }
            )

    for p in doc.paragraphs:
        style = p.style.name.lower()
        p_text = clean_line(p.text)

        if not p_text:
            continue

        if style == section_heading_style:
            # Flush before switching sections so the last question of the
            # previous section keeps its own section title.
            flush_question()
            section_title = p_text
            # Text between a section heading and its first question belongs
            # to no question, so forget the previous question entirely.
            question_title = ""
            answer_lines = []
            continue

        if style == question_heading_style:
            flush_question()
            question_title = p_text
            # Always start the new answer from scratch, even when nothing was
            # flushed, so preamble text never leaks into this question.
            answer_lines = []
            continue

        answer_lines.append(p_text)

    # The final question has no following heading to trigger a flush.
    flush_question()

    return questions

In [None]:
# Course name -> Google Docs document ID of that course's FAQ (the ID is the
# part of the document URL after "https://docs.google.com/document/d/").
faq_documents = {
    "data-engineering-zoomcamp": "19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw",
    "machine-learning-zoomcamp": "1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8",
    "mlops-zoomcamp": "12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0",
    "llm-zoomcamp": "1m2KexowAXTmexfC5rVTCSnaShvdUQ8Ag2IEiwBDHxN0",
}

# One entry per course: {"course": <course name>, "documents": <read_faq result>}.
documents = []

for course, file_id in faq_documents.items():
    # Print the course and its document link before fetching and parsing it.
    print(f"{course}, link: https://docs.google.com/document/d/{file_id}")
    documents.append({"course": course, "documents": read_faq(file_id)})

data-engineering-zoomcamp, link: https://docs.google.com/document/d/19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw
machine-learning-zoomcamp, link: https://docs.google.com/document/d/1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8
mlops-zoomcamp, link: https://docs.google.com/document/d/12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0


In [None]:
import json
# Serialize the collected per-course FAQ entries to documents.json,
# pretty-printed with a 2-space indent.
with open('documents.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)
# IPython shell escape: preview the beginning of the written file.
!head documents.json