In [3]:
# Google Docs ID of the document to download.
file_id = '19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw'

# Export URL that requests the document in .docx format.
url = 'https://docs.google.com/document/d/{}/export?format=docx'.format(file_id)

In [1]:
import requests
import io
import docx

In [4]:
# Download the exported .docx. The timeout keeps a stalled connection from
# hanging the notebook indefinitely.
response = requests.get(url, timeout=30)
# raise_for_status must be *called*; a bare attribute reference is a no-op and
# would let an HTTP error response be handed to the docx parser below.
response.raise_for_status()

# Parse the document straight from the in-memory bytes (no temp file needed).
with io.BytesIO(response.content) as f_in:
    doc = docx.Document(f_in)

In [5]:
# Display the loaded Document object to confirm the file was parsed.
doc

<docx.document.Document at 0x77ac9c6668f0>

```We can access the paragraphs (lines) of our docx file via doc.paragraphs```

In [6]:
# Paragraphs are indexable; index 0 is the first paragraph of the document.
doc.paragraphs[0]

<docx.text.paragraph.Paragraph at 0x77acac283760>

```Convert the paragraph to plain text```

In [7]:
# Take the first paragraph and show its plain-text content.
p = doc.paragraphs[0]

p.text

'The purpose of this document is to capture frequently asked technical questions'

In [8]:
# Same inspection for a paragraph further into the document.
p = doc.paragraphs[5]

p.text

'Join the course Telegram channel with announcements.'

```We can also find out which type of line we have by checking its style. For example, there are several types of lines: Normal (regular text), Heading 2 (subsection), and Heading 1 (section name). By determining the type of each line we can parse the document into the format: 'course_name' - 'question' - 'answer'```

In [15]:
# Each paragraph also carries a style; show it alongside the text.
p = doc.paragraphs[1]

p.text, p.style

('General course-related questions',
 _ParagraphStyle('Heading 1') id: 131582891203008)

In [16]:
# Lower-cased style name: the form the parsing loop compares against.
p.style.name.lower()

'heading 1'

```Set the (lower-cased) style names used to recognize section and question headings```

In [17]:
# Lower-cased paragraph style names, listed in document hierarchy order.
section_heading_style = 'heading 1'   # marks the start of a section
question_heading_style = 'heading 2'  # marks a question inside a section

In [19]:
def _append_document(documents, section, question, answer_lines):
    """Append one FAQ record to ``documents`` if it is complete.

    A record is complete when it has a section, a question and at least one
    answer line. Incomplete records (for example, text that appears before
    the first heading) are skipped.

    Args:
        documents: list that receives the record dict.
        section: text of the current section heading.
        question: text of the current question heading.
        answer_lines: list of stripped paragraph texts forming the answer.
    """
    if section and question and answer_lines:
        documents.append({
            'section': section,
            'question': question,
            'text': '\n'.join(answer_lines)
        })


documents = []

current_answer = []
current_section = ''
current_question = ''


for p in doc.paragraphs:
    p_text = p.text.strip()  # drop surrounding whitespace
    p_style = p.style.name.lower()

    if p_text == '':
        continue

    if p_style == section_heading_style:
        # Flush the last question of the previous section first; resetting
        # the answer buffer without flushing would silently drop it.
        _append_document(documents, current_section, current_question, current_answer)

        current_section = p_text
        # Clear the question too, so text between this heading and the first
        # question is not attributed to the previous section's question.
        current_question = ''
        current_answer = []
        continue

    if p_style == question_heading_style:
        # A new question starts: store the one collected so far.
        _append_document(documents, current_section, current_question, current_answer)

        current_question = p_text
        current_answer = []
        continue

    # Any other non-empty paragraph belongs to the current answer.
    current_answer.append(p_text)


# The last question has no following heading to trigger a flush.
_append_document(documents, current_section, current_question, current_answer)

In [20]:
# Inspect the first parsed record.
documents[0]

{'section': 'General course-related questions',
 'question': 'Course - When will the course start？',
 'text': "The next cohort starts in Jan 2025. More info at DTC Article.\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."}

In [21]:
# Spot-check another parsed record.
documents[2]

{'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."}