# Applying python-docx to HTML content

In [10]:
# https://github.com/dfop02/html4docx/blob/main/html4docx/h4d.py

import requests
import re
from bs4 import BeautifulSoup
from docx import Document
from docx.enum.table import WD_TABLE_ALIGNMENT

# Module-level Word document; the cells below append content to it and
# finally save it.
# NOTE(review): this variable shares its name with the `docx` package, so a
# later `import docx` would rebind it.
docx = Document()

def getHTMLdocument(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded body of the HTTP response (``response.text``).

    Raises:
        requests.RequestException: If the connection fails, the request
            times out, or the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to the
    # HTML parser further down.
    response.raise_for_status()
    return response.text

def add_paragraph_style(paragraph, style):
    """Assign the document style named *style* to *paragraph*.

    The style object is looked up in the ``styles`` collection of the
    module-level ``docx`` document.
    """
    resolved_style = docx.styles[style]
    paragraph.style = resolved_style

In [11]:
# Fetch the page and isolate the <section class="g-py-50"> element that the
# following cells convert.
url_to_scrape = "https://www.sntl-publishing.com/neues/arbeitsmedizin/"
html_document = getHTMLdocument(url_to_scrape)

# Strip HTML comments before parsing. The pattern is anchored on the real
# comment delimiters and uses DOTALL so that comments spanning several lines
# are removed as well.
html_document = re.sub(r'<!--.*?-->', '', html_document, flags=re.DOTALL)

soup = BeautifulSoup(html_document, 'html.parser')
content = soup.find('section', attrs={'class': 'g-py-50'})
if content is None:
    # Stop here with a clear message rather than failing later with an
    # AttributeError on None.
    raise ValueError(
        "No <section class='g-py-50'> element found at " + url_to_scrape
    )

In [12]:
# Remove every <div> inside the section; unwrap() replaces the tag with its
# own children, so the nested content stays in place.
for wrapper in content.find_all("div"):
    wrapper.unwrap()

# Do the same for the <a class="fancybox"> links, keeping what they enclose.
fancybox_links = content.find_all('a', attrs={'class': 'fancybox'})
for link in fancybox_links:
    link.unwrap()

In [13]:
# [e for e in content]

In [14]:
from urllib.parse import urljoin, urlsplit

HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
IMAGE_BASE_URL = 'https://www.sntl-publishing.com'
IMAGE_TIMEOUT = 30  # seconds to wait for each image download


def _add_table(document, table_element):
    """Copy the cells of an HTML <table> into a new Word table."""
    # One list of <td>/<th> cells per <tr>; rows without cells are dropped.
    rows = [tr.find_all(['td', 'th']) for tr in table_element.find_all('tr')]
    rows = [cells for cells in rows if cells]
    if not rows:
        return

    # Size the table up front: as many columns as the widest row needs.
    column_count = max(len(cells) for cells in rows)
    table = document.add_table(rows=len(rows), cols=column_count)
    table.style = 'Table Grid'
    table.alignment = WD_TABLE_ALIGNMENT.CENTER
    for row_index, cells in enumerate(rows):
        for column_index, cell in enumerate(cells):
            table.cell(row_index, column_index).text = cell.text


def _add_image(document, img_element):
    """Download the image referenced by an <img> tag and insert it.

    Best effort: a failed download or an image format that cannot be
    embedded is reported and skipped so the rest of the page is still
    converted.
    """
    src = img_element.get('src')
    if not src:
        return  # nothing to download

    # urljoin leaves absolute URLs untouched and resolves relative ones
    # against the site root.
    image_url = urljoin(IMAGE_BASE_URL, src)
    # Local file name: last path segment, without any query string.
    img_name = urlsplit(image_url).path.split('/')[-1] or 'image'

    try:
        response = requests.get(image_url, timeout=IMAGE_TIMEOUT)
        response.raise_for_status()
        with open(img_name, 'wb') as image_file:
            image_file.write(response.content)
        document.add_picture(img_name)
    except Exception as exc:
        # Top-level script boundary: download, disk and image-parsing errors
        # are all non-fatal here, but they are reported instead of hidden.
        print('Skipping image', image_url, '-', exc)


# Walk the direct children of the section and append a matching Word element
# for each supported tag. Text nodes have name None and match no branch.
for element in content:
    tag = element.name
    if tag == 'title':
        docx.core_properties.title = element.get_text()
    elif tag in HEADING_TAGS:
        # <h1>..<h6> map to Word heading levels 1..6.
        docx.add_heading(element.text, level=int(tag[1]))
    elif tag == 'p':
        # Each paragraph is added exactly once; class "caption" selects the
        # Caption style.
        if 'caption' in (element.get('class') or []):
            docx.add_paragraph(element.text, style='Caption')
        else:
            docx.add_paragraph(element.text)
    elif tag == 'ul':
        for li in element.find_all('li'):
            # 'List Bullet' is the style *name*; 'ListBullet' is its id.
            docx.add_paragraph(li.text, style='List Bullet')
    elif tag == 'table':
        _add_table(docx, element)
    elif tag == 'img':
        _add_image(docx, element)

docx.save('demo.docx')