# Applying python-docx to ZMS content

## Load basic libraries

In [9]:
import ZODB
import os
from Products.Five.browser.tests.pages import SimpleView
from Testing.makerequest import makerequest 							# makerequest(context)
from Testing.ZopeTestCase.testZODBCompat import make_request_response 	# make_request_response()[1]
from Acquisition import aq_get
from Products.zms import standard
from Products.zms import rest_api
import json

import re
from bs4 import BeautifulSoup
import requests

import docx
from docx.shared import Pt
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.style import WD_STYLE_TYPE

## Open ZODB

In [10]:
# Create a ZODB connection to an existing ZODB database file
wd = '/home/zope/instance/zms5_dev/var/'
try:
	db = ZODB.DB(os.path.join(wd, 'Data.fs'))
	conn = db.open()
	root = conn.root
	###{'Application': <Application at >}
except Exception as exc:
	# `db` is only bound if this cell (or this try-block) got far enough to
	# create it. Presumably the usual failure is re-running the cell while the
	# previous `db` still holds Data.fs open -- TODO confirm. Look the name up
	# defensively so a failure on the very first run does not raise NameError
	# and hide the real cause.
	stale_db = globals().get('db')
	if stale_db is not None:
		stale_db.close()
	print('Error: Database connection had to be closed before reopened.')
	print('Cause: %r' % (exc,))

## Get ZODB Context

In [11]:
# ZMS-Node /myzms2/content
# ##############################################
context = root.Application.myzms2.content
# ##############################################

# Wrap the node with makerequest() so that context.REQUEST is available
context = makerequest(context)
# Server variables: only filled in when not already present in the environ
for env_key, env_value in (('SERVER_NAME', 'localhost'), ('SERVER_PORT', '8087')):
	context.REQUEST.environ.setdefault(env_key, env_value)
# Base URL used later for the document header and for relative image sources
context.REQUEST['URL'] = 'http://localhost:8087'
# Request variables
for req_key, req_value in (('lang', 'ger'), ('path_to_handle', '')):
	context.REQUEST.set(req_key, req_value)
# Attach a RESPONSE object (second item returned by make_request_response())
context.REQUEST.set('RESPONSE', make_request_response()[1])

zmscontext = context.e5
request = rest_api._get_request(zmscontext)

## Example API calls for extracting content from ZMS objects

In [12]:
# Fetch the node e36 below e34 and display the first item of its
# 'standard_json' attribute (the export script further down treats item 0 as
# the document's metadata and the remaining items as its content blocks).
# The bare expression on the last line is what the notebook cell displays.
node = context.e34.e36
node.attr('standard_json')[0]

{'id': 'e36',
 'meta_id': 'ZMSDocument',
 'parent_id': 'e34',
 'parent_meta_id': 'ZMSFolder',
 'title': 'First Document',
 'descripton': '',
 'last_change_dt': time.struct_time(tm_year=2024, tm_mon=6, tm_mday=25, tm_hour=15, tm_min=48, tm_sec=11, tm_wday=1, tm_yday=177, tm_isdst=1)}

## DOCX-XML Helper Functions

1. `add_page_number(run)`: adds a page-number field to a text run (e.g. in the footer)
2. `add_bottom_border(style)`: adds bottom-border properties to a paragraph style object

_Hint: the docx API does not support the page counter directly. We have to create a custom footer with a page counter._

In [13]:
from docx.oxml import OxmlElement, ns

def create_element(name):
	"""Return a new element created by ``OxmlElement(name)``.

	``name`` is a namespace-prefixed tag name; the callers in this file pass
	values such as ``'w:fldChar'`` or ``'w:pBdr'``.
	"""
	return OxmlElement(name)

def create_attribute(element, name, value):
	"""Set the attribute ``name`` on ``element`` to ``value``.

	``name`` is a namespace-prefixed attribute name (the callers in this file
	pass e.g. ``'w:val'``); it is passed through ``ns.qn`` before being set.
	"""
	element.set(ns.qn(name), value)

# PAGE NUMBER
def add_page_number(run):
	"""Append a ``PAGE`` field to ``run``.

	Three elements are appended to the run's XML element, in this order: a
	``w:fldChar`` of type ``begin``, a ``w:instrText`` holding the instruction
	``PAGE`` and a ``w:fldChar`` of type ``end``.
	"""
	field_begin = OxmlElement('w:fldChar')
	field_begin.set(ns.qn('w:fldCharType'), 'begin')

	field_code = OxmlElement('w:instrText')
	field_code.set(ns.qn('xml:space'), 'preserve')
	field_code.text = "PAGE"

	field_end = OxmlElement('w:fldChar')
	field_end.set(ns.qn('w:fldCharType'), 'end')

	for field_part in (field_begin, field_code, field_end):
		run._r.append(field_part)

# BORDER BOTTOM
# Children of w:pPr that have to follow w:pBdr according to the
# WordprocessingML schema sequence; used to insert the border at a valid
# position instead of blindly appending it.
_PBDR_SUCCESSORS = (
	'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap',
	'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN',
	'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind',
	'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc',
	'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap',
	'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr',
	'w:pPrChange',
)

def add_bottom_border(style, color='017D87', size='2', space='9'):
	"""Give a paragraph style a single-line bottom border.

	Args:
		style: python-docx paragraph style whose XML element is modified.
		color: value written to the ``w:color`` attribute (hex RGB string).
		size: value written to the ``w:sz`` attribute.
		space: value written to the ``w:space`` attribute.

	The defaults reproduce the previously hard-coded border. An existing
	``w:pBdr`` of the style is replaced, so calling the function twice does
	not stack two border definitions.
	"""
	border = create_element('w:pBdr') # pBdr = Paragraph border
	bottom = create_element('w:bottom')
	create_attribute(bottom, 'w:val', 'single')
	create_attribute(bottom, 'w:sz', str(size))
	create_attribute(bottom, 'w:space', str(space))
	create_attribute(bottom, 'w:color', color)
	border.append(bottom)

	# pPr = Paragraph properties; a style without any paragraph formatting has
	# no w:pPr child yet, so create it on demand instead of assuming it exists.
	pPr = style.element.get_or_add_pPr()
	for previous_border in pPr.findall(ns.qn('w:pBdr')):
		pPr.remove(previous_border)
	# Insert before the first following sibling defined by the schema (appends
	# when none of them is present), e.g. ahead of an existing w:spacing.
	pPr.insert_element_before(border, *_PBDR_SUCCESSORS)


## Convert HTML-Richtext to Word-Docx 

In [14]:
# #############################################
# INIT DOCUMENT
# #############################################

doc = docx.Document()	# Hint: may use template like docx.Document('template.docx')

# #############################################
# STYLES
# #############################################
styles = doc.styles
# Custom colors: #017D87
custom_color1 = docx.shared.RGBColor(1, 125, 135)
# Normal
normal_style = styles['Normal']
normal_style.font.name = 'Arial'
normal_style.font.size = Pt(9)
normal_style.paragraph_format.space_after = Pt(6)
normal_style.paragraph_format.space_before = Pt(6)
normal_style.paragraph_format.line_spacing = 1.35
# Headlines derived from Normal.
# python-docx exposes style inheritance as `base_style`; assigning to
# `basedOn` does not change the style definition in the document.
# (style name, font size in pt, bold) -- bold is only touched where the
# value is not None, the other headings keep the template's setting.
for heading_name, heading_size, heading_bold in (
	('Heading 1', 24, None),
	('Heading 2', 18, None),
	('Heading 3', 12, True),
):
	heading_style = styles[heading_name]
	heading_style.base_style = normal_style
	heading_style.font.size = Pt(heading_size)
	if heading_bold is not None:
		heading_style.font.bold = heading_bold
	heading_style.font.color.rgb = custom_color1
# Custom paragraph style 'Description' derived from Normal
description_style = styles.add_style('Description', WD_STYLE_TYPE.PARAGRAPH)
description_style.base_style = normal_style
description_style.font.name = 'Arial'
description_style.font.size = Pt(9)
description_style.font.italic = True
description_style.font.color.rgb = custom_color1
description_style.paragraph_format.space_after = Pt(12)
description_style.paragraph_format.line_spacing = 1.35
add_bottom_border(description_style)


# #############################################	
# HTML/RICHTEXT PROCESSOR
# #############################################	
# HTML comments, including multi-line ones
_HTML_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
# Paired tags with nothing but whitespace in between, e.g. <p> </p>.
# Table cells are excluded: dropping an empty <td>/<th> would shift columns.
_EMPTY_TAG_RE = re.compile(r'<(?!(?:td|th)\b)(\w+)[^>]*>\s*</\1>')
# python-docx looks styles up by their UI name ('List Bullet', 'List Bullet 2', ...)
_LIST_STYLE_NAMES = {'ul': 'List Bullet', 'ol': 'List Number'}
# Deepest nesting level that gets its own style; presumably the deepest list
# variant of the default template -- TODO raise if your template defines more.
_MAX_LIST_DEPTH = 3


def _clean_html(htmlblock):
	"""Return ``htmlblock`` without comments and without empty elements."""
	htmlblock = _HTML_COMMENT_RE.sub('', htmlblock)
	previous = None
	# Repeat until stable so that <p><span></span></p> disappears completely
	while previous != htmlblock:
		previous = htmlblock
		htmlblock = _EMPTY_TAG_RE.sub('', htmlblock)
	return htmlblock


def _resolve_url(url, base_url):
	"""Return ``url`` unchanged if it has a scheme, else joined onto the base URL."""
	from urllib.parse import urljoin, urlsplit

	if urlsplit(url).scheme:
		return url
	if base_url is None:
		# Same fallback the image handling always used: the notebook's REQUEST
		base_url = str(context.REQUEST['URL'])
	return urljoin(base_url, url)


def _add_hyperlink(paragraph, text, url):
	"""Append an underlined external hyperlink to ``paragraph``.

	python-docx offers no public call for creating hyperlinks, so the
	``w:hyperlink`` element and its relationship are built by hand.
	"""
	from docx.opc.constants import RELATIONSHIP_TYPE
	from docx.oxml import OxmlElement
	from docx.oxml.ns import qn

	rel_id = paragraph.part.relate_to(url, RELATIONSHIP_TYPE.HYPERLINK, is_external=True)
	hyperlink = OxmlElement('w:hyperlink')
	hyperlink.set(qn('r:id'), rel_id)

	run = OxmlElement('w:r')
	run_properties = OxmlElement('w:rPr')
	underline = OxmlElement('w:u')
	underline.set(qn('w:val'), 'single')
	run_properties.append(underline)
	run.append(run_properties)
	run.text = text

	hyperlink.append(run)
	paragraph._p.append(hyperlink)


def _add_image(target, img_element, base_url):
	"""Download the image of an ``<img>`` element and add it to ``target``.

	``target`` is anything with an ``add_picture(stream)`` method (the
	document or a run). Failures are reported and skipped so that one broken
	image does not abort the whole export.
	"""
	from io import BytesIO

	src = img_element.get('src')
	if not src:
		return
	try:
		url = _resolve_url(src, base_url)
		response = requests.get(url, timeout=10)
		response.raise_for_status()
		# Hand the bytes over in memory; no stray image file is left on disk
		target.add_picture(BytesIO(response.content))
	except Exception as exc:
		print('Warning: image %r could not be embedded (%s)' % (src, exc))


def _add_inline_runs(paragraph, element, base_url):
	"""Add the inline children of ``element`` to ``paragraph`` as runs."""
	for node in element.children:
		if node.name is None:
			# Plain text between tags
			paragraph.add_run(str(node))
		elif node.name in ('strong', 'b'):
			paragraph.add_run(node.get_text()).bold = True
		elif node.name in ('em', 'i'):
			paragraph.add_run(node.get_text()).italic = True
		elif node.name == 'a' and node.get('href'):
			_add_hyperlink(paragraph, node.get_text(), _resolve_url(node['href'], base_url))
		elif node.name == 'br':
			paragraph.add_run().add_break()
		elif node.name == 'img':
			_add_image(paragraph.add_run(), node, base_url)
		else:
			# Unknown inline markup: keep the text, never the raw tags
			paragraph.add_run(node.get_text())


def _add_list(document, list_element, level=0):
	"""Add the items of a ``<ul>``/``<ol>`` (and nested lists) as list paragraphs."""
	depth = min(level + 1, _MAX_LIST_DEPTH)
	style_name = _LIST_STYLE_NAMES[list_element.name]
	if depth > 1:
		style_name = '%s %d' % (style_name, depth)
	for item in list_element.find_all('li', recursive=False):
		# Text of the item itself; nested lists become their own paragraphs
		item_text = ''.join(
			str(node) if node.name is None else node.get_text()
			for node in item.children
			if node.name not in ('ul', 'ol')
		).strip()
		document.add_paragraph(item_text, style=style_name)
		for nested_list in item.find_all(['ul', 'ol'], recursive=False):
			_add_list(document, nested_list, level + 1)


def _add_table(document, table_element):
	"""Add a ``<table>`` as a centered 'Table Grid' table; header cells are bold."""
	cells_per_row = [
		row.find_all(['td', 'th'], recursive=False)
		for row in table_element.find_all('tr')
	]
	# Size the grid by the widest row, not by the first one
	column_count = max((len(cells) for cells in cells_per_row), default=0)
	if column_count == 0:
		return
	table = document.add_table(rows=len(cells_per_row), cols=column_count)
	table.style = 'Table Grid'
	table.alignment = WD_TABLE_ALIGNMENT.CENTER
	for row_index, cells in enumerate(cells_per_row):
		for col_index, cell_element in enumerate(cells):
			cell = table.cell(row_index, col_index)
			cell.text = cell_element.get_text()
			if cell_element.name == 'th':
				for run in cell.paragraphs[0].runs:
					run.bold = True


def add_htmlblock_to_docx(docx, htmlblock, base_url=None):
	"""Append the content of an HTML/richtext fragment to a python-docx document.

	Only the top-level elements of the fragment are processed:
	``h1``-``h6`` (headings), ``p`` (paragraphs with bold, italic, link,
	line-break and image runs; ``class="caption"`` selects the 'Caption'
	style), ``ul``/``ol`` (nested lists), ``table`` and ``img``. Other
	top-level nodes are ignored.

	Args:
		docx: the python-docx document to extend. The name shadows the
			``docx`` module inside this function; it is kept because callers
			pass it by keyword.
		htmlblock: HTML fragment as a string; ``None`` or ``''`` is a no-op.
		base_url: base for relative image/link URLs. Defaults to
			``context.REQUEST['URL']`` of the notebook, looked up only when a
			relative URL is actually encountered.

	Returns:
		The document passed in as ``docx``.
	"""
	if not htmlblock:
		return docx

	soup = BeautifulSoup(_clean_html(htmlblock), 'html.parser')

	for element in soup.children:
		if element.name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
			docx.add_heading(element.get_text(), level=int(element.name[1]))
		elif element.name == 'p':
			paragraph = docx.add_paragraph()
			# `style` is a property: assign to it, do not call it
			if 'caption' in (element.get('class') or []):
				paragraph.style = 'Caption'
			_add_inline_runs(paragraph, element, base_url)
		elif element.name in ('ul', 'ol'):
			_add_list(docx, element)
		elif element.name == 'table':
			_add_table(docx, element)
		elif element.name == 'img':
			_add_image(docx, element, base_url)
	return docx

## Example: Creating a Word-Docx file from a ZMS page

In [15]:
# Item 0 of 'standard_json' carries the page's metadata, the remaining items
# are its content blocks
zmsdoc = context.e34.e36.attr('standard_json')
zmsdoc_metas = zmsdoc[0]
children = zmsdoc[1:]

# Set Header
doc.sections[0].header.paragraphs[0].text = 'ZMS DOCX EXAMPLE - %s/%s'%(context.REQUEST['URL'], zmsdoc_metas.get('getPath',''))
# Set Footer: add the page number field to the first footer paragraph
add_page_number(doc.sections[0].footer.paragraphs[0].add_run('Seite '))

# Add title + metadata
title = doc.add_heading(zmsdoc_metas.get('title',''), level=1)
# NOTE(review): the sample output earlier in this notebook spells the key
# 'descripton'; both spellings are accepted until the data source is confirmed.
description = zmsdoc_metas.get('description') or zmsdoc_metas.get('descripton') or ''
if description:
	descr = doc.add_paragraph(description, style='Description')

# Add content
for child in children:
	v = child['content']
	docx_format = child['docx_format']
	if docx_format == 'html':
		# Empty richtext blocks are skipped; 'html' is a processing hint and
		# must never be used as a paragraph style name.
		if v:
			doc = add_htmlblock_to_docx(docx = doc, htmlblock = v)
	else:
		# Fall back to the default paragraph style when the document does not
		# define the requested one, instead of failing after the paragraph
		# has already been added.
		known_style = bool(docx_format) and docx_format in doc.styles
		doc.add_paragraph(v, style=docx_format if known_style else None)


# doc.add_page_break()
doc.save('test.docx')

In [8]:
# # Finally close ZODB connection
# db.close()