# Applying python-docx to ZMS content

## Load basic libraries

In [2]:
import ZODB
import os
from Products.Five.browser.tests.pages import SimpleView
from Testing.makerequest import makerequest 							# makerequest(context)
from Testing.ZopeTestCase.testZODBCompat import make_request_response 	# make_request_response()[1]
from Acquisition import aq_get
from Products.zms import standard
from Products.zms import rest_api
import json

import re
from bs4 import BeautifulSoup
import requests

import docx
from docx.shared import Pt
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.style import WD_STYLE_TYPE

## Open ZODB

In [3]:
# Create a ZODB connection to an existing ZODB database file.
# NOTE: the storage path below is specific to the local development instance.
wd = '/home/zope/instance/zms5_dev/var/'
try:
	db = ZODB.DB(os.path.join(wd, 'Data.fs'))
	conn = db.open()
	root = conn.root
	###{'Application': <Application at >}
except Exception:
	# When this cell is re-run, a database object from the previous run may
	# still exist (presumably still holding the storage open). Close it so
	# that the cell can simply be executed again.
	stale_db = globals().get('db')
	if stale_db is None:
		# Fresh kernel: there is nothing to close, so surface the real error
		# instead of masking it with a NameError on `db`.
		raise
	stale_db.close()
	print('Error: Database connection had to be closed before reopened.')

## Get ZODB Context

In [5]:
# ZMS-Node /myzms2/content, wrapped so that it carries a REQUEST object
# ##############################################
context = makerequest(root.Application.myzms2.content)
# ##############################################

# Server environment defaults (only set when not already present)
for env_key, env_value in (('SERVER_NAME', 'localhost'), ('SERVER_PORT', '8087')):
	context.REQUEST.environ.setdefault(env_key, env_value)
context.REQUEST['URL'] = 'http://localhost:8087'

# Request variables, followed by the RESPONSE object
for var_name, var_value in (
	('lang', 'ger'),
	('path_to_handle', ''),
	('RESPONSE', make_request_response()[1]),
):
	context.REQUEST.set(var_name, var_value)

# Node e5 below the content root and the request derived from it by the REST API helper
zmscontext = context.e5
request = rest_api._get_request(zmscontext)

## Example API calls for extracting content from ZMS objects

In [6]:
# Read the 'standard_json' attribute of a single node (e37, a ZMSTextarea
# according to the output below); the last expression is displayed as cell output.
context.e34.e36.e37.attr('standard_json')

[{'id': 'e37',
  'meta_id': 'ZMSTextarea',
  'parent_id': 'e36',
  'parent_meta_id': 'ZMSDocument',
  'docx_format': 'html',
  'content': '<p>Lorem <a data-id="{$uid:43888e7d-e08f-4248-a36e-0b72c41bcb4f}" href="http://nohost/myzms2/content/e34/e43/">ipsum </a>dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At v

## DOCX-XML Helper Functions

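1. `add_page_number(run)`: appends a page-number field to a text run (e.g. in the footer)
2. `prepend_bookmark(block, bookmark_id)`: inserts a bookmark at the start of a block (e.g. a paragraph or table)
3. `add_bottom_border(style)`: adds bottom-border properties to a paragraph style object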

_Hint: the docx API does not support the page counter directly. We have to create a custom footer with a page counter._

In [12]:
from docx.oxml import OxmlElement, ns

def create_element(name):
	"""Return a new XML element for the namespace-prefixed tag `name` (e.g. 'w:fldChar')."""
	element = OxmlElement(name)
	return element

def create_attribute(element, name, value):
	"""Set the attribute with the namespace-prefixed `name` (e.g. 'w:val') on `element`."""
	qualified_name = ns.qn(name)
	element.set(qualified_name, value)

# PAGE NUMBER
def add_page_number(run):
	"""Append a PAGE field to `run`.

	The field consists of three XML children added to the run element in
	this order: a 'begin' field character, the instruction text "PAGE",
	and an 'end' field character.
	"""
	def _field_char(char_type):
		# Build one <w:fldChar w:fldCharType="..."/> marker
		marker = create_element('w:fldChar')
		create_attribute(marker, 'w:fldCharType', char_type)
		return marker

	instruction = create_element('w:instrText')
	create_attribute(instruction, 'xml:space', 'preserve')
	instruction.text = "PAGE"

	for node in (_field_char('begin'), instruction, _field_char('end')):
		run._r.append(node)

# BOOKMARK ZMS-ID
def prepend_bookmark(block, bookmark_id):
	"""Insert an (empty) bookmark named `bookmark_id` at the start of `block`.

	Parameters:
		block: a python-docx proxy object exposing its XML element as
			`_element` (e.g. a paragraph or a table). Objects without
			`_element` are ignored, so the call stays best-effort.
		bookmark_id: the bookmark name (here: a ZMS node id). Empty values
			are ignored because a bookmark needs a name.

	The `w:id` attribute of a bookmark has to be a decimal number, while the
	ZMS id is a string like 'e37'. Therefore a numeric id is derived from the
	name; the readable name itself is stored in `w:name`.
	"""
	import zlib

	if not bookmark_id:
		return
	element = getattr(block, '_element', None)
	if element is None:
		return

	name = str(bookmark_id)
	# Deterministic, positive 31-bit number derived from the bookmark name
	numeric_id = str(zlib.crc32(name.encode('utf-8')) & 0x7FFFFFFF)

	bookmark_start = create_element('w:bookmarkStart')
	create_attribute(bookmark_start, 'w:id', numeric_id)
	create_attribute(bookmark_start, 'w:name', name)
	bookmark_end = create_element('w:bookmarkEnd')
	create_attribute(bookmark_end, 'w:id', numeric_id)

	# Paragraph properties (w:pPr) have to stay the first child of a
	# paragraph, so the bookmark goes right behind them if they exist.
	position = 0
	if len(element) and element[0].tag == ns.qn('w:pPr'):
		position = 1
	# Insert end first, then start at the same index => start precedes end
	element.insert(position, bookmark_end)
	element.insert(position, bookmark_start)


# BORDER BOTTOM
def add_bottom_border(style):
	"""Add a thin, colored bottom border to the paragraph style `style`.

	Parameters:
		style: a python-docx paragraph style object (its XML is reached
			via `style.element`).

	Any existing paragraph border of the style is replaced, so calling the
	function twice does not produce duplicate `w:pBdr` elements.
	"""
	border = create_element('w:pBdr') # pBdr = Paragraph border
	bottom = create_element('w:bottom')
	create_attribute(bottom, 'w:val', 'single')
	create_attribute(bottom, 'w:sz', '2')
	create_attribute(bottom, 'w:space', '9')
	create_attribute(bottom, 'w:color', '017D87')
	border.append(bottom)

	# pPr = Paragraph properties; create them if the style has none yet
	pPr = style.element.get_or_add_pPr()
	pPr.remove_all('w:pBdr')
	# The children of w:pPr have a fixed order. Insert the border before the
	# first element that has to follow it (appended at the end if none exists).
	pPr.insert_element_before(
		border,
		'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap',
		'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN',
		'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind',
		'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc',
		'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap',
		'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr',
		'w:pPrChange'
	)


## Convert HTML-Richtext to Word-Docx 

In [26]:
# #############################################
# INIT DOCUMENT
# #############################################

doc = docx.Document()	# Hint: may use template like docx.Document('template.docx')

# #############################################
# STYLES
# #############################################
styles = doc.styles
# Custom colors: #017D87
custom_color1 = docx.shared.RGBColor(1, 125, 135)
# Normal
styles['Normal'].font.name = 'Arial'
styles['Normal'].font.size = Pt(9)
styles['Normal'].paragraph_format.space_after = Pt(6)
styles['Normal'].paragraph_format.space_before = Pt(6)
styles['Normal'].paragraph_format.line_spacing = 1.35
# Headlines derived from Normal.
# python-docx exposes style inheritance as `base_style`; assigning to an
# unknown attribute name does not change the document.
styles['Heading 1'].base_style = styles['Normal']
styles['Heading 1'].font.size = Pt(24)
styles['Heading 1'].font.color.rgb = custom_color1
styles['Heading 2'].base_style = styles['Normal']
styles['Heading 2'].font.size = Pt(18)
styles['Heading 2'].font.color.rgb = custom_color1
styles['Heading 3'].base_style = styles['Normal']
styles['Heading 3'].font.size = Pt(12)
styles['Heading 3'].font.bold = True
styles['Heading 3'].font.color.rgb = custom_color1
# More styles derived from Normal
styles.add_style('Description', WD_STYLE_TYPE.PARAGRAPH)
styles['Description'].base_style = styles['Normal']
styles['Description'].font.name = 'Arial'
styles['Description'].font.size = Pt(9)
styles['Description'].font.italic = True
styles['Description'].font.color.rgb = custom_color1
styles['Description'].paragraph_format.space_before = Pt(14)
styles['Description'].paragraph_format.space_after = Pt(18)
styles['Description'].paragraph_format.line_spacing = 1.35
add_bottom_border(styles['Description'])
# Caption (built-in style, only adjusted)
styles['Caption'].font.size = Pt(8)
styles['Caption'].font.italic = True
styles['Caption'].font.color.rgb = custom_color1
styles['Caption'].paragraph_format.space_after = Pt(4)



# #############################################
# HTML/RICHTEXT PROCESSOR
# #############################################
def _add_hyperlink_run(paragraph, text, url):
	"""Append `text` to `paragraph` as a clickable external hyperlink to `url`."""
	from docx.opc.constants import RELATIONSHIP_TYPE
	from docx.oxml import OxmlElement
	from docx.oxml.ns import qn
	from docx.shared import RGBColor

	# A hyperlink is a relationship of the document part plus a w:hyperlink
	# element that wraps the run(s) showing the link text.
	rel_id = paragraph.part.relate_to(url, RELATIONSHIP_TYPE.HYPERLINK, is_external=True)
	hyperlink = OxmlElement('w:hyperlink')
	hyperlink.set(qn('r:id'), rel_id)

	run = paragraph.add_run(text)
	run.font.underline = True
	run.font.color.rgb = RGBColor(0x05, 0x63, 0xC1)
	# Appending moves the run element from the paragraph into the hyperlink
	hyperlink.append(run._r)
	paragraph._p.append(hyperlink)
	return run


def add_runs(docx_block, bs_element):
	"""Add the inline content of an HTML block element to a docx block.

	Parameters:
		docx_block: the docx paragraph receiving the runs.
		bs_element: the BeautifulSoup block element whose direct children
			are converted.

	Supported inline elements (minimum set):
		text nodes        -> plain run
		<strong>, <b>     -> bold run
		<em>, <i>         -> italic run
		<a href="...">    -> hyperlink run (plain run if href is missing)
		<br>              -> line break
		any other element -> its text content as plain run (no raw markup)

	Nested inline formatting (e.g. <strong><em>..</em></strong>) is reduced
	to the formatting of the outermost element.
	"""
	for elrun in bs_element.children:
		tag = getattr(elrun, 'name', None)
		if tag is None:
			# Text node
			docx_block.add_run(str(elrun))
		elif tag in ('strong', 'b'):
			docx_block.add_run(elrun.text).bold = True
		elif tag in ('em', 'i'):
			docx_block.add_run(elrun.text).italic = True
		elif tag == 'a':
			href = elrun.get('href')
			if href:
				_add_hyperlink_run(docx_block, elrun.text, href)
			else:
				# Anchor without target: keep the text only
				docx_block.add_run(elrun.text)
		elif tag == 'br':
			docx_block.add_run().add_break()
		else:
			docx_block.add_run(elrun.get_text())


def _list_item_text(li):
	"""Return the text of a <li> element without the text of nested lists."""
	parts = []
	for child in li.children:
		if child.name in ('ul', 'ol'):
			continue
		parts.append(child.get_text() if child.name else str(child))
	return ''.join(parts).strip()


def _add_html_list(docx, element, level=0):
	"""Add the items of a <ul>/<ol> element (and its nested lists) as list paragraphs."""
	li_styles = {'ul': 'List Bullet', 'ol': 'List Number'}
	# The built-in list styles exist for three levels only
	# ('List Bullet', 'List Bullet 2', 'List Bullet 3'); deeper levels reuse level 3.
	depth = min(level, 2) + 1
	style_name = li_styles[element.name]
	if depth > 1:
		style_name = '%s %d' % (style_name, depth)
	for li in element.find_all('li', recursive=False):
		docx.add_paragraph(_list_item_text(li), style=style_name)
		for sublist in li.find_all(['ul', 'ol'], recursive=False):
			_add_html_list(docx, sublist, level + 1)


def _add_html_table(docx, element):
	"""Add a <table> element as docx table; returns the type of the first added block."""
	block_type = 'table'
	caption = element.find('caption')
	if caption is not None:
		# The caption paragraph precedes the table
		docx.add_paragraph(caption.text, style='Caption')
		block_type = 'paragraph'
	rows = element.find_all('tr')
	row_cells = [row.find_all(['td', 'th']) for row in rows]
	# Rows may differ in length: size the grid by the longest row
	col_count = max([len(cells) for cells in row_cells] or [0])
	if col_count == 0:
		# Table without any cell: nothing to add
		return 'paragraph'
	table = docx.add_table(rows=len(rows), cols=col_count)
	table.style = 'Table Grid'
	table.alignment = WD_TABLE_ALIGNMENT.CENTER
	for r, cells in enumerate(row_cells):
		for i, cl in enumerate(cells):
			cell = table.cell(r, i)
			cell.text = cl.text
			if cl.name == 'th':
				for run in cell.paragraphs[0].runs:
					run.bold = True
	return block_type


def _add_html_image(docx, element):
	"""Download the picture of an <img> element and embed it (best effort)."""
	import io

	src = element.get('src')
	if not src:
		return
	if not src.startswith('http'):
		# Relative URL: prefix with the server URL of the notebook's request context
		src = str(context.REQUEST['URL' ]) + str(src)
	try:
		response = requests.get(src, timeout=10)
		response.raise_for_status()
		# Embed from memory; no temporary image file is left behind
		docx.add_picture(io.BytesIO(response.content))
	except Exception as exc:
		# A missing picture must not abort the export, but should be visible
		print('Warning: image %s could not be embedded: %s' % (src, exc))


def add_htmlblock_to_docx(docx, htmlblock):
	"""Convert an HTML richtext fragment into docx blocks.

	Parameters:
		docx: the python-docx document the blocks are added to
			(note: the parameter name shadows the `docx` module inside
			this function).
		htmlblock: the HTML source as string.

	Returns:
		A tuple `(docx, c, docx_block_type)`:
		docx            -- the document passed in
		c               -- number of processed top-level HTML elements
		                   (not the number of created docx blocks: a list
		                   creates one paragraph per item)
		docx_block_type -- 'paragraph' or 'table', referring to the last
		                   processed element ('paragraph' if nothing was
		                   processed)

	Top-level text outside of any element is ignored.
	"""
	# remove comments (also multi-line ones)
	htmlblock = re.sub(r'<!--.*?-->', '', htmlblock, flags=re.DOTALL)

	# Apply BeautifulSoup and iterate over elements
	soup = BeautifulSoup(htmlblock, 'html.parser')

	c = 0
	# Defined up front so that empty input still returns a valid tuple
	docx_block_type = 'paragraph'
	for element in soup.children:
		if element.name is None:
			# Text node (e.g. newline between elements)
			continue
		docx_block_type = 'paragraph'
		c += 1
		if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
			heading_level = int(element.name[1])
			docx.add_heading(element.text, level=heading_level)

		elif element.name == 'p':
			p = docx.add_paragraph()
			if 'caption' in (element.get('class') or []):
				# `style` is a property: assign the style name
				p.style = 'Caption'
			add_runs(docx_block = p, bs_element = element)

		elif element.name in ['ul','ol']:
			_add_html_list(docx, element, level=0)

		elif element.name == 'table':
			docx_block_type = _add_html_table(docx, element)

		elif element.name == 'img':
			_add_html_image(docx, element)

		else:
			# Unsupported element: keep its source visible in the document
			docx.add_paragraph(str(element))

	return (docx, c, docx_block_type)

## Example: Creating a Word-Docx file from a ZMS page

In [27]:
# The first item of 'standard_json' describes the page itself,
# the following items describe its content blocks.
zmsdoc = context.e34.e36.attr('standard_json')
heading = zmsdoc[0]
blocks = zmsdoc[1:]

# Header (title, date, URL) and footer (page number)
dt = standard.getLangFmtDate(context, heading.get('last_change_dt',''), 'eng', '%Y-%m-%d')
url = heading.get('url','').replace('nohost','localhost')
doc.sections[0].header.paragraphs[0].text = '%s (%s) URL: %s'%(heading.get('title',''), dt, url)
add_page_number(doc.sections[0].footer.paragraphs[0].add_run('Seite '))

title = doc.add_heading(heading.get('title',''), level=1)
if heading.get('id'):
	prepend_bookmark(title, heading['id'])

if heading.get('description','')!='':
	descr = doc.add_paragraph(heading.get('description',''))
	descr.style = 'Description'

for block in blocks:
	v = block.get('content')
	docx_format = block.get('docx_format')
	# Remember the document state to identify the blocks added for this item
	paragraphs_before = len(doc.paragraphs)
	tables_before = len(doc.tables)

	if docx_format == 'html':
		if v:
			add_htmlblock_to_docx(docx=doc, htmlblock=v)
	else:
		# `docx_format` is used as paragraph style name; unknown names fall
		# back to the default paragraph style instead of raising a KeyError
		style_name = docx_format if (docx_format and docx_format in doc.styles) else None
		doc.add_paragraph(v or '', style=style_name)

	# Add the bookmark to the first block inserted for this item. Counting
	# backwards from the end is not reliable, because one HTML element may
	# create several paragraphs (lists) or none at all (tables).
	if block.get('id'):
		new_blocks = doc.paragraphs[paragraphs_before:][:1] + doc.tables[tables_before:][:1]
		if new_blocks:
			body = doc.element.body
			first_block = min(new_blocks, key=lambda b: body.index(b._element))
			prepend_bookmark(first_block, block['id'])

# doc.add_page_break()
doc.save('test.docx')

In [None]:
# Finally close the ZODB database object `db` that was created in the "Open ZODB" cell
db.close()