# Applying python-docx to ZMS content

## Load basic libraries

In [None]:
import io
import json
import os
import re
import urllib.parse
import urllib.request

import ZODB
from Products.Five.browser.tests.pages import SimpleView
from Testing.makerequest import makerequest 							# makerequest(context)
from Testing.ZopeTestCase.testZODBCompat import make_request_response 	# make_request_response()[1]
from Acquisition import aq_get
from Products.zms import standard
from Products.zms import rest_api

from bs4 import BeautifulSoup
from docx import Document
from docx.enum.table import WD_TABLE_ALIGNMENT

## Open ZODB

In [2]:
# Create a ZODB connection to an existing ZODB database file
try:
	wd = '/home/zope/instance/zms5_dev/var/'
	db = ZODB.DB(os.path.join(wd, 'Data.fs'))
	conn = db.open()
	root = conn.root
	###{'Application': <Application at >}
except Exception:
	# Opening can fail when a `db` left over from an earlier run of this cell
	# is still open. In that case close it so the cell can simply be re-run.
	if 'db' in globals():
		db.close()
		print('Error: Database connection had to be closed before reopened.')
	else:
		# Fresh kernel: there is nothing to close, so surface the real error
		# instead of masking it with a NameError on the unbound `db`.
		raise

## Get ZODB Context

In [None]:
# Look up the ZMS node /myzms2/content and wrap it with a REQUEST in one step
context = makerequest(root.Application.myzms2.content)
# ----------------------------------------------
# Request variables set on the wrapped node
context.REQUEST.set('lang', 'ger')
context.REQUEST.set('path_to_handle', '')
# Attach a RESPONSE object to the REQUEST
context.REQUEST.set('RESPONSE', make_request_response()[1])
# ----------------------------------------------
# Child node e5 and the request object handed to rest_api in later cells
zmscontext = context.e5
request = rest_api._get_request(zmscontext)

# Example API calls for extracting content from ZMS objects

In [None]:
# Disabled example: list the tree nodes below zmscontext via the REST controller
# tree_nodes = rest_api.RestApiController(context,request).list_tree_nodes(zmscontext)[0:1]

# Fetch the attributes of zmscontext and print them as indented JSON
doc = rest_api.get_attrs(zmscontext)
print(json.dumps(doc,indent=2))

# Same dump for the sibling node e4
node = context.e4
print(json.dumps(rest_api.get_attrs(node),indent=2))

# Disabled example: read a Python script object from the ZODB
# a_pyscript = makerequest(root.Application.myzmsx.a_pyscript)
# print(a_pyscript.read())

# Fetch the single attribute 'standard_json' of the node and print it as JSON.
# Author's note: this is a custom Python method; the ZPT variant does not work here.
print(json.dumps(rest_api.get_attr(node,'standard_json'),indent=2))

# Convert HTML-Richtext to Word-Docx 

In [42]:
# Word document that the conversion cells below append to and finally save.
# Only `Document` is imported from the python-docx package, so the name `docx`
# refers to this document instance, not to the package.
docx = Document()

def add_paragraph_style(paragraph, style):
	"""Apply the paragraph style ``style`` to ``paragraph``.

	The style is assigned directly, so python-docx resolves the name against
	the document the paragraph belongs to. The function therefore no longer
	depends on a module-level ``docx`` variable and works for paragraphs of
	any document.

	Args:
		paragraph: Paragraph object whose ``style`` attribute is set.
		style: Style name (e.g. ``'Caption'``).
	"""
	paragraph.style = style

def _add_html_paragraph(docx, element):
	"""Append one <p> element as a paragraph, mapping inline tags to runs."""
	p = docx.add_paragraph()
	# `class` is multi-valued in BeautifulSoup, so this is a list of class names
	if 'caption' in element.get('class', []):
		p.style = 'Caption'
	for elrun in element.children:
		if elrun.name is None:
			# Plain text node between inline tags
			p.add_run(str(elrun))
		elif elrun.name in ('strong', 'b'):
			p.add_run(elrun.text).bold = True
		elif elrun.name in ('em', 'i'):
			p.add_run(elrun.text).italic = True
		elif elrun.name == 'a':
			# NOTE(review): python-docx does not appear to act on a `hyperlink`
			# attribute of a run - the link text is written, the target is
			# probably not. Kept as in the original; confirm and replace with a
			# real hyperlink relationship if links are needed.
			p.add_run(elrun.text).hyperlink = elrun.get('href')
		elif elrun.name == 'br':
			p.add_run().add_break()
		else:
			# Unknown inline tag: keep its text, never its raw HTML markup
			p.add_run(elrun.get_text())


def _add_html_table(docx, element):
	"""Append one <table> element as a centered 'Table Grid' table."""
	row_cells = [row.find_all(['td', 'th']) for row in element.find_all('tr')]
	# Size the grid by the widest row so no row can overflow the table
	n_cols = max((len(cells) for cells in row_cells), default=0)
	if n_cols == 0:
		return
	table = docx.add_table(rows=len(row_cells), cols=n_cols)
	table.style = 'Table Grid'
	table.alignment = WD_TABLE_ALIGNMENT.CENTER
	for r, cells in enumerate(row_cells):
		for c, cl in enumerate(cells):
			cell = table.cell(r, c)
			cell.text = cl.text
			if cl.name == 'th':
				# Header cells are rendered bold
				for run in cell.paragraphs[0].runs:
					run.bold = True


def _add_html_image(docx, element, base_url):
	"""Download the image referenced by an <img> element and append it."""
	src = element.get('src')
	if not src:
		return
	# Resolve site-relative paths against base_url; absolute URLs pass through
	src = urllib.parse.urljoin(base_url, src)
	if urllib.parse.urlsplit(src).scheme not in ('http', 'https'):
		print('Warning: skipping image with unsupported URL: %s' % src)
		return
	try:
		with urllib.request.urlopen(src, timeout=10) as response:
			# add_picture accepts a file-like object, so no temp file is needed
			docx.add_picture(io.BytesIO(response.read()))
	except Exception as err:
		# Best effort: a missing or broken image must not abort the conversion
		print('Warning: could not add image %s: %s' % (src, err))


def add_htmlblock_to_docx(docx, htmlblock, base_url='https://www.sntl-publishing.com'):
	"""Append the top-level elements of an HTML fragment to a Word document.

	Handled top-level tags: h1-h6 (headings), p (paragraph with bold, italic,
	link and line-break runs), ul / ol (bullet / numbered list), table and img.
	Any other top-level node is ignored.

	Args:
		docx: python-docx document the content is appended to.
		htmlblock: HTML fragment as a string.
		base_url: Prefix used to resolve image ``src`` values that are not
			absolute URLs.

	Returns:
		The same ``docx`` object, to allow ``docx = add_htmlblock_to_docx(...)``.
	"""
	# Drop HTML comments, including ones that span several lines
	htmlblock = re.sub(r'<!--.*?-->', '', htmlblock, flags=re.DOTALL)
	soup = BeautifulSoup(htmlblock, 'html.parser')
	for element in soup.children:
		if element.name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
			# The digit of the tag name is the heading level
			docx.add_heading(element.text, level=int(element.name[1]))
		elif element.name == 'p':
			_add_html_paragraph(docx, element)
		elif element.name in ('ul', 'ol'):
			# Style *names* contain a space; 'ListBullet' is only the style id
			list_style = 'List Bullet' if element.name == 'ul' else 'List Number'
			for li in element.find_all('li'):
				docx.add_paragraph(li.text, style=list_style)
		elif element.name == 'table':
			_add_html_table(docx, element)
		elif element.name == 'img':
			_add_html_image(docx, element, base_url)
	return docx

## Example converting HTML-Richtext to Word-Docx

In [43]:
# Document node whose page elements are converted
zmsdoc = context.e12
zmsdoc_metas = rest_api.get_attrs(zmsdoc)
# All children of type 'e' in the folder PAGEELEMENTS
children = zmsdoc.getObjChildren('e', request, context.PAGEELEMENTS)

for child in children:
	child_metas = rest_api.get_attrs(child)
	htmlblock = rest_api.get_attr(child, 'text')
	if not htmlblock:
		# Nothing to convert for this child
		continue
	docx = add_htmlblock_to_docx(docx, htmlblock)
	print(htmlblock)
	print('-------------------')

# Write the accumulated document to the working directory
docx.save('test.docx')

<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
-------------------


<h2>Lorem ipsum</h2>

<p>dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam <strong>nonumy </strong>eirmod <strong>tempor </strong>invidunt ut labore et <em>dolore </em>magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod <em>tempor </em>invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. &nbsp;&nbsp;</p>

<p><strong>Lorem ipsum dolor&

In [13]:
# Finally close the ZODB database that was opened at the top of the notebook
db.close()