Install python-docx (installation guide: https://python-docx.readthedocs.io/en/latest/user/install.html)

In [None]:
!pip install python-docx

In [3]:
from docx.api import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph

In [5]:
import glob
import os
import pickle

from pprint import pprint

# Read the docx file

In [9]:
# path to directory containing docx files
docx_dir = './docx/'

# glob returns matches in arbitrary (file-system dependent) order; sort them
# so that docx_files[0] refers to the same file on every run
docx_files = sorted(glob.glob(docx_dir + '*.docx'))

In [11]:
# Sanity check: display the first path matched by the glob pattern.
docx_files[0]

'./docx\\Bagerhat.docx'

# Helper functions

In [12]:
def iter_block_items(parent):
    """
    Yield every paragraph and table found directly inside ``parent``,
    in the order they appear in the docx file.

    Input:
        - parent: the document object returned by docx.api.Document(), or a
          table cell (docx.table._Cell) so that content nested inside a
          table cell can be walked with the same helper.

    Yields:
        docx.text.paragraph.Paragraph or docx.table.Table objects; any
        other kind of child element is skipped.

    Source: https://stackoverflow.com/questions/42093013/processing-objects-in-order-in-docx
    """
    if isinstance(parent, _Cell):
        # a table cell keeps its block items directly under its own element
        parent_elm = parent._tc
    else:
        # a document keeps its block items under the body element
        parent_elm = parent.element.body

    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)

# Parse the docx file

In [13]:
# the docx contents will go in this, one entry per block:
#   ["p", style name, text]  for a non-empty paragraph
#   ["t", rows]              for a table (rows = list of per-row cell texts)
all_contents = []

document = Document(docx_files[0])

# iterate through each block of the docx file, in document order
for block_item in iter_block_items(document):

    # check if the block is a paragraph object
    if isinstance(block_item, Paragraph):
        inner_text = block_item.text

        # empty paragraphs carry no content, skip them
        if inner_text == '':
            continue

        try:
            style_name = block_item.style.name
        except Exception as exc:
            # Best effort: a paragraph whose style cannot be resolved is
            # still skipped, but the skip is reported instead of being
            # silently swallowed. Catching Exception (not a bare except)
            # lets KeyboardInterrupt / SystemExit propagate.
            print("\nSKIPPED PARA:", repr(exc), inner_text[:60])
            continue

        # store the paragraph info in a nested array
        all_contents.append([
            "p",
            style_name,
            inner_text
        ])

    # check if the block is a table object
    elif isinstance(block_item, Table):
        # collect the cell texts row by row
        inner_values = [
            [cell.text for cell in row.cells]
            for row in block_item.rows
        ]

        # store the values in a nested array
        all_contents.append([
            "t",
            inner_values
        ])
    else:
        print("\nUNKNOWN")

Number of paragraphs and tables:

In [14]:
# Total number of stored entries (paragraph entries plus table entries).
len(all_contents)

653

## How to read the tables and paragraphs:

In [15]:
# Split the parsed entries by their leading tag:
# 't' marks a table entry, 'p' marks a paragraph entry.
tables = list(filter(lambda entry: entry[0] == 't', all_contents))
paraghs = list(filter(lambda entry: entry[0] == 'p', all_contents))

In [16]:
# Pretty-print one table entry: ['t', rows], where rows holds one list of
# cell texts per table row.
pprint(tables[10])

['t',
 [['Upazila', 'Muslim', 'Hindu', 'Buddhist', 'Christian', 'Others', 'Total'],
  ['Bagerhat Sadar', '219207', '46547', '561', '4', '70', '266389'],
  ['Chitalmari', '92739', '46003', '58', '2', '8', '138810'],
  ['Fakirhat', '104951', '32628', '57', '0', '153', '137789'],
  ['Kachua', '78645', '18347', '19', '0', '0', '97011'],
  ['Mollahat', '104335', '26302', '89', '0', '152', '130878'],
  ['Mongla', '102298', '29426', '4837', '21', '6', '136588'],
  ['Morrelgonj', '263332', '31136', '34', '2', '72', '294576'],
  ['Rampal', '123250', '31253', '448', '10', '4', '154965'],
  ['Sarankhola', '109836', '9232', '12', '4', '0', '119084'],
  ['Total', '1198593', '270874', '6115', '43', '465', '1476090']]]


In [17]:
# Pretty-print one paragraph entry: ['p', style name, paragraph text].
pprint(paraghs[100])

['p',
 'Body Text',
 'The population and housing census is the unique source of reliable and '
 'comprehensive data about the size of the population of the country, major '
 'socio-economic and socio-demographic characteristics.']


# Pickle the contents

In [18]:
# Write the parsed contents to disk as a pickle file.
pickle_path = "./pickles/Bagerhat.pkl"

# open() does not create missing directories, so make sure the target
# directory exists before writing
os.makedirs(os.path.dirname(pickle_path), exist_ok=True)

with open(pickle_path, "wb") as f:
    pickle.dump(all_contents, f)

# report success only after the file has been written and closed
print("Successfully pickled.")

Successfully pickled.
