# Introduction

This notebook does not work unchanged for every input: many of the 64 pickle files raise errors when run through it and need manual adjustments.
It merges adjacent tables, drops empty or very short paragraphs, and removes unnecessary punctuation from chapter titles.
The resulting table and paragraph blocks are then sliced into chapters and exported.

Change `INDEX` to choose which file to process; expect errors for many of the possible `INDEX` values.

In [2]:
import glob
import os
import pickle
from pprint import pprint

In [3]:
# glob returns matches in arbitrary, OS-dependent order; sort them so that a
# given INDEX always refers to the same file on every machine
pkl_files = sorted(glob.glob("./pickles/*.pkl"))

print("# pickle files:", len(pkl_files))

# pickle files: 64


In [1819]:
# INDEX selects which entry of pkl_files gets processed;
# valid values run from 0 to 63
INDEX = 63

with open(pkl_files[INDEX], mode='rb') as pickle_file:
    print("Processing", pkl_files[INDEX])
    contents = pickle.load(pickle_file)

print(len(contents))

Processing ./pickles\Thakurgaon.pkl
1091


# Merging tables

In [1820]:
def merge_tables(input_list):
    """
    Merge adjacent table blocks and drop very short paragraphs.

    Each block is a list whose first item is the block type: ``'t'`` for a
    table (rows at index 1) or ``'p'`` for a paragraph (text at index 2).

    * Two table blocks directly next to each other are merged into a single
      table whose rows are the concatenation of both. Merging is pairwise:
      a third consecutive table starts a new pair.
    * A paragraph is kept only if its text is longer than 5 characters.
    * Blocks of any other type are discarded.

    The input list is not modified; a new list is returned.
    """
    print("Length before merging:", len(input_list))

    output = []
    total = len(input_list)
    i = 0
    # iterate up to and including the final block; stopping at len - 1 would
    # silently drop the last block whenever it is not part of a merged pair
    while i < total:
        current = input_list[i]
        has_next = i + 1 < total

        # if there are two tables right next to each other
        if current[0] == 't' and has_next and input_list[i+1][0] == 't':
            print("Merged table at", i)
            output.append(['t', current[1] + input_list[i+1][1]])
            i += 2
            continue

        # otherwise, keep a single table or a paragraph with more than 5 characters
        if current[0] == 't' or (current[0] == 'p' and len(current[2]) > 5):
            output.append(current)
        i += 1

    print("Length after merging", len(output))

    return output

# Replace the raw block list with the merged/filtered one. Re-running this cell
# feeds already-merged data back into merge_tables, so reload `contents` first.
contents = merge_tables(contents)

Length before merging: 1091
Merged table at 422
Merged table at 425
Merged table at 428
Merged table at 431
Merged table at 434
Merged table at 437
Merged table at 440
Merged table at 443
Merged table at 446
Merged table at 449
Merged table at 452
Merged table at 455
Merged table at 464
Merged table at 467
Merged table at 470
Merged table at 480
Merged table at 483
Merged table at 486
Merged table at 489
Merged table at 492
Merged table at 495
Merged table at 498
Merged table at 501
Merged table at 504
Merged table at 507
Merged table at 511
Merged table at 514
Merged table at 517
Merged table at 520
Merged table at 523
Merged table at 528
Merged table at 533
Merged table at 538
Merged table at 543
Merged table at 548
Merged table at 552
Merged table at 556
Merged table at 561
Merged table at 566
Merged table at 571
Merged table at 576
Merged table at 581
Merged table at 586
Merged table at 591
Merged table at 596
Merged table at 601
Merged table at 606
Merged table at 611
Merged table

In [1821]:
def count(input_list):
    """
    Tally the blocks in ``input_list`` by type.

    Returns a dict with the number of paragraph blocks under ``'p'`` and the
    number of table blocks under ``'t'``; blocks of any other type are ignored.
    """
    block_types = [block[0] for block in input_list]

    return {
        'p': block_types.count('p'),
        't': block_types.count('t'),
    }

# Show how many paragraph ('p') and table ('t') blocks `contents` currently holds.
count(contents)

{'p': 727, 't': 167}

# Chapterwise slicing

In [1822]:
# types = []

# for a in contents:
#     if a[0] == 'p' and a[1] not in types:
#         types.append(a[1])
        
# pprint(types)

In [1823]:
# looking for the chapter titles and their indices
# (enumerate gives the true position of each block; contents.index() would
# report the first equal block instead, which is wrong for duplicated headings)

for position, block in enumerate(contents):
    if block[0] == 'p' and "CHAPTER" in block[2]:
        print(block[1], block[2], position)

Heading 1 CHAPTER 1 288
Heading 1 CHAPTER 2 414
Heading 1 CHAPTER 3 445
Heading 1 CHAPTER 4 455
Heading 1 CHAPTER 5 600
Heading 1 CHAPTER 6 646
Heading 1 CHAPTER 7 684
Heading 1 CHAPTER 8 724
Heading 1 CHAPTER 9 746
Heading 1 CHAPTER 10 772
Heading 1 CHAPTER 11 809
Heading 1 CHAPTER 12 841


In [1824]:
def remove_punctuation(input_str):
    """
    Strip punctuation characters from ``input_str`` and collapse runs of spaces.

    Every character listed in ``punctuations`` is removed. Removing punctuation
    can leave several spaces in a row (e.g. ``"CHAPTER : - 1"``), so runs of
    two or more spaces are reduced to a single space. Leading and trailing
    spaces are not stripped.

    Returns the cleaned string.
    """
    # the backslash is escaped explicitly; writing it as "\," relies on an
    # invalid escape sequence that newer Python versions warn about
    punctuations = "!()-[]{};:'\"\\,<>./?@#$%^&*_~"

    no_punct = "".join(char for char in input_str if char not in punctuations)

    # a single replace() only halves each run of spaces, so repeat until
    # no double space is left
    while "  " in no_punct:
        no_punct = no_punct.replace("  ", " ")

    return no_punct

In [1825]:
# maps a cleaned chapter title to the position of its heading in `contents`;
# "START" and "END" mark the first and last block positions
chapter_indices = {
    "START": 0,
    "END": len(contents) - 1,
}

for position, block in enumerate(contents):
    # this 'Heading 1' causes errors in some files, change it to other
    if block[1] != "Heading 1":
        continue
    title = remove_punctuation(block[2]).replace("  ", " ")
    chapter_indices[title] = position

pprint(chapter_indices)

{'CHAPTER 1': 288,
 'CHAPTER 10': 772,
 'CHAPTER 11': 809,
 'CHAPTER 12': 841,
 'CHAPTER 2': 414,
 'CHAPTER 3': 445,
 'CHAPTER 4': 455,
 'CHAPTER 5': 600,
 'CHAPTER 6': 646,
 'CHAPTER 7': 684,
 'CHAPTER 8': 724,
 'CHAPTER 9': 746,
 'END': 893,
 'START': 0}


In [1826]:
# you can manually change the chapter indices

# chapter_indices['CHAPTER 3'] = 309

In [1827]:
# print(list(chapter_indices.keys()))

In [1829]:
# this is a dictionary that will contain the chapterwise sorted content
chapterwise_content = {}

# number of chapters expected in each document
NUM_CHAPTERS = 12

for chapter_number in range(1, NUM_CHAPTERS + 1):
    chap_no = "CHAPTER {}".format(chapter_number)

    if chap_no not in chapter_indices:
        print("[ERROR]", chap_no, "missing")
        continue

    # A chapter ends where the next heading that was actually found begins.
    # If no later heading exists it runs to the end of the document; using
    # len(contents) keeps the final block, which a slice ending at the last
    # index would drop.
    chap_end = len(contents)
    for later_number in range(chapter_number + 1, NUM_CHAPTERS + 1):
        later_chap = "CHAPTER {}".format(later_number)
        if later_chap in chapter_indices:
            chap_end = chapter_indices[later_chap]
            break

    # skip the heading block itself
    chap_start = chapter_indices[chap_no] + 1
    chapterwise_content[chap_no] = contents[chap_start:chap_end]

In [None]:
# report how many table and paragraph blocks each chapter contains

for chapter_name, chapter_blocks in chapterwise_content.items():
    print(chapter_name, "---", len(chapter_blocks), "blocks")

In [1831]:
# Spot check: pretty-print the first 8 blocks stored under 'CHAPTER 2'.
# NOTE(review): raises KeyError if that chapter was not found in this file.
pprint(chapterwise_content['CHAPTER 2'][:8])

[['p', 'Normal', 'General Information'],
 ['p', 'Heading 4', 'General Information'],
 ['p', 'List Paragraph', 'Broad classification of area 2011'],
 ['p', 'Normal', '(In sq. km.)'],
 ['t',
  [['Upazila', 'Total area', 'Land area', 'Reserve\nforest', 'Riverine\narea'],
   ['Baliadangi', '284.12', '234.12', '0', '50.00'],
   ['Haripur', '201.07', '199.96', '0', '1.11'],
   ['Pirgonj', '353.99', '346.88', '4.28', '2.83'],
   ['Ranisankail', '287.59', '283.34', '0.23', '4.02'],
   ['Thakurgaon Sadar', '654.95', '651.81', '0.36', '2.78'],
   ['Total', '1781.72', '1716.11', '4.87', '60.74']]],
 ['p',
  'List Paragraph',
  'Number of municipality, union, mauza, mahalla and village 2011'],
 ['t',
  [['Upazila',
    'Municipality',
    'Ward\n(PSA)',
    'Mahalla',
    'Union',
    'Mauza',
    'Village',
    'Depopulated\nmauza'],
   ['Baliadangi', '0', '0', '0', '8', '78', '78', '0'],
   ['Haripur', '0', '0', '0', '6', '75', '73', '3'],
   ['Pirgonj', '1', '9', '11', '10', '168', '168', '0'],

In [1832]:
# Derive the district name from the input file name. os.path handles the
# platform's path separator; splitting on "\\" only works on Windows and
# yields an empty name for paths such as "./pickles/Thakurgaon.pkl".
district_name = os.path.splitext(os.path.basename(pkl_files[INDEX]))[0]

# the output directory is created on demand
output_dir = "./pickles2"
os.makedirs(output_dir, exist_ok=True)
output_file_name = "{}/{}.pkl".format(output_dir, district_name)

with open(output_file_name, 'wb') as f:
    pickle.dump(chapterwise_content, f)
    print("Successfully created", output_file_name)

Successfully created ./pickles2/Thakurgaon.pkl
