## Import Required Libraries

In [1]:
import cv2
import json
import numpy as np
import os
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from math import ceil
from PIL import Image

## Load Dataset

Option 1 - Load from drive

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
! unzip /content/drive/MyDrive/BTP_Dataset/unlv_images.zip

Option 2 - Download from https://drive.google.com/drive/folders/14Citi2bfhyQKfDonzcyv4RO9A0bNCJYw?usp=sharing and Upload to Colab

In [None]:
! unzip unlv_images.zip

## Initalize Empty Folders

In [4]:
! mkdir row
! mkdir row/img
! mkdir row/mask
! mkdir col
! mkdir col/img
! mkdir col/mask

In [6]:
DS = "/content/"
TRAIN_ROW = "/content/row/img/"
TRAIN_COL = "/content/col/img/"
ROW_MASK = "/content/row/label/"
COL_MASK = "/content/col/label/"

## Run script on UNLV

In [7]:
c = 0; read = 0; write = 0

for file in os.listdir(DS + "unlv_xml_gt"):

    if file.endswith(".xml"):
    
        row_file = file.replace(".xml",".png")
        col_file = file.replace(".xml",".png")

        tree = ET.parse(DS + "unlv_xml_gt/" + file) 
        
        # getting the parent tag of the xml document 
        root = tree.getroot()

        doc_file = file.replace(".xml",".png")
        document = cv2.imread(DS + "unlv_images/" + doc_file)

        for elem in root:

            local_r = document.copy()
            local_r[:,:] = 0

            local_c = document.copy()
            local_c[:,:] = 0

            for subelem in elem.findall('Table'):
                table_dict = subelem.attrib
                t_left = int(table_dict['x0'])
                t_right = int(table_dict['x1'])
                t_top = int(table_dict['y0'])
                t_down = int(table_dict['y1'])
                
                local_r[t_top:t_down,t_left:t_right] = 255
                local_c[t_top:t_down,t_left:t_right] = 255

                for subsubelem in subelem.findall('Column'):
                    col_dict = subsubelem.attrib
                    c_left = int(col_dict['x0'])
                    c_right = int(col_dict['x1'])
                    c_top = int(col_dict['y0'])
                    c_down = int(col_dict['y1'])

                    local_c = cv2.line(local_c, (c_left,c_top), (c_right,c_down), (0,0,0), 8) 

                for subsubelem in subelem.findall('Row'):
                    row_dict = subsubelem.attrib
                    r_left = int(row_dict['x0'])
                    r_right = int(row_dict['x1'])
                    r_top = int(row_dict['y0'])
                    r_down = int(row_dict['y1'])
                    
                    local_r = cv2.line(local_r, (r_left,r_top), (r_right,r_down), (0,0,0), 8) 

            # crop the tables from doc
            document = document[t_top:t_down,t_left:t_right]
            local_r = local_r[t_top:t_down,t_left:t_right]
            local_c = local_c[t_top:t_down,t_left:t_right]
            
            # resize to 512x512
            document = cv2.resize(document, (512, 512))
            local_c = cv2.resize(local_c, (512, 512))
            local_r = cv2.resize(local_r, (512, 512))
            
            # Thresholding values below and above 128 to be 0 and 1 respectively
            # To create Mask
            ret, local_r = cv2.threshold(local_r, 128, 1, cv2.THRESH_BINARY)
            ret, local_c = cv2.threshold(local_c, 128, 1, cv2.THRESH_BINARY)
            
            # Save Mask
            cv2.imwrite(TRAIN_ROW + doc_file, document)
            cv2.imwrite(TRAIN_COL + doc_file, document)
            cv2.imwrite(ROW_MASK + row_file, local_r)
            cv2.imwrite(COL_MASK + col_file, local_c)

In [9]:
!pip install split-folders

Collecting split-folders
  Downloading split_folders-0.4.3-py3-none-any.whl (7.4 kB)
Installing collected packages: split-folders
Successfully installed split-folders-0.4.3


## Split folders to generate train and test dataset

In [10]:
import splitfolders

In [11]:
splitfolders.ratio("/content/row", output="/content/row", seed=42, ratio=(.8, .2), group_prefix=None)
splitfolders.ratio("/content/col", output="/content/col", seed=42, ratio=(.8, .2), group_prefix=None)

Copying files: 846 files [00:00, 5936.07 files/s]
Copying files: 846 files [00:00, 5495.46 files/s]


In [12]:
!rm -rf /content/row/img
!rm -rf /content/row/mask
!rm -rf /content/col/img
!rm -rf /content/col/mask

## Download Processed Data

In [None]:
!zip -r /content/row.zip /content/row

from google.colab import files
files.download("/content/row.zip")

!zip -r /content/col.zip /content/col

from google.colab import files
files.download("/content/col.zip")