# Step 1: Calculating the Mean and Std of the Dataset

In [1]:
import os
import sys
import cv2
import numpy as np
import pandas as pd
from PIL import Image
import glob
import matplotlib.pyplot as plt
import xmltodict
import json
from tqdm.notebook import tqdm

from pycocotools.coco import COCO

In [2]:
# Locations used throughout the notebook (relative to the working directory).
root_dir = "PRImA_Layout_Analysis_Dataset/"    # source dataset root
img_dir = "Images/"                             # image sub-directory
anno_dir = "XML/"                               # annotation sub-directory
final_root_dir = "Document_Layout_Analysis/"    # root of the converted output
# Names of all entries in the source image directory.
files = os.listdir(os.path.join(root_dir, img_dir))

In [3]:
def normalize(file_names=None):
    """Compute the per-channel mean and standard deviation of the dataset images.

    Each image is read with OpenCV and scaled to the [0, 1] range; per-channel
    sums and sums of squares are accumulated over all pixels of all images, so
    only one image is held in memory at a time.

    Parameters
    ----------
    file_names : iterable of str, optional
        File names inside ``root_dir + img_dir`` to include.  When omitted the
        directory is listed afresh, so the result does not depend on the
        notebook-level ``files`` variable (which a later cell rebinds to the
        annotation listing).

    Returns
    -------
    (rgb_mean, rgb_std) : tuple of list
        Three-element lists on the 0-1 scale.  The accumulated channel order
        is reversed before returning because ``cv2.imread`` yields BGR.

    Raises
    ------
    OSError
        If ``cv2.imread`` cannot read one of the files.
    ValueError
        If no pixels were accumulated (empty file list).
    """
    if file_names is None:
        file_names = os.listdir(root_dir + img_dir)

    channel_sum = np.zeros(3)
    channel_sum_squared = np.zeros(3)
    num_pixels = 0
    for file in file_names:
        file_path = root_dir + img_dir + file
        img = cv2.imread(file_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError("cv2.imread could not read image: " + file_path)
        img = img / 255.
        num_pixels += img.shape[0] * img.shape[1]
        channel_sum += np.sum(img, axis=(0, 1))
        channel_sum_squared += np.sum(np.square(img), axis=(0, 1))

    if num_pixels == 0:
        raise ValueError("no image pixels found in " + root_dir + img_dir)

    mean = channel_sum / num_pixels
    # E[x^2] - E[x]^2 can dip slightly below zero through rounding; clamp it so
    # the square root never produces NaN.
    variance = (channel_sum_squared / num_pixels) - mean**2
    std = np.sqrt(np.maximum(variance, 0))
    rgb_mean = list(mean)[::-1]
    rgb_std = list(std)[::-1]
    return rgb_mean, rgb_std

In [5]:
# Dataset statistics: the mean is rescaled to the 0-255 range, while the std
# is kept exactly as normalize() returns it (0-1 range).
rgb_stats = normalize()
mean = [channel * 255 for channel in rgb_stats[0]]
std = rgb_stats[1]
print(mean)
print(std)

[205.4947792530714, 210.0757122321346, 205.5501630335441]
[0.2429826679074847, 0.24957906533432994, 0.2625039283385689]


In [5]:
# Recorded output of the statistics cell above:
#   RGB mean (0-255 scale): [205.4947792530714, 210.0757122321346, 205.5501630335441]
#   RGB std  (0-1 scale):   [0.2429826679074847, 0.24957906533432994, 0.2625039283385689]

# Step 2: TIFF to JPEG

In [4]:
from PIL import Image
import glob

# Convert every TIFF in the source image directory to a JPEG with the same
# stem inside the output image directory.
jpeg_dir = final_root_dir + img_dir
os.makedirs(jpeg_dir, exist_ok=True)  # Image.save does not create missing directories

for name in glob.glob(root_dir + img_dir + '*.tif'):
    # str.rstrip / str.lstrip remove any characters from the given *set*, not
    # a literal suffix / prefix, so they can eat into the file stem.  Derive
    # the stem from the path instead.
    stem = os.path.splitext(os.path.basename(name))[0]
    # The context manager closes the underlying TIFF file once it is saved.
    with Image.open(name) as im:
        im.save(jpeg_dir + stem + '.jpg', 'JPEG')

# Step 3: Selective Data Augmentation and VOC to Monk Type
- Later on, the Monk format can be converted to whichever format the chosen model requires

# VOC Format

## Dataset Directory Structure

    home/PRImA Layout Analysis Dataset/ (root_dir)
          |
          |-----------Images (img_dir)
          |              |
          |              |------------------img1.jpg
          |              |------------------img2.jpg
          |              |------------------.........(and so on)
          |
          |
          |-----------Annotations (anno_dir)
          |              |
          |              |------------------img1.xml
          |              |------------------img2.xml
          |              |------------------.........(and so on)
          


# Monk Format

## Dataset Directory Structure

    home/PRImA Layout Analysis Dataset/ (root_dir)
          |
          |-----------Images (img_dir)
          |              |
          |              |------------------img1.jpg
          |              |------------------img2.jpg
          |              |------------------.........(and so on)
          |
          |
          |-----------train_labels.csv (anno_file)
          
          
## Annotation file format

           | Id         | Labels                                 |
           | img1.jpg   | x1 y1 x2 y2 label1 x1 y1 x2 y2 label2  |
           
- Labels:  xmin ymin xmax ymax label
- xmin, ymin - top left corner of bounding box
- xmax, ymax - bottom right corner of bounding box

In [5]:
# From here on `files` holds the annotation (XML) file names rather than the
# image file names it was first bound to.
files = os.listdir(os.path.join(root_dir, anno_dir))

In [6]:
# Accumulates [image file name, label string] rows for the final CSV.
combined = list()

In [7]:
def _shift_coord(value, shift, limit):
    """Move one coordinate by `shift`, clamping on the side it moves towards."""
    if shift > 0:
        return min(value + shift, limit)
    return max(value + shift, 0)


def _shift_labels(labels, dx, dy, width, height):
    """Return `labels` with every box translated by (dx, dy) and clamped.

    `labels` is a space-separated sequence of "x1 y1 x2 y2 label" groups.
    Boxes that collapse to zero width or zero height after clamping are
    dropped.
    """
    tokens = labels.split(" ")
    shifted = []
    for j in range(len(tokens) // 5):
        x1 = _shift_coord(float(tokens[j*5 + 0]), dx, width)
        y1 = _shift_coord(float(tokens[j*5 + 1]), dy, height)
        x2 = _shift_coord(float(tokens[j*5 + 2]), dx, width)
        y2 = _shift_coord(float(tokens[j*5 + 3]), dy, height)
        label = tokens[j*5 + 4]
        if x1 != x2 and y1 != y2:
            shifted.append(' '.join(
                [str(int(x1)), str(int(y1)), str(int(x2)), str(int(y2)), label]))
    return ' '.join(shifted)


def augmentData(fname, labels):
    """Write four randomly translated copies of an image and record their labels.

    The image ``final_root_dir + img_dir + fname`` is shifted right/up,
    left/down, right/down and left/up; the copies are saved next to it with
    the prefixes '1', '2', '3' and '4'.  One ``[file name, label string]`` row
    per copy is appended to the notebook-level ``combined`` list.

    Parameters
    ----------
    fname : str
        File name of the image inside ``final_root_dir + img_dir``.
    labels : str
        Space-separated "xmin ymin xmax ymax label" groups for that image.

    Raises
    ------
    OSError
        If ``cv2.imread`` cannot read the image.
    ValueError
        From ``np.random.randint`` when a quarter of the image height or
        width is not larger than 10 pixels (empty sampling range).

    Notes
    -----
    Shift magnitudes come from the global NumPy random state; seed it with
    ``np.random.seed`` beforehand for reproducible output.
    """
    image_path = final_root_dir + img_dir + fname
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError("cv2.imread could not read image: " + image_path)

    height, width = image.shape[:2]
    # Integer quarters keep the randint bounds integral rather than relying on
    # implicit truncation of float bounds.
    quarter_height, quarter_width = height // 4, width // 4
    # Draw order (down, up, right, left) is kept so a seeded run is unchanged.
    down_trans = np.random.randint(10, quarter_height + 1)
    up_trans = np.random.randint(-1 * quarter_height, -10)
    right_trans = np.random.randint(10, quarter_width + 1)
    left_trans = np.random.randint(-1 * quarter_width, -10)

    # (file-name prefix, horizontal shift, vertical shift) for each copy.
    translations = [
        ('1', right_trans, up_trans),
        ('2', left_trans, down_trans),
        ('3', right_trans, down_trans),
        ('4', left_trans, up_trans),
    ]

    for prefix, dx, dy in translations:
        matrix = np.float32([[1, 0, dx], [0, 1, dy]])
        translated = cv2.warpAffine(image, matrix, (width, height))
        cv2.imwrite(final_root_dir + img_dir + prefix + fname, translated)

    # Build every row before touching `combined`, so a malformed label string
    # cannot leave a partial set of rows behind.
    rows = [[prefix + fname, _shift_labels(labels, dx, dy, width, height)]
            for prefix, dx, dy in translations]
    combined.extend(rows)
        
        

In [8]:
# Label written for each non-text region element of <Page>, and whether a page
# containing that element gets augmented with translated copies.
_REGION_LABELS = {
    'GraphicRegion': ("graphics", True),
    'ImageRegion': ("image", False),
    'NoiseRegion': ("noise", True),
    'ChartRegion': ("chart", True),
    'TableRegion': ("table", True),
    'SeparatorRegion': ("separator", False),
    'MathsRegion': ("maths", True),
    'LineDrawingRegion': ("linedrawing", True),
}
# Every other child key of <Page> is labelled "frame".
# NOTE(review): this assumes the only remaining element is a frame region;
# confirm against the annotation schema.
_DEFAULT_REGION = ("frame", True)
# Attributes of <Page> that are metadata, not regions.
_PAGE_ATTRIBUTES = ('@imageFilename', '@imageWidth', '@imageHeight')


def _as_list(value):
    """Wrap a single parsed child in a list; repeated children already are one."""
    return value if isinstance(value, list) else [value]


def _region_box(region, anno):
    """Return "xmin ymin xmax ymax" enclosing the region's points, or None.

    None is returned when the region has no coordinates or no points, so the
    caller never emits an inverted (width, height, 0, 0) placeholder box.
    """
    xmin = int(anno['@imageWidth'])
    ymin = int(anno['@imageHeight'])
    xmax = 0
    ymax = 0
    if not region["Coords"]:
        return None
    points = region["Coords"]["Point"]
    if not points:
        return None
    for k in range(len(points)):
        coordinates = points[k]
        xmin = min(xmin, int(coordinates['@x']))
        ymin = min(ymin, int(coordinates['@y']))
        xmax = max(xmax, int(coordinates['@x']))
        ymax = max(ymax, int(coordinates['@y']))
    return str(xmin) + ' ' + str(ymin) + ' ' + str(xmax) + ' ' + str(ymax)


# Convert every annotation file into one [image name, label string] row of
# `combined`, augmenting pages that contain one of the flagged region types.
for anno_name in tqdm(files):
    augment = False
    annoFile = root_dir + anno_dir + anno_name
    # Context manager so the handle is closed for each of the annotation files.
    with open(annoFile, 'r') as f:
        my_xml = f.read()
    anno = dict(dict(dict(xmltodict.parse(my_xml))['PcGts'])['Page'])

    # The image name is the digits of the annotation file name plus ".jpg".
    fname = "".join(ch for ch in anno_name if '0' <= ch <= '9') + ".jpg"

    boxes = []
    for key in anno.keys():
        if key in _PAGE_ATTRIBUTES:
            continue
        if key == "TextRegion":
            for region in _as_list(anno[key]):
                box = _region_box(region, anno)
                # Text regions take their label from the '@type' attribute;
                # regions without one are skipped.
                if box is not None and '@type' in region.keys():
                    boxes.append(box + ' ' + region['@type'])
        else:
            val, needs_augment = _REGION_LABELS.get(key, _DEFAULT_REGION)
            if needs_augment:
                augment = True
            for region in _as_list(anno[key]):
                box = _region_box(region, anno)
                if box is not None:
                    boxes.append(box + ' ' + val)

    label_str = " ".join(boxes)
    combined.append([fname, label_str])
    if augment:
        augmentData(fname, label_str)
        

HBox(children=(FloatProgress(value=0.0, max=475.0), HTML(value='')))




In [9]:
# Write the collected rows to the Monk-format annotation file.
df = pd.DataFrame(data=combined, columns=['ID', 'Label'])
labels_csv_path = final_root_dir + "/train_labels.csv"
df.to_csv(labels_csv_path, index=False)

In [10]:
# Tally how many boxes of each class were collected, and how many images.
counts = {}
count = len(combined)
for ID, labels in combined:
    tokens = labels.split(" ")
    # Labels come in groups of five tokens; the fifth of each group is the class.
    for word in tokens[4::5]:
        counts[word] = counts.get(word, 0) + 1
print(counts)
print(count)

{'header': 416, 'paragraph': 19568, 'caption': 1394, 'drop-capital': 507, 'footer': 1111, 'image': 1972, 'noise': 246, 'heading': 3447, 'credit': 790, 'page-number': 833, 'floating': 1004, 'separator': 3344, 'chart': 206, 'graphics': 1436, 'table': 236, 'maths': 200, 'linedrawing': 70, 'frame': 61}
1783
