# NLP Assignment

In [248]:
# !pip install PyMuPDF
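# !pip install fitz  # NOTE(review): PyMuPDF already provides the `fitz` module; the separate PyPI package named "fitz" is likely unnecessary - confirm before enabling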
# !pip install camelot-py
# !pip install ghostscript
# !apt install ghostscript python3-tk

## Imports

In [250]:
import fitz
import camelot

## Task 1

- Download the publicly available reports and extract their contents.  
- I have used `Fitz` (PyMuPDF) to get the text data and `Camelot` to get the tabular data.
- The final output is a dictionary containing the text and table data for each page.
- Pass the PDF file path to the function `get_doc_data`; it returns the output dictionary (`output_json`).


In [256]:
def read_table_using_camelot(pdf_path, page_no):
    '''
    Collect every table Camelot detects on one page of a PDF.

    Args:
        pdf_path (str): path of the PDF handed to ``camelot.read_pdf``
        page_no (int): page index; Camelot is asked for page
            ``page_no + 1``, so callers presumably pass a zero-based index
    Returns:
        list: one dict per detected table with the table rows under
        ``'data'`` (``table.df.to_dict('records')``) and the table's
        ``_bbox`` under ``'bbox'``. Empty when nothing is detected; when an
        exception occurs, whatever was collected so far is returned.
    '''
    collected = []

    try:
        detected = camelot.read_pdf(pdf_path, pages=f"{page_no+1}")

        if not detected:
            return []

        # Append one by one so partial results survive a mid-loop failure.
        for found in detected:
            rows = found.df.to_dict('records')
            collected.append({'data': rows, "bbox": found._bbox})

        return collected

    except Exception as e:
        # Best effort: report the problem and hand back what we have.
        print("Exception in read_table_using_camelot", e)
        return collected

def change_y_of_table(tables, height):
    """
    Flip the vertical coordinates of every Camelot table bbox so they
    align with the fitz coordinate system.

    Each ``(x1, y1, x2, y2)`` bbox becomes
    ``(x1, height - y2, x2, height - y1)``. The table dicts are updated in
    place and the same list is returned.
    """
    for entry in tables:
        x_start, y_start, x_end, y_end = entry["bbox"]
        entry["bbox"] = (x_start, height - y_end, x_end, height - y_start)
    return tables


def add__d(data, bbox, type_):
    """
    Wrap a piece of extracted content in the record layout used by the
    output dictionary: the content itself, its bbox, and its type tag.
    """
    return {
        "data": data,
        "bbox": bbox,
        "type": type_,
    }


def add_tables_to_output(pdf_path, h, pp, text_data, page_no=None):
    """
    Read the tables of one page with Camelot and append them to the output.

    Args:
        pdf_path (str): pdf file path
        h (float): page height used to flip the Camelot y coordinates
        pp (str): key of the page inside ``text_data`` (e.g. ``"page_3"``)
        text_data (dict): output dictionary; ``text_data[pp]["data"]`` must
            already exist and receives one ``"table"`` record per table
        page_no (int, optional): page index to read. When omitted it is
            taken from the numeric suffix of ``pp`` (falling back to 0), so
            each page gets its own tables instead of always those of the
            first page.
    Returns:
        tuple: ``(text_data, tables)`` where ``tables`` is the list of table
        dicts with flipped bboxes, or ``[]`` when none were found or an
        error occurred.
    """
    try:
        if page_no is None:
            # Recover the index from the "page_<n>" key built by the caller.
            suffix = str(pp).rsplit("_", 1)[-1]
            page_no = int(suffix) if suffix.isdecimal() else 0

        # get tables using camelot
        tables = read_table_using_camelot(pdf_path, page_no)
        tables = change_y_of_table(tables, h)

        if not tables:
            return text_data, []

        for table in tables:
            data = add__d(table["data"], table["bbox"], "table")
            text_data[pp]["data"].append(data)
        return text_data, tables
    except Exception as e:
        print("Exception in add_tables_to_output", e)
        return text_data, []

def check_overlap(t1, t2):
    """
    Tell whether two (start, end) spans share at least one integer position.

    Both spans are truncated with ``int()`` and treated as half-open
    ranges ``[start, end)``; an empty or reversed span never overlaps.
    """
    start_a, end_a = int(t1[0]), int(t1[1])
    start_b, end_b = int(t2[0]), int(t2[1])

    # Two half-open integer ranges intersect exactly when the later start
    # lies before the earlier end.
    return max(start_a, start_b) < min(end_a, end_b)


def check_if_table_overlap(tables, bbox, overlap_flag):
    """
    Function to check if line coordinate overlaps with table coordinate

    Args:
        tables (list): table dicts, each carrying a ``"bbox"`` of the form
            ``(x1, y1, x2, y2)``
        bbox (sequence): bbox of the line, same layout
        overlap_flag (bool): value returned unchanged when ``tables`` is
            empty
    Returns:
        bool: True when the vertical span of ``bbox`` overlaps the vertical
        span of ANY table, False when tables exist but none overlaps.
    """
    if not tables:
        return overlap_flag

    line_span = (bbox[1], bbox[3])
    # Stop at the first hit; previously a later non-overlapping table reset
    # the flag, so only the last table in the list was effectively checked.
    return any(
        check_overlap(line_span, (t["bbox"][1], t["bbox"][3])) for t in tables
    )


def get_doc_data(pdf_path):
    """
    Function to return text lines and tables from given doc.

    For every page the result holds the page size plus a list of records
    (see ``add__d``): ``"table"`` records added by ``add_tables_to_output``
    and ``"line"`` records for each non-empty text line whose vertical span
    does not overlap a detected table. Each page's records are sorted by
    the top coordinate of their bbox.

    Args:
        pdf_path (str): pdf file path
    Returns:
        text_data: Dict keyed by ``"page_<n>"``; each value has ``'width'``,
        ``'height'`` and ``'data'``. On error the exception is printed and
        whatever was collected so far is returned.
    """

    text_data = {}

    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            for page_no in range(doc.page_count):

                page = doc[page_no]
                w, h = page.mediabox.width, page.mediabox.height

                pp = f"page_{page_no}"
                text_data[pp] = {'width': w, 'height': h, 'data': []}

                # get table data using camelot and add it to output json (text_data)
                text_data, tables = add_tables_to_output(pdf_path, h, pp, text_data)

                for block in page.get_text("dict")["blocks"]:
                    # image blocks carry no text lines
                    if "image" in block:
                        continue

                    for line in block["lines"]:
                        line_text = " ".join(
                            span["text"] for span in line["spans"]
                        ).strip()
                        if not line_text:
                            continue

                        bbox = [round(coord, 2) for coord in line["bbox"]]

                        # a line overlapping a table bbox is already part of
                        # the table record, so do not add it again
                        if tables and check_if_table_overlap(tables, bbox, False):
                            continue

                        text_data[pp]['data'].append(add__d(line_text, bbox, 'line'))

                # sort this page's data as per top (was hard-coded to 'page_0')
                text_data[pp]['data'].sort(key=lambda entry: entry['bbox'][1])
    except Exception as e:
        print("Exception in get_doc_data", e)
    return text_data


In [257]:
# Absolute path of the sample PDF (looks like a Colab Drive mount); adjust for your environment.
pdf_path = "/content/drive/MyDrive/Colab Notebooks/dimensionless/table.pdf"
# Extract lines and tables of every page into one dictionary keyed by "page_<n>".
output_json = get_doc_data(pdf_path)

In [258]:
# Display the full extraction result for inspection.
output_json

{'page_0': {'width': 612.0,
  'height': 792.0,
  'data': [{'data': 'Example table',
    'bbox': [72.0, 95.86, 166.3, 112.32],
    'type': 'line'},
   {'data': 'This is an example of a data table.',
    'bbox': [72.0, 115.59, 295.68, 131.55],
    'type': 'line'},
   {'data': [{0: 'Disability \nCategory',
      1: 'Participants',
      2: 'Ballots \nCompleted',
      3: 'Ballots \nIncomplete/ \nTerminated',
      4: 'Results',
      5: ''},
     {0: '', 1: '', 2: '', 3: '', 4: 'Accuracy', 5: 'Time to \ncomplete'},
     {0: 'Blind', 1: '5', 2: '1', 3: '4', 4: '34.5%, n=1', 5: '1199 sec, n=1'},
     {0: 'Low Vision',
      1: '5',
      2: '2',
      3: '3',
      4: '98.3% n=2 \n(97.7%, n=3)',
      5: '1716 sec, n=3 \n(1934 sec, n=2)'},
     {0: 'Dexterity',
      1: '5',
      2: '4',
      3: '1',
      4: '98.3%, n=4',
      5: '1672.1 sec, n=4'},
     {0: 'Mobility',
      1: '3',
      2: '3',
      3: '0',
      4: '95.4%, n=3',
      5: '1416 sec, n=3'}],
    'bbox': (84.24, 144.0

### Get text data

In [259]:
# Index of the page to inspect; selects the "page_<no>" entry of output_json.
no = 0
# Keep only the records tagged as text lines and take their text content.
lines = [d['data'] for d in output_json[f'page_{no}']['data'] if d['type'] == 'line']
print(lines)

['Example table', 'This is an example of a data table.']


### Get table data
- Each table is converted to JSON-style records (a list of row dictionaries).

In [261]:
# Index of the page to inspect; selects the "page_<no>" entry of output_json.
no = 0
# Keep only the records tagged as tables; each one is a list of row dicts.
tables = [d['data'] for d in output_json[f'page_{no}']['data'] if d['type'] == 'table']
# Print the first table only when the page actually contains one.
if tables:
    print(tables[0])

[{0: 'Disability \nCategory', 1: 'Participants', 2: 'Ballots \nCompleted', 3: 'Ballots \nIncomplete/ \nTerminated', 4: 'Results', 5: ''}, {0: '', 1: '', 2: '', 3: '', 4: 'Accuracy', 5: 'Time to \ncomplete'}, {0: 'Blind', 1: '5', 2: '1', 3: '4', 4: '34.5%, n=1', 5: '1199 sec, n=1'}, {0: 'Low Vision', 1: '5', 2: '2', 3: '3', 4: '98.3% n=2 \n(97.7%, n=3)', 5: '1716 sec, n=3 \n(1934 sec, n=2)'}, {0: 'Dexterity', 1: '5', 2: '4', 3: '1', 4: '98.3%, n=4', 5: '1672.1 sec, n=4'}, {0: 'Mobility', 1: '3', 2: '3', 3: '0', 4: '95.4%, n=3', 5: '1416 sec, n=3'}]
