# Layer Analysis on Single File

In [1]:
# io provides stream (input/output) classes; BytesIO is an in-memory binary buffer
from io import BytesIO
# glob lists all file names matching a pattern, which is convenient for finding the PDF files in a folder
import glob
# re (regular expressions) to find strings that match certain patterns
import re

In [2]:
# Collect every PDF path under ./case_test. The recorded listing below is not
# sorted, so positional indexes such as case_list[3] depend on the order in
# which glob happened to return the files.
case_list = glob.glob('./case_test/*.pdf')

In [3]:
case_list

['./case_test/001 - Intellectual Ventures I LLC v Motorola Mobility LLC.pdf',
 './case_test/051 - Ohio Willow Wood Co v Alps South LLC.pdf',
 './case_test/001 - In re Jobdiva Inc.pdf',
 './case_test/058 - Ericsson Inc v D-Link Systems Inc.pdf',
 './case_test/035 - Info-Hold Inc v Applied Media Technologies Corp.pdf',
 "./case_test/042 - Align Technology Inc v International Trade Com'n.pdf",
 './case_test/03 - Ashley Furniture Industries Inc v US.pdf',
 './case_test/010 - KS Himpp v Hear-Wear Technologies LLC.pdf',
 './case_test/028 - Shenyang Yuanda Aluminum Industry Engineering Co Ltd v US.pdf']

In [6]:
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

#### Device configuration for the PDF reader

In [17]:
# Layout-analysis settings, left at the library defaults.
laparams = LAParams()
# Resource manager shared by the device and the interpreter.
rsrcmgr = PDFResourceManager()
# The aggregator device holds the layout of each processed page; the
# interpreter is what feeds pages into that device.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

In [18]:
# Open the PDF at index 3 of case_list for binary reading. The handle is kept
# in `document` for the page loops in later cells; no visible cell closes it.
document = open(case_list[3], 'rb')

In [19]:
layout_text = []
# maxpages=0 is passed explicitly; pdfminer treats 0 as "no page limit".
for page in PDFPage.get_pages(document, maxpages=0):
    interpreter.process_page(page)
    # The device now holds the layout object for the page just processed.
    layout = device.get_result()
    # Keep only the text of horizontal text boxes.
    layout_text.extend(
        element.get_text()
        for element in layout
        if isinstance(element, LTTextBoxHorizontal)
    )

In [10]:
def print_layout(layout_text):
    """Print every layer preceded by a numbered separator line.

    Args:
        layout_text: sequence of strings, one per extracted layer.
    """
    for position, layer in enumerate(layout_text):
        header = '{} {} {}'.format('------------- Layer', position, " ---------------")
        print(header)
        print(layer)

In [22]:
layout_text[278]

'Opinion\n'

In [12]:
# Print each layer whose text contains "Plaintiff".
for layer in layout_text:
    if "Plaintiff" in layer:
        print(layer)

ERICSSON, INC., Telefonaktiebolaget LM Ericsson, and Wi–Fi One, LLC, Plaintiffs–Appellees,



In [13]:
# Print each layer whose text contains "Defendant".
for layer in layout_text:
    if "Defendant" in layer:
        print(layer)

Corporation, and Gateway, Inc., Defendants–Appellants,

Dell, Inc., Defendant–Appellant,

Toshiba America Information Systems, Inc. and Toshiba Corporation, Defendants–Appellants,

Belkin International, Inc., Defendant.



In [14]:
# Print each layer whose text contains "Decided".
for layer in layout_text:
    if "Decided" in layer:
        print(layer)

In [15]:
# Print each layer whose text contains "Synopsis".
for layer in layout_text:
    if "Synopsis" in layer:
        print(layer)

Synopsis
Background: Patentee commenced action against competitors, alleging infringement of patents generally relating to Wi–
Fi technology employed by electronic devices to wirelessly access the Internet. The United States District Court for the
Eastern District of Texas, Leonard Davis, J., 2013 WL 4046225, denied competitor's post-judgment motions after a jury
verdict in the patentee's favor and upheld the jury's infringement and validity findings and refused to grant a new trial.
Competitors appealed.



In [16]:
# Print each layer containing "Reversed" or "Affirmed" (capitalised as written).
for layer in layout_text:
    found = re.search('(Reversed|Affirmed)', layer)
    if found is not None:
        print(layer)

Affirmed in part, reversed in part, vacated in part, and remanded.



In [56]:
string = "Affirmed in part and reversed in part."

# Look for the capitalised word 'Reversed' anywhere in the string.
# The sample only contains lowercase 'reversed', and the following cell
# shows that match is None, i.e. the search is case-sensitive here.
match = re.search('Reversed', string)

In [58]:
match==None

True

In [20]:
# Print each layer that has "Before" followed later by "Judges" plus one more character.
for layer in layout_text:
    found = re.search('(Before)(.*)(Judges.)', layer)
    if found is not None:
        print(layer)

Before O'MALLEY, TARANTO, and HUGHES, Circuit Judges.



In [None]:
# List the position of every layer that is exactly the 'Opinion' heading.
# The original loop had an `if` with no indented body and could not run.
# A list is used so that a document without the heading yields [] instead
# of raising ValueError from list.index.
[idx for idx, layer in enumerate(layout_text) if layer == 'Opinion\n']

In [24]:
layout_text[layout_text.index('Opinion\n')+1]

"O'MALLEY, Circuit Judge.\n"

In [7]:
layout_text = []
for page in PDFPage.get_pages(document):
    interpreter.process_page(page)
    # Fetch the layout the device built for the page just processed.
    layout = device.get_result()
    # Append the text of each horizontal text box, in iteration order.
    layout_text += [
        item.get_text() for item in layout if isinstance(item, LTTextBoxHorizontal)
    ]

In [9]:
def decompose_layer(file):
    """Extract the text of every horizontal text box in a PDF file.

    Args:
        file: path of the PDF file to read.

    Returns:
        A list of strings, one per LTTextBoxHorizontal element, in the order
        the pages and their layout elements are iterated.
    """
    # Build the pdfminer pipeline locally so the function no longer depends on
    # notebook-level `interpreter` / `device` variables having been created by
    # an earlier cell.
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    layout_text = []
    # The context manager closes the file handle even if parsing raises.
    with open(file, 'rb') as document:
        for page in PDFPage.get_pages(document):
            interpreter.process_page(page)
            # The device holds the layout object for the page just processed.
            layout = device.get_result()
            layout_text.extend(
                element.get_text()
                for element in layout
                if isinstance(element, LTTextBoxHorizontal)
            )

    return layout_text

In [11]:
def write_layout(layout_text, filename):
    """Write every layer to a text file, each preceded by a numbered separator.

    Args:
        layout_text: sequence of strings, one per extracted layer.
        filename: path of the text file to create or overwrite.
    """
    # `with` guarantees the file is closed even if a write fails. UTF-8 is
    # requested explicitly because the extracted text contains non-ASCII
    # characters (e.g. en dashes) that a platform-default encoding may reject.
    with open(filename, 'w', encoding='utf-8') as out:
        for i, layer in enumerate(layout_text):
            out.write('{} {} {}'.format('------------- Layer', i, " ---------------\n"))
            out.write(layer)

In [15]:
# Dump the layers of the first three cases to .txt files that keep the
# PDF's directory and base name.
for file in case_list[0:3]:
    layout_text = decompose_layer(file)
    # Strip only a leading "./" and a trailing ".pdf"; str.replace would also
    # remove those substrings from the middle of a path (e.g. "../x.pdf").
    stem = file[2:] if file.startswith("./") else file
    if stem.endswith(".pdf"):
        stem = stem[:-len(".pdf")]
    file_name = '{}{}'.format(stem, ".txt")
    write_layout(layout_text, file_name)