# Identify the Position of References by Layer analysis

*Yi Yin*

## Table of Contents

1. Extract References
2. Position of the References



#### Environment Information

- Show the Python version and operating system of the computer used to run this notebook.
- This helps others who use the code to reproduce the results.

In [2]:
import IPython

# Information of my Python version, computer system
print(IPython.sys_info())

{'commit_hash': '523ed2fe5',
 'commit_source': 'installation',
 'default_encoding': 'UTF-8',
 'ipython_path': '/anaconda/envs/nlp/lib/python3.6/site-packages/IPython',
 'ipython_version': '7.2.0',
 'os_name': 'posix',
 'platform': 'Darwin-18.5.0-x86_64-i386-64bit',
 'sys_executable': '/anaconda/envs/nlp/bin/python',
 'sys_platform': 'darwin',
 'sys_version': '3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n'
                '[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'}


#### Read PDF files

In [3]:
# import the pdfminer components used to read PDF files
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

# import io (input and output); BytesIO is an in-memory buffer that collects the extracted text as bytes
from io import BytesIO

# extract all file name in a folder, for the convenience of reading PDF files
import glob

# used later to store cleaned strings in a JSON file
import simplejson as json

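# re (regular expressions) to find strings that follow certain patterns
# NOTE: the name `re` is rebound to the third-party `regex` module further down (`import regex as re`)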
import re

#### A function to read PDF file:
 
    pdf_file: the file name of the PDF, including its path (i.e. location)
    return: content of the PDF as a bytes object
    (the text is written into a BytesIO buffer, so it must be decoded to get a str)


In [4]:
def read_pdf(pdf_file):
    """
    Extract the text of a PDF file with pdfminer.

    :param pdf_file: path of the PDF file to read
    :return: the text of all processed pages as UTF-8 encoded ``bytes``
        (call ``.decode("utf-8")`` on it to obtain a ``str``)
    """
    resource_mgr = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()
    # No page limit and no page filter: every page of the document is processed.
    maxpages = 0
    caching = True
    pagenos = set()

    # The context managers and the ``finally`` clause release the buffer, the
    # converter and the file even when pdfminer raises on a malformed PDF.
    with BytesIO() as retstr:
        device = TextConverter(resource_mgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(resource_mgr, device)
            with open(pdf_file, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)

            # Copy the buffer content out before the buffer is closed.
            return retstr.getvalue()
        finally:
            device.close()

In [5]:
# Every PDF in the test folder.
case_list = glob.glob('./case_test/*.pdf')

# Two parallel lists, index-aligned with case_list:
#   pdf_name    - the file name without the folder prefix and the extension
#   pdf_content - the extracted text of the file (bytes, as returned by read_pdf)
pdf_name, pdf_content = [], []
for case in case_list:
    bare_name = case.replace('./case_test/', '').replace('.pdf', '')
    pdf_name.append(bare_name)
    pdf_content.append(read_pdf(case))

In [6]:
pdf_content[0][:300]

b'Intellectual Ventures I LLC v. Motorola Mobility LLC, 870 F.3d 1320 (2017)\n124 U.S.P.Q.2d 1129\n\n870 F.3d 1320\n\nUnited States Court of Appeals,\n\nFederal Circuit.\n\nINTELLECTUAL VENTURES I LLC, Intellectual Ventures II LLC, Plaintiffs\xe2\x80\x93Appellees\n\nMOTOROLA MOBILITY LLC, fka Motorola Mobility, INC., Def'

Decode the PDF contents

In [7]:
# Decode the bytes content of every PDF into a str. "utf-8" is the codec that
# read_pdf used when the extracted text was written.
court_text = [content.decode("utf-8") for content in pdf_content]

In [8]:
type(court_text[0])

str

In [16]:
pdf_name

['001 - Intellectual Ventures I LLC v Motorola Mobility LLC',
 '051 - Ohio Willow Wood Co v Alps South LLC',
 '001 - In re Jobdiva Inc',
 '058 - Ericsson Inc v D-Link Systems Inc',
 '035 - Info-Hold Inc v Applied Media Technologies Corp',
 "042 - Align Technology Inc v International Trade Com'n",
 '03 - Ashley Furniture Industries Inc v US',
 '010 - KS Himpp v Hear-Wear Technologies LLC',
 '028 - Shenyang Yuanda Aluminum Industry Engineering Co Ltd v US']

## 1. Extract References

### 1.1. Extract US patent

Function to extract US patent numbers

In [9]:
import regex as re
from typing import Generator

In [11]:
# Pattern for US patent numbers written as "5,379,340" or "5379340":
#   block1 - the leading one or two digits (two digits cover patent numbers
#            from 10,000,000 upwards)
#   block2 - the next three digits
#   block3 - the last three digits
# The commas between the blocks are optional. The look-arounds require that the
# number is not glued to further digits, so the pattern no longer fires inside
# a longer number (e.g. "123456789" or "123,456,789").
# NOTE: any other free-standing 7- or 8-digit number still matches, so the
# results can contain numbers that are not patents.
US_PATENT = r"""
(?<![0-9])
(?P<block1>[0-9]{1,2})[\,]?(?P<block2>[0-9]{3})[\,]?(?P<block3>[0-9]{3})
(?![0-9])
"""
RE_PATENT = re.compile(US_PATENT, re.IGNORECASE | re.UNICODE | re.DOTALL | re.VERBOSE)

In [17]:
def get_patent(text, return_sources=False) -> Generator:
    """
    Yield every US patent reference that RE_PATENT finds in the text.

    :param text: the string to scan
    :param return_sources: when true, yield ``(patent, matched_text)`` tuples
        instead of bare patent strings
    :return: generator of patent numbers rebuilt as "block1,block2,block3"
    """
    block_names = ("block1", "block2", "block3")

    for found in RE_PATENT.finditer(text):
        # Every named block takes part exactly once in a match, so its group
        # value is the one capture recorded for it.
        patent = ",".join(found.group(name) for name in block_names)

        if not return_sources:
            yield patent
        else:
            yield patent, found.group()


In [30]:
# One list of patent numbers per PDF. dict.fromkeys drops repeated numbers
# while keeping the order in which they first appear.
patent_list = [list(dict.fromkeys(get_patent(text))) for text in court_text]

### 1.2. Extract Citations

In [276]:
from reporters_db import EDITIONS

In [277]:
# Add a space-free alias for every edition: the key with its spaces removed is
# mapped to the value with its spaces removed. This changes the imported
# EDITIONS dict in place; list() snapshots the keys because entries are added
# while looping.
# Presumably this lets citations written without spaces match as well - TODO confirm.
for key in list(EDITIONS.keys()):
    EDITIONS[key.replace(" ", "")]=EDITIONS[key].replace(" ", "")

In [279]:
import pandas as pd

In [280]:
# Alternation of all reporter abbreviations known to EDITIONS, each escaped so
# that it is matched literally.
_REPORTER_ALTERNATION = '|'.join(re.escape(edition) for edition in EDITIONS)

# Citation pattern (verbose mode, whitespace in the pattern is ignored):
#   - a delimiter in front: whitespace, ",", ":", "(" or the start of a line
#   - group 1: the whole citation, made of
#       group 2: the volume number followed by whitespace
#       group 3: a reporter abbreviation
#       group 4: one or more whitespace or letter characters
#       group 5: the page number
CITATION_PTN = r"""
(?:[\s,:\(]|^)
(
(\d+)\s+
({reporters})(\s|[a-z])+
(\d+)
)
""".format(reporters=_REPORTER_ALTERNATION)
CITATION_PTN_RE = re.compile(
    CITATION_PTN,
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE,
)

In [281]:
# Try the pattern on the second document. Every item returned by findall is a
# tuple with one entry per capture group; the first entry is the whole citation.
result = CITATION_PTN_RE.findall(court_text[1])

In [284]:
def extract_citations(text, max_length=30):
    """
    Return the cleaned citations that CITATION_PTN_RE finds in the text.

    Each match is whitespace-normalised (runs of spaces, tabs and newlines
    become one space) and the stand-alone word "at" is dropped, so that
    "870 F.3d at\\n1325" becomes "870 F.3d 1325".

    :param text: the string to scan
    :param max_length: cleaned citations longer than this many characters are
        discarded; the default 30 is the limit that used to be hard-coded
        (presumably it filters out false positives - TODO confirm)
    :return: list of cleaned citation strings in order of appearance,
        duplicates included
    """
    cite_all = []
    for cite in CITATION_PTN_RE.findall(text):
        # cite[0] is the outermost capture group, i.e. the whole citation.
        # Splitting on whitespace first means "at" is recognised no matter
        # which whitespace follows it, and only as a whole word, so tokens
        # that merely end in "at" are left intact.
        tokens = [token for token in cite[0].split() if token != "at"]
        the_cite = " ".join(tokens)
        if len(the_cite) <= max_length:
            cite_all.append(the_cite)
    return cite_all
    

In [285]:
# cite_list holds one list per PDF in the folder; each inner list contains the
# citations of that PDF with repeats removed (first appearance order is kept).
cite_list = [list(dict.fromkeys(extract_citations(text))) for text in court_text]
        

### 1.3. Extract Regulations

In [288]:
import lexnlp.extract.en.regulations

In [289]:
# reg_list holds one list per PDF in the folder; each inner list contains the
# regulation codes found in that PDF.
reg_list = []
for text in court_text:
    text_regulation = lexnlp.extract.en.regulations.get_regulations(text,
                                                                    return_source=False,
                                                                    as_dict=True)

    # Keep only the code of every regulation found in the file.
    reg_codes = (reg['regulation_code'] for reg in text_regulation)

    # Remove duplicated regulations once per file (instead of after every
    # single append); dict.fromkeys keeps the order of first appearance.
    reg_in_pdf = list(dict.fromkeys(reg_codes))

    reg_list.append(reg_in_pdf)

In [291]:
# One row per PDF: its name plus the lists of citations, regulations and
# patents extracted from it. The four lists are index-aligned.
reference_columns = {
    'file_name': pdf_name,
    'citations': cite_list,
    'regulations': reg_list,
    'patents': patent_list,
}
reference_extract = pd.DataFrame(reference_columns)

In [292]:
reference_extract.head()

Unnamed: 0,file_name,citations,regulations,patents
0,001 - Intellectual Ventures I LLC v Motorola M...,"[870 F.3d 1320, 176 F.Supp.3d 405, 72 F.Supp.3...","[28 USC § 1292, 35 USC § 112, 35 USC § 271, 35...","[5,379,340, 5,436,857, 5,553,145, 5,798,733, 7..."
1,051 - Ohio Willow Wood Co v Alps South LLC,"[813 F.3d 1350, 2012 WL 2196083, 2012 WL 32834...",[37 CFR § 1],"[2,196,083, 3,283,437, 3,309,635, 4,775,374, 5..."
2,001 - In re Jobdiva Inc,"[843 F.3d 936, 2015 WL 2170162, 2015 WL 354284...","[37 CFR § 2, 15 USC § 1125, 28 USC § 1295, 15 ...","[2,170,162, 3,542,849, 9,205,082, 2,851,917, 3..."
3,058 - Ericsson Inc v D-Link Systems Inc,"[773 F.3d 1201, 2013 WL 4046225, 1 packet out ...","[28 USC § 1295, 35 USC § 271, 35 USC § 284, 35...","[4,046,225, 6,424,625, 6,466,568, 6,772,215, 2..."
4,035 - Info-Hold Inc v Applied Media Technologi...,"[783 F.3d 1262, 2013 WL 1787007, 2012 WL 39303...",[28 USC § 1295],"[1,787,007, 5,991,374, 6,741,683, 3,930,376, 1..."


In [293]:
# Save the table to disk.
# NOTE(review): sep='\t' makes this a tab-separated file although the name ends
# in ".csv" - whoever reads it back has to pass the same separator.
reference_extract.to_csv("Apr11_extract.csv", sep='\t', encoding='utf-8')

## 2. Position of the References

In [294]:
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

In [296]:
# Open the first PDF of the folder in binary mode.
# NOTE(review): this handle is never closed in the cells shown here.
document = open(case_list[0], 'rb')
# Create the resource manager shared by the device and the interpreter.
rsrcmgr = PDFResourceManager()
# Set parameters for layout analysis (default settings).
laparams = LAParams()
# Create a PDF page aggregator object; its get_result() is read after every
# processed page in the next cell.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# The interpreter processes the pages and passes its output to the device.
interpreter = PDFPageInterpreter(rsrcmgr, device)

In [297]:
# Collect the text of every horizontal text box of the document, page by page.
layout_text = []
for page in PDFPage.get_pages(document):
    interpreter.process_page(page)
    # device.get_result() returns the layout (LTPage object) of the page just processed.
    layout_text.extend(
        box.get_text()
        for box in device.get_result()
        if isinstance(box, LTTextBoxHorizontal)
    )

In [298]:
len(layout_text)

266

In [311]:
print(layout_text[265])

17



In [334]:
# Map the relative position of every text box (its index divided by the number
# of text boxes) to the patent numbers found in that box. Numbers repeated
# inside one box are kept as found.
total_layers = len(layout_text)
patent_position = {}
for layer, layer_text in enumerate(layout_text):
    patent_position[layer / total_layers] = list(get_patent(layer_text))

In [335]:
# Keep only the positions whose text box contains at least one patent number.
patent_position = {
    position: patents
    for position, patents in patent_position.items()
    if patents
}

In [336]:
patent_position

{0.22556390977443608: ['5,379,340',
  '5,436,857',
  '5,553,145',
  '5,798,733',
  '7,549,007'],
 0.23684210526315788: ['7,810,144'],
 0.2518796992481203: ['7,120,462'],
 0.30451127819548873: ['7,810,144', '7,120,462'],
 0.5413533834586466: ['5,379,340', '5,553,145'],
 0.706766917293233: ['5,436,857', '7,549,007'],
 0.9661654135338346: ['5,798,733']}

In [341]:
def get_patent_position(file):
    """
    Locate the patent numbers of a PDF by text box.

    The horizontal text boxes of all pages are collected in reading order of
    the layout analysis. Box number ``i`` out of ``n`` boxes gets the relative
    position ``i / n``.

    :param file: path of the PDF file
    :return: dict mapping the relative position of a text box (float, 0 <= x < 1)
        to the list of patent numbers found in that box; boxes without a
        patent number are left out
    """
    # Create resource manager
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    layout_text = []
    try:
        # The with-block closes the PDF even if a page fails to parse.
        with open(file, 'rb') as document:
            for page in PDFPage.get_pages(document):
                interpreter.process_page(page)
                # receive the LTPage object for the page.
                layout = device.get_result()
                layout_text.extend(
                    element.get_text()
                    for element in layout
                    if isinstance(element, LTTextBoxHorizontal)
                )
    finally:
        # Release the device as read_pdf does.
        device.close()

    total_layers = len(layout_text)
    patent_position = {}
    for layer, layer_text in enumerate(layout_text):
        # Patent numbers of this text box, repeats included.
        text_patent = list(get_patent(layer_text))
        if text_patent:
            patent_position[layer / total_layers] = text_patent

    return patent_position
            

        

In [342]:
# One position -> patents dict per PDF, index-aligned with case_list.
patent_position_list = [get_patent_position(file) for file in case_list]

In [343]:
len(patent_position_list)

9

In [344]:
patent_position_list[0]

{0.22556390977443608: ['5,379,340',
  '5,436,857',
  '5,553,145',
  '5,798,733',
  '7,549,007'],
 0.23684210526315788: ['7,810,144'],
 0.2518796992481203: ['7,120,462'],
 0.30451127819548873: ['7,810,144', '7,120,462'],
 0.5413533834586466: ['5,379,340', '5,553,145'],
 0.706766917293233: ['5,436,857', '7,549,007'],
 0.9661654135338346: ['5,798,733']}

In [346]:
pdf_name[0]

'001 - Intellectual Ventures I LLC v Motorola Mobility LLC'

In [347]:
patent_position_list[1]

{0.06878306878306878: ['2,196,083', '3,283,437', '3,309,635', '4,775,374'],
 0.3439153439153439: ['5,830,237'],
 0.38095238095238093: ['6,964,688', '7,291,182', '8,523,951'],
 0.455026455026455: ['5,830,237'],
 0.6666666666666666: ['6,964,688', '7,291,182', '8,523,951'],
 0.8941798941798942: ['6,964,688', '7,291,182']}

In [348]:
case_list[0]

'./case_test/001 - Intellectual Ventures I LLC v Motorola Mobility LLC.pdf'