# Single PDF Extraction Pipeline

## Preamble

Import the packages needed to read PDF files.

In [1]:
# glob lists every file name in a folder, for the convenience of reading PDF files
import glob
# re (regular expression) finds strings that follow certain patterns
import re
# io (input and output); BytesIO wraps a bytes object in a file-like interface
from io import BytesIO
# Path splits a file path into folder, file name and extension
from pathlib import Path

In [2]:
# pdfminer to parse PDF file
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter

In [3]:
# obtain a list of all PDF files
case_list = glob.glob('./Cases/*.pdf')

In [66]:
# set the PDF file you want to read
file=case_list[60]

In [67]:
file

'./Cases/004 - Preston v Marathon Oil Co.pdf'

Configure the pdfminer package.

## 1. Get text from the PDF file

In [68]:
def get_layout_text(file):
    """Extract the text of every horizontal text box of a PDF file.

    Parameters
    ----------
    file : str
        Path of the PDF file to read.

    Returns
    -------
    layout_text : list of str
        Text of each ``LTTextBoxHorizontal`` element, in the order in which
        the elements are produced page by page.
    raw_text : str
        All entries of ``layout_text`` concatenated into one string.
    """
    # Create resource manager
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # each layer in a list
    layout_text = []
    try:
        # The context manager closes the file even when a page fails to parse.
        with open(file, 'rb') as document:
            for page in PDFPage.get_pages(document):
                interpreter.process_page(page)
                # receive the LTPage object for the page.
                layout = device.get_result()
                for element in layout:
                    if isinstance(element, LTTextBoxHorizontal):
                        layout_text.append(element.get_text())
    finally:
        device.close()

    # all the text in a string; joining once avoids repeated string copies
    raw_text = ''.join(layout_text)

    return layout_text, raw_text

In [69]:
layout_text, raw_text = get_layout_text(file)

In [20]:
def print_layout_text(layout_text):
    """Print each entry of ``layout_text`` below a numbered banner line."""
    for layer_no, layer in enumerate(layout_text):
        banner = '{} {} {}'.format('------------- Layer', layer_no, " ---------------")
        print(banner)
        print(layer)

In [71]:
print_layout_text(layout_text[0:10])

------------- Layer 0  ---------------
Preston v. Marathon Oil Co., 684 F.3d 1276 (2012)
34 IER Cases 11, 103 U.S.P.Q.2d 1353

------------- Layer 1  ---------------
684 F.3d 1276

------------- Layer 2  ---------------
United States Court of Appeals,

------------- Layer 3  ---------------
Federal Circuit.

------------- Layer 4  ---------------
Yale PRESTON, Plaintiff–Appellant,

------------- Layer 5  ---------------
v.

------------- Layer 6  ---------------
MARATHON OIL COMPANY and Thomas Smith, Defendants–Cross Appellants,

------------- Layer 7  ---------------
and

------------- Layer 8  ---------------
John Does 1–10, Defendants.

------------- Layer 9  ---------------
Nos. 2011–1013, 2011–1026.



## 2. Extract reference 

### 2.1 Citations

In [24]:
# import a dictionary contains court information
from reporters_db import EDITIONS
# Add, for every existing key, an entry whose key and value have all blanks
# removed. A snapshot of the keys is iterated because the dictionary grows
# while the loop runs.
for key in tuple(EDITIONS):
    compact_key = key.replace(" ", "")
    EDITIONS[compact_key] = EDITIONS[key].replace(" ", "")

In [25]:
# Verbose regular expression for a citation written as
# "<number> <reporter> <number>" (presumably volume and page -- confirm):
#   * the non-capturing prefix requires whitespace, ',', ':', '(' or the
#     start of a line right before the citation;
#   * group 1 is the whole citation, group 2 the leading number, group 3 the
#     reporter (any key of EDITIONS, regex-escaped) and group 5 the trailing
#     number;
#   * group 4 consumes the whitespace and letters that sit between the
#     reporter and the trailing number.
CITATION_PTN = r"""
(?:[\s,:\(]|^)
(
(\d+)\s+
({reporters})(\s|[a-z])+
(\d+)
)
""".format(reporters='|'.join([re.escape(i) for i in EDITIONS]))
# IGNORECASE also makes [a-z] match capital letters; VERBOSE is what allows
# the pattern above to be laid out over several lines.
CITATION_PTN_RE = re.compile(CITATION_PTN, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)

In [26]:
def get_citations(raw_text, max_length=30):
    """Return the distinct citations that CITATION_PTN_RE finds in a text.

    Parameters
    ----------
    raw_text : str
        Text to search.
    max_length : int, optional
        Longest cleaned citation (in characters) that is kept; longer
        matches are discarded. Defaults to 30.

    Returns
    -------
    list of str
        Cleaned citations in order of first appearance, without duplicates.
    """
    citations = []
    for cite in CITATION_PTN_RE.findall(raw_text):
        # cite[0] is the outermost group, i.e. the complete citation.
        # Collapse every run of whitespace (line breaks included) to a single
        # blank first, so the same citation wrapped over two lines compares
        # equal to the unwrapped one.
        the_cite = " ".join(cite[0].split())
        # Drop the stand-alone word "at"; the word boundary keeps names that
        # merely end in "at" intact.
        the_cite = re.sub(r"\bat ", "", the_cite)
        if len(the_cite) <= max_length:
            citations.append(the_cite)
    # remove duplicated citations while keeping the original order
    return list(dict.fromkeys(citations))

In [27]:
citations = get_citations(raw_text)

In [28]:
citations[0:5]

['813 F.3d 1368',
 '2012 WL 1188903',
 '2012 WL 2838735',
 '2013 WL 6491070',
 '135 S.Ct. 831']

### 2.2 Regulations

In [29]:
import lexnlp.extract.en.regulations

In [30]:
def get_regulations(raw_text):
    """Return the distinct regulation codes that lexnlp finds in ``raw_text``.

    The codes keep the order in which they are first reported.
    """
    found = list(lexnlp.extract.en.regulations.get_regulations(
        raw_text, return_source=False, as_dict=True))

    # keep only the code of every regulation reported for this text
    codes = [entry['regulation_code'] for entry in found]

    # dictionary keys are unique and remember insertion order
    return list(dict.fromkeys(codes))

In [31]:
regulations = get_regulations(raw_text)

In [32]:
regulations

['28 USC § 1295']

### 2.3 US patent

In [117]:
from typing import Generator

In [121]:
import regex as re

In [143]:
# Seven-digit number written as d,ddd,ddd (both commas optional). The two
# look-arounds stop the pattern from matching seven digits cut out of a
# longer run of digits.
US_PATENT = r"""
(?<![0-9])
(?P<block1>[0-9]{1})[\,]?(?P<block2>[0-9]{3})[\,]?(?P<block3>[0-9]{3})
(?![0-9])
"""
RE_PATENT = re.compile(US_PATENT, re.IGNORECASE | re.UNICODE | re.DOTALL | re.VERBOSE)

In [145]:
def us_patent(text, return_sources=False) -> Generator:
    """Yield every number that RE_PATENT matches in ``text``.

    Parameters
    ----------
    text : str
        Text to search.
    return_sources : bool, optional
        When true, yield ``(patent, matched_text)`` tuples instead of the
        patent string alone.

    Yields
    ------
    str or tuple of (str, str)
        The number formatted as ``"d,ddd,ddd"`` regardless of how it was
        written in ``text``, optionally paired with the exact matched text.
    """
    # Iterate through all potential matches
    for match in RE_PATENT.finditer(text):
        # Named groups are read with match.group(), which both the standard
        # ``re`` module and the ``regex`` package provide.
        patent = "{block1},{block2},{block3}".format(block1=match.group("block1"),
                                                     block2=match.group("block2"),
                                                     block3=match.group("block3"),
                                                     )

        if return_sources:
            yield patent, match.group()
        else:
            yield patent

In [146]:
def get_patent(text):
    """Return the distinct patent numbers of ``text`` in order of first appearance."""
    # dictionary keys drop repeated patents but keep their order
    unique_patents = dict.fromkeys(us_patent(text))
    return list(unique_patents)

In [186]:
from fractions import Fraction

In [197]:
def get_position_patent(text_layer):
    """Map every layer that contains patent numbers to those numbers.

    Keys are ``(layer index, total number of layers)`` tuples; values are the
    lists returned by ``get_patent`` for that layer. Layers without any
    patent number are left out.
    """
    total_layers = len(text_layer)
    # extract the patent numbers of each layer, paired with the layer index
    patents_per_layer = ((idx, get_patent(layer)) for idx, layer in enumerate(text_layer))
    return {
        (idx, total_layers): found
        for idx, found in patents_per_layer
        if found != []
    }
    

In [202]:
get_position_patent(pdf_layers[3])

{(38, 104): ['5,737,054', '6,012,811', '6,092,896'],
 (53, 104): ['5,737,054', '6,012,811', '6,092,896'],
 (93, 104): ['6,634,227']}

In [256]:
# Patent numbers of every layer of the fourth document, keyed by
# (layer index, number of layers); layers without patents map to [].
layer_total = len(pdf_layers[3])
patent_position = {
    (idx, layer_total): get_patent(layer)
    for idx, layer in enumerate(pdf_layers[3])
}

In [257]:
patent_position = {i:j for i,j in patent_position.items() if j != []}

In [258]:
patent_position

{(38, 104): ['5,737,054', '6,012,811', '6,092,896'],
 (53, 104): ['5,737,054', '6,012,811', '6,092,896'],
 (93, 104): ['6,634,227']}

In [259]:
# Flatten patent_position into two parallel lists: one patent per entry of
# pat_li and, at the same index of layer_number, the key it was found under.
pat_li = []
layer_number = []
for key, patents_here in patent_position.items():
    pat_li.extend(patents_here)
    layer_number.extend([key] * len(patents_here))

In [260]:
layer_number

[(38, 104), (38, 104), (38, 104), (53, 104), (53, 104), (53, 104), (93, 104)]

In [304]:
def array_position_patent(text_layer):
    """Return the patents of a document and their positions as parallel lists.

    Parameters
    ----------
    text_layer : list of str
        The text layers of one document.

    Returns
    -------
    layer_number : list of tuple
        For every patent, the ``(layer index, number of layers)`` key of the
        layer in which it was found.
    pat_li : list of str
        The patents themselves, in the same order as ``layer_number``.
    """
    # The per-layer extraction is shared with get_position_patent instead of
    # being repeated here.
    patent_position = get_position_patent(text_layer)

    layer_number = []
    pat_li = []
    for position, patents in patent_position.items():
        pat_li.extend(patents)
        # repeat the position once per patent so both lists stay aligned
        layer_number.extend([position] * len(patents))
    return layer_number, pat_li

In [307]:
array_position_patent(pdf_layers[3])[1]

['5,737,054',
 '6,012,811',
 '6,092,896',
 '5,737,054',
 '6,012,811',
 '6,092,896',
 '6,634,227']

### 2.4 date

In [33]:
import calendar

In [34]:
month_list = list(calendar.month_abbr[1:])+['Sept']+list(calendar.month_name[1:])

In [35]:
# Verbose regular expression for a date written as "<month> <digits>, <digits>":
#   * the non-capturing prefix requires whitespace, ',', ':', '(' or the
#     start of a line right before the date;
#   * group 1 is the whole date, group 2 the month (any entry of month_list),
#     group 3 optional dots after the month, group 4 the first number,
#     group 5 the comma and group 6 the second number.
DATE_PTN = r"""
(?:[\s,:\(]|^)
(
({month})(\.*)\s*
(\d+)(,)\s*
(\d+)
)
""".format(month='|'.join([re.escape(i) for i in month_list]))
# IGNORECASE lets the month names match in any capitalisation; VERBOSE is
# what allows the pattern above to be laid out over several lines.
DATE_PTN_RE = re.compile(DATE_PTN, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)

In [36]:
def get_date(raw_text):
    """Return the first date found in ``raw_text`` as ``"<month> <day> <year>"``.

    Parameters
    ----------
    raw_text : str
        Text to search.

    Returns
    -------
    str
        Month, first number and second number of the first match of
        DATE_PTN_RE, separated by single blanks (for example
        ``'Feb 22 2016'``).

    Raises
    ------
    IndexError
        If ``raw_text`` contains no date.
    """
    # Only the first date is needed, so stop at the first match.
    match = DATE_PTN_RE.search(raw_text)
    if match is None:
        raise IndexError("no date found in text")
    # Groups 2, 4 and 6 of DATE_PTN hold the month and the two numbers.
    # Assembling the result from them drops dots, commas and line breaks
    # without ever gluing two parts together.
    month, day, year = match.group(2, 4, 6)
    return "{} {} {}".format(month, day, year)

In [37]:
get_date(raw_text)

'Feb 22 2016'

## 3. Write to `json` and `csv` file

In [38]:
# obtain a list of all PDF files
case_list = glob.glob('./Cases/*.pdf')

In [72]:
# a list to store the name of each pdf (file name without folder or extension)
pdf_name = []
# a list to store the text layers of each pdf
pdf_layers = []
# a list to store the full text of each pdf
pdf_text = []

# a loop to read all the pdf and store their name and content to the respective list
# NOTE: this rebinds the notebook-level names layout_text and raw_text.
for case in case_list:
    # Path.stem removes the folder and only the final extension, whatever
    # path separator the operating system uses.
    pdf_name.append(Path(case).stem)
    layout_text, raw_text = get_layout_text(case)
    pdf_layers.append(layout_text)
    pdf_text.append(raw_text)

In [91]:
import json

#### Raw text and layers to `json` file

In [96]:
# name of the output file, e.g. 'pdf2text159.json' for 159 documents
PDF2TEXT = '{}{}{}'.format('pdf2text', len(pdf_name), '.json')
# write text to a json file; the context manager flushes and closes the file
# so that the data is completely on disk when the cell finishes
with open(PDF2TEXT, 'w') as json_file:
    json.dump({'file_name': pdf_name,
               'layouts': pdf_layers,
               'raw_text': pdf_text},
              json_file)

#### Count frequency of citations

In [45]:
import pandas as pd

In [79]:
# citations of every document, keyed by the document name
cite_dict = {
    name: get_citations(pdf_text[idx])
    for idx, name in enumerate(pdf_name)
}


In [80]:
len(cite_dict)

159

In [81]:
cite_df = pd.DataFrame.from_dict(cite_dict, orient='index')
cite_df['file']= cite_df.index

In [100]:
# Reshape cite_df into long format: melt() puts every non-'file' column into
# a 'variable'/'value' pair, the 'variable' column is dropped, rows without a
# value are removed and 'value' is renamed.
# NOTE: the new column is spelled "citaion"; the cells below that group by
# this column and rename it to "ref" use exactly the same spelling.
citation_df= pd.melt(cite_df,id_vars=['file']). \
            drop(['variable'], axis=1).dropna(). \
            rename(index=str, columns={"value": "citaion"})
            
# ciation_df['file'] = ciation_df['file'].str.replace('./case_test/', '')                       

In [103]:
citation_df.groupby(['citaion']).count().nlargest(5, 'file').\
rename(index=str, columns={"file": "Times being cited"})

Unnamed: 0_level_0,Times being cited
citaion,Unnamed: 1_level_1
415 F.3d 1303,33
127 S.Ct. 1727,20
15 L.Ed.2d 545,20
167 L.Ed.2d 705,20
383 U.S. 1,20


In [None]:
# write dataframe to csv file
citation_df.to_csv("citation159.csv", sep='\t', encoding='utf-8')

#### Count Frequency of regulations

In [84]:
# regulations of every document, keyed by the document name
reg_dict = {
    name: get_regulations(pdf_text[idx])
    for idx, name in enumerate(pdf_name)
}


In [86]:
reg_dict_dropna = {i:j for i,j in reg_dict.items() if j != []}

In [87]:
reg_df = pd.DataFrame.from_dict(reg_dict_dropna, orient='index')
reg_df['file'] = reg_df.index

In [88]:
regulation_df= pd.melt(reg_df,id_vars=['file']). \
     drop(['variable'], axis=1).dropna(). \
     rename(index=str, columns={"value": "regulation"})
     
# regulation_df['file'] = regulation_df['file'].str.replace('./case_test/', '')                       

In [89]:
regulation_df.groupby(['regulation']).count().nlargest(5, 'file').\
rename(index=str, columns={"file": "Times being cited"})

Unnamed: 0_level_0,Times being cited
regulation,Unnamed: 1_level_1
28 USC § 1295,119
35 USC § 103,30
35 USC § 112,26
35 USC § 271,20
125 Stat. 284,17


In [107]:
# write dataframe to csv file
regulation_df.to_csv("regulation159.csv", sep='\t', encoding='utf-8')

#### Calculate all the references together

In [108]:
citation_df.rename(index=str, columns={"citaion":"ref"},inplace=True)
regulation_df.rename(index=str, columns={"regulation":"ref"},inplace=True)

In [109]:
ref_df = pd.concat([citation_df,regulation_df])

In [112]:
ref_df.groupby(['ref']).count().nlargest(10, 'file').\
rename(index=str, columns={"file": "Times being cited"})

Unnamed: 0_level_0,Times being cited
ref,Unnamed: 1_level_1
28 USC § 1295,119
415 F.3d 1303,33
35 USC § 103,30
35 USC § 112,26
127 S.Ct. 1727,20
15 L.Ed.2d 545,20
167 L.Ed.2d 705,20
35 USC § 271,20
383 U.S. 1,20
550 U.S. 398,20


#### Write references to `csv` file

In [113]:
ref_df.to_csv("reference159.csv", sep='\t', encoding='utf-8')

#### Count patent frequency

In [203]:
# distinct patents of every document, keyed by the document name
patent_dict = {
    name: get_patent(pdf_text[idx])
    for idx, name in enumerate(pdf_name)
}

In [204]:
pat_df = pd.DataFrame.from_dict(patent_dict, orient='index')
pat_df['file'] = pat_df.index

In [205]:
patent_df= pd.melt(pat_df,id_vars=['file']). \
     drop(['variable'], axis=1).dropna(). \
     rename(index=str, columns={"value": "patent"})   

In [208]:
patent_df.groupby(['patent']).count().nlargest(5, 'file').\
rename(index=str, columns={"file": "Times being cited"})

Unnamed: 0_level_0,Times being cited
patent,Unnamed: 1_level_1
1920033,2
2000000,2
3692319,2
5860973,2
6553350,2


In [310]:
# layer positions (first element returned by array_position_patent) of every
# document, keyed by the document name
position_dict = {
    name: array_position_patent(pdf_layers[idx])[0]
    for idx, name in enumerate(pdf_name)
}

In [338]:
position_dict[pdf_name[0]][0:6]

[(11, 126), (11, 126), (11, 126), (40, 126), (40, 126), (40, 126)]

In [339]:
patent_dict[pdf_name[0]][0:6]

['1,188,903', '2,838,735', '6,491,070', '5,131,153', '5,261,009', '5,381,489']

In [318]:
# patents per layer (second element returned by array_position_patent) of
# every document, keyed by the document name; this replaces the earlier
# patent_dict that was built from the full text
patent_dict = {
    name: array_position_patent(pdf_layers[idx])[1]
    for idx, name in enumerate(pdf_name)
}

In [316]:
position_dict_dropna = {i:j for i,j in position_dict.items() if j != []}

In [320]:
patent_dict_dropna = {i:j for i,j in patent_dict.items() if j != []}

In [321]:
len(patent_dict_dropna )

139

In [322]:
pos_df = pd.DataFrame.from_dict(position_dict, orient='index')
pos_df['file'] = pos_df.index

In [328]:
position_df= pd.melt(pos_df,id_vars=['file']). \
     drop(['variable'], axis=1).dropna(). \
     rename(index=str, columns={"value": "position"})   

In [345]:
position_df.loc[position_df.file == pdf_name[0]][0:6]

Unnamed: 0,file,position
0,052 - Nuance Communications Inc v ABBYY USA So...,"(11, 126)"
159,052 - Nuance Communications Inc v ABBYY USA So...,"(11, 126)"
318,052 - Nuance Communications Inc v ABBYY USA So...,"(11, 126)"
477,052 - Nuance Communications Inc v ABBYY USA So...,"(40, 126)"
636,052 - Nuance Communications Inc v ABBYY USA So...,"(40, 126)"
795,052 - Nuance Communications Inc v ABBYY USA So...,"(40, 126)"


In [331]:
pat_df = pd.DataFrame.from_dict(patent_dict, orient='index')
pat_df['file'] = pat_df.index
patent_df= pd.melt(pat_df,id_vars=['file']). \
     drop(['variable'], axis=1).dropna(). \
     rename(index=str, columns={"value": "patent"})   

In [344]:
patent_df.loc[patent_df.file == pdf_name[0]][0:6]

Unnamed: 0,file,patent
0,052 - Nuance Communications Inc v ABBYY USA So...,1188903
159,052 - Nuance Communications Inc v ABBYY USA So...,2838735
318,052 - Nuance Communications Inc v ABBYY USA So...,6491070
477,052 - Nuance Communications Inc v ABBYY USA So...,5131153
636,052 - Nuance Communications Inc v ABBYY USA So...,5261009
795,052 - Nuance Communications Inc v ABBYY USA So...,5381489


In [352]:
# Attach the position of every patent. assign() aligns the two frames on
# their index labels, not on row order, so this relies on position_df and
# patent_df carrying the same index -- presumably true because both are
# melted from frames built over the same pdf_name order; TODO confirm.
patent_df =patent_df.assign(position = position_df['position'])

In [353]:
patent_df.head()

Unnamed: 0,file,patent,position
0,052 - Nuance Communications Inc v ABBYY USA So...,1188903,"(11, 126)"
1,021 - Intellectual Ventures I LLC v Erie Indem...,6236983,"(134, 296)"
3,021 - Aspex Eyewear Inc v Zenni Optical Inc,5737054,"(38, 104)"
4,056 - Soverain Software LLC v Newegg Inc,5715314,"(23, 83)"
5,100 - In re Geller,2365001,"(8, 109)"


#### write extracted dates to `csv` file

In [74]:
# one entry per document: the extracted date, or None when no date was found
date_list = []
for text in pdf_text:
    try:
        date_list.append(get_date(text))
    except IndexError:
        # get_date raises IndexError when the text contains no date; any
        # other error is a real problem and must not be hidden
        date_list.append(None)


In [90]:
# print the index of every pdf for which no date could be extracted
for idx, extracted in enumerate(date_list):
    if extracted is None:
        print(idx)

In [114]:
metadata = pd.DataFrame(
    {'file_name': pdf_name,
     'dates':date_list
    })

In [1]:
metadata.to_csv("metadata159.csv", sep='\t', encoding='utf-8')

NameError: name 'metadata' is not defined