In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
import glob
import simplejson as json
import re

In [2]:
def read_pdf(resume_file): 
    resource_mgr = PDFResourceManager()
    retstr = BytesIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(resource_mgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(resource_mgr, device)
    fp = open(resume_file, 'rb')
    maxpages = 0
    caching = True
    pagenos = set()
    
    for page in PDFPage.get_pages(fp, pagenos, 
                                  maxpages=maxpages, 
                                  caching=caching, 
                                  check_extractable=True):
        interpreter.process_page(page)

    result_str = retstr.getvalue()
    
    fp.close()
    device.close()
    retstr.close()
    
    return result_str

In [3]:
case_list = glob.glob('./case_test/*.pdf')

In [4]:
case_list

['./case_test/001 - Intellectual Ventures I LLC v Motorola Mobility LLC.pdf',
 './case_test/001 - In re Jobdiva Inc.pdf',
 './case_test/03 - Ashley Furniture Industries Inc v US.pdf']

In [5]:
key = []
value = []
for case in case_list:
    key.append(case.replace('./case_test/', '').replace('.pdf', ''))
    value.append(read_pdf(case))

In [6]:
key

['001 - Intellectual Ventures I LLC v Motorola Mobility LLC',
 '001 - In re Jobdiva Inc',
 '03 - Ashley Furniture Industries Inc v US']

# Extract Citation

In [None]:
import lexnlp.extract.en.citations
import pandas as pd

In [None]:
for case_text in value:
    text = case_text.decode("utf-8") 
    cite_dict = list(lexnlp.extract.en.citations.get_citations(text, 
                                                              return_source=False,
                                                              as_dict=True))
    for cite in cite_dict:
        del cite['page2']
        del cite['court']
    text_citation_df = pd.DataFrame.from_dict(cite_dict, dtype='str')
    
    

In [7]:
text = value[0].decode("utf-8") 

In [16]:
import lexnlp.extract.en.citations

text_citation =list(lexnlp.extract.en.citations.get_citations(text, 
                                                              return_source=False,
                                                              as_dict=True))

In [21]:
for cite in text_citation:
    del cite['page2']
    del cite['court']

{'volume': 870,
 'reporter': 'F.3d',
 'reporter_full_name': 'Federal Reporter',
 'page': 1320,
 'year': 2017}

In [48]:
import pandas as pd

In [61]:
text_citation_df = pd.DataFrame.from_dict(text_citation, dtype='str')

In [68]:
text_citation_df = text_citation_df.assign(cite_str = text_citation_df["volume"]+" "+
                                          text_citation_df["reporter"]+ " "+
                                          text_citation_df["page"]+ " ")

In [73]:
cite_str_text0 = list(text_citation_df["volume"]+" "+
                                          text_citation_df["reporter"]+ " "+
                                          text_citation_df["page"])

In [75]:
text_citation_df 

Unnamed: 0,page,reporter,reporter_full_name,volume,year,cite_str
0,1320,F.3d,Federal Reporter,870,2017.0,870 F.3d 1320
1,1320,F.3d,Federal Reporter,870,2017.0,870 F.3d 1320
2,1320,F.3d,Federal Reporter,870,2017.0,870 F.3d 1320
3,1197,F.3d,Federal Reporter,626,2010.0,626 F.3d 1197
4,447,F.3d,Federal Reporter,582,2009.0,582 F.3d 447
5,1336,F.3d,Federal Reporter,598,2010.0,598 F.3d 1336
6,1345,F.3d,Federal Reporter,659,,659 F.3d 1345
7,1320,F.3d,Federal Reporter,870,2016.0,870 F.3d 1320
8,1303,F.3d,Federal Reporter,415,2005.0,415 F.3d 1303
9,1340,F.3d,Federal Reporter,157,1998.0,157 F.3d 1340


In [22]:
text_citation_list = list(text_citation)

In [30]:
text_citation_list =list(lexnlp.extract.en.citations.get_citations(text, return_source=False,as_dict=False))

In [21]:
import lexnlp.extract.en.dates



In [22]:
print(list(lexnlp.extract.en.dates.get_dates(text)))

[datetime.date(2017, 9, 13), datetime.date(2010, 10, 5), datetime.date(1997, 11, 13), datetime.date(2012, 1, 13), datetime.date(2015, 3, 24), datetime.date(2019, 12, 1), datetime.date(2017, 3, 7)]


In [36]:
import lexnlp.extract.en.courts

In [40]:
import lexnlp.extract.en.dict_entities

In [30]:
import pandas

In [31]:
court_df = pandas.read_csv("https://raw.githubusercontent.com/LexPredict/lexpredict-legal-dictionary/1.0.5/en/legal/us_courts.csv")

In [43]:
court_config_data = []
for _, row in court_df.iterrows():
    c = lexnlp.extract.en.dict_entities.entity_config(row["Court ID"], row["Court Name"], 0, row["Alias"].split(";") if not pandas.isnull(row["Alias"]) else [])
    court_config_data.append(c)


In [44]:
for entity, alias in lexnlp.extract.en.courts.get_courts(text, court_config_data):
    print("entity=", entity)
    print("alias=", alias)

entity= (14, 'Federal Circuit', 0, [('Federal Circuit', None, False, None), ('Fed. Cir.', None, False, None)])
alias= ('Federal Circuit', None, False, None)
entity= (117, 'United States Court of Appeals for the Federal Circuit', 0, [('United States Court of Appeals for the Federal Circuit', None, False, None), ('Federal Circuit', None, False, None), (' Fed. Cir.', None, False, None), ('C.A.F.C.', None, False, None), (' CAFC', None, False, None)])
alias= ('Federal Circuit', None, False, None)
entity= (29, 'District of Delaware', 0, [('District of Delaware', None, False, None), ('D. Del.', None, False, None)])
alias= ('District of Delaware', None, False, None)
entity= (120, 'District of Delaware', 0, [('District of Delaware', None, False, None), ('Bankr. D. Del.', None, False, None)])
alias= ('District of Delaware', None, False, None)
entity= (29, 'District of Delaware', 0, [('District of Delaware', None, False, None), ('D. Del.', None, False, None)])
alias= ('District of Delaware', None