# Create Edge and Node Lists

1. Extract Court
2. Create an edge list of the citation network
3. Create a node list containing the case code, name, and court

### Read Case text data

In [1]:
import json

# Load the case corpus. The context manager closes the file handle as soon as
# parsing finishes instead of leaving it open until garbage collection.
with open('full_name_text.json', 'r') as json_file:
    data = json.load(json_file)
# 'file_name' and 'raw_text' are used as parallel sequences below
# (entry i of each belongs to the same case).
file_name = data['file_name']
raw_text = data['raw_text']

In [2]:
# Number of case file names loaded from the JSON file.
len(file_name)

854

In [3]:
# Number of case texts loaded; should equal the number of file names.
len(raw_text)

854

## 1. Extract Court

In [4]:
from lexnlp.extract.en.dict_entities import get_entity_name, entity_config
import lexnlp.extract.en.courts
import pandas

In [5]:
# US court dictionary (LexPredict legal dictionary, tag 1.0.5), read straight from GitHub.
US_COURTS_CSV_URL = "https://raw.githubusercontent.com/LexPredict/lexpredict-legal-dictionary/1.0.5/en/legal/us_courts.csv"
court_df = pandas.read_csv(US_COURTS_CSV_URL)

In [7]:
# Build one lexnlp entity config per row of the court dictionary.
# The arguments are passed positionally as (Court ID, Court Name, Court Type).
court_config_data = [
    entity_config(court_row["Court ID"], court_row["Court Name"], court_row["Court Type"])
    for _, court_row in court_df.iterrows()
]


In [8]:
def get_court(text):
    """Return the second court that lexnlp finds in ``text``.

    Every match produced by ``lexnlp.extract.en.courts.get_courts`` is reduced
    to ``entity[0][1:3]``. Given how ``court_config_data`` is built, that slice
    is presumably the ``(court name, court type)`` pair -- confirm against
    lexnlp's entity layout.

    Parameters
    ----------
    text : str
        Raw text of one case.

    Returns
    -------
    tuple
        The slice for the second match, or ``(None, None)`` when fewer than
        two courts are found.
    """
    found = lexnlp.extract.en.courts.get_courts(text, court_config_data)
    court_in_pdf = [entity[0][1:3] for entity in found]
    # Explicit length check instead of a bare `except:`, which would also
    # swallow unrelated errors such as KeyboardInterrupt.
    if len(court_in_pdf) < 2:
        return (None, None)
    return court_in_pdf[1]

In [9]:
# Extract the (court, court type) pair for every case text.
# get_court runs the full court extraction on the document, so call it once
# per text and unpack the pair rather than running the extraction twice.
court_list = []
court_type = []
for raw in raw_text:
    court, ctype = get_court(raw)
    court_list.append(court)
    court_type.append(ctype)


In [10]:
# One extracted court entry (possibly None) per case text.
len(court_list)

854

In [11]:
# One extracted court-type entry (possibly None) per case text.
len(court_type)

854

In [12]:
# Positions of the cases for which no court could be extracted.
# `is None` is the identity test for None; `== None` can be hijacked by a
# custom __eq__. court_list has one entry per case, so enumerating it yields
# the same positions as indexing by range(len(file_name)).
court_na = [idx for idx, court_name in enumerate(court_list) if court_name is None]

In [18]:
# Number of cases without an extracted court.
len(court_na)

27

## 2. Create an edge list of the citation network

In [26]:
import re
# Import the reporter-edition lookup table from reporters_db.
# NOTE(review): EDITIONS presumably maps reporter edition abbreviations to
# their root reporter -- confirm against the reporters_db documentation.
from reporters_db import EDITIONS
# Register a space-free copy of every key (with a space-free value) in the
# imported EDITIONS dict. This mutates the dict in place; list(...) snapshots
# the keys first because the dict grows while we loop.
for key in list(EDITIONS.keys()):
    EDITIONS[key.replace(" ", "")]=EDITIONS[key].replace(" ", "")
    
# Regular expression describing the citation pattern.
# It is compiled with re.VERBOSE, so the line breaks inside the string are
# layout only. Structure:
#   - a non-capturing prefix: whitespace, ',', ':', '(' or start of line
#     (re.MULTILINE makes ^ match at every line start);
#   - group 1: the whole citation;
#   - group 2: the leading number;
#   - group 3: the reporter abbreviation (any EDITIONS key, regex-escaped,
#     tried in dict order);
#   - group 4: the last character of the filler between reporter and trailing
#     number (whitespace or letters; re.IGNORECASE lets [a-z] match uppercase);
#   - group 5: the trailing number.
CITATION_PTN = r"""
(?:[\s,:\(]|^)
(
(\d+)\s+
({reporters})(\s|[a-z])+
(\d+)
)
""".format(reporters='|'.join([re.escape(i) for i in EDITIONS]))
CITATION_PTN_RE = re.compile(CITATION_PTN, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)


In [27]:
def get_citations(raw_text, max_len=30):
    """Extract the distinct citations found in ``raw_text``, in order of first appearance.

    Each regex match is normalised by splitting on any whitespace (spaces,
    runs of spaces, newlines), dropping standalone ``at`` tokens (so
    "598 F.3d at 1308" becomes "598 F.3d 1308") and re-joining with single
    spaces. Normalising by token means a citation broken across lines yields
    the same string as one written on a single line, so de-duplication works.

    Parameters
    ----------
    raw_text : str
        Text to scan with ``CITATION_PTN_RE``.
    max_len : int, optional
        Normalised citations longer than this are discarded. The default of
        30 is the threshold this notebook has always used.

    Returns
    -------
    list of str
        Unique normalised citations, first occurrence first.
    """
    citations = []
    for groups in CITATION_PTN_RE.findall(raw_text):
        # groups[0] is the outermost capture group: the full citation text.
        tokens = [tok for tok in groups[0].split() if tok != "at"]
        the_cite = " ".join(tokens)
        if len(the_cite) <= max_len:
            citations.append(the_cite)
    # dict.fromkeys removes duplicates while preserving insertion order.
    return list(dict.fromkeys(citations))

In [28]:
# Re-check the number of case texts before building the citation map.
len(raw_text)

854

In [29]:
# Re-check the number of file names; it must match the number of case texts.
len(file_name)

854

In [30]:
import pandas as pd

In [31]:
# Map every case file name to the list of citations found in its text.
cite_dict = {
    name: get_citations(raw_text[pos])
    for pos, name in enumerate(file_name)
}

In [32]:
# Case number of each PDF file, in file_name order.
# The first citation found in a case's text is taken to be that case's own number.
case_number = [cite_dict[name][0] for name in file_name]

In [33]:
# Re-key the citation lists by case number instead of by file name.
cite_number_dict = {
    code: cite_dict[name]
    for code, name in zip(case_number, file_name)
}

In [34]:
# Drop each case's own citation (the first entry of its list) so that only the
# cases it cites remain.
# The mapping is rebuilt from cite_dict instead of being sliced in place, so
# re-running this cell cannot strip one more citation on every run.
cite_number_dict = {
    code: cite_dict[name][1:]
    for code, name in zip(case_number, file_name)
}

In [35]:
# Number of case numbers collected (one per file).
len(case_number)

854

In [36]:
# Number of distinct case numbers; equal to len(case_number) only if no two files share one.
len(cite_number_dict)

854

In [40]:
# One row per case number (the index); columns 0..k-1 hold the citations it
# contains, and shorter rows are padded with missing values.
cite_df = pd.DataFrame.from_dict(cite_number_dict, orient='index')

In [41]:
# Copy the index (the citing case number) into a regular column so it can
# serve as the id variable when the frame is melted.
cite_df['file']= cite_df.index

In [42]:
# Preview the wide citation table.
cite_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,431,432,433,434,435,436,437,438,439,file
725 F.3d 1356,598 F.3d 1294,427 F.3d 1375,598 F.3d 1308,605 F.3d 1330,212 F.3d 1272,699 F.3d 1340,679 F.3d 1372,699 F.3d 1348,699 F.3d 1349,713 F.2d 1530,...,,,,,,,,,,725 F.3d 1356
878 F.3d 1041,649 F.3d 1276,323 F.3d 1354,649 F.3d 1290,269 F.3d 1369,290 U.S. 240,54 S.Ct. 146,78 L.Ed. 293,322 U.S. 238,64 S.Ct. 997,88 L.Ed. 1250,...,,,,,,,,,,878 F.3d 1041
857 F.3d 1323,396 F.3d 1369,343 F.3d 1340,746 F.3d 995,637 F.3d 1344,695 F.3d 1247,281 F.3d 1261,992 F.2d 1204,476 F.2d 1357,992 F.2d 1207,2012 WL 423807,...,,,,,,,,,,857 F.3d 1323
809 F.3d 1295,2014 WL 3805817,318 F.Supp. 1116,299 F.3d 1336,773 F.3d 1201,767 F.3d 1308,111 U.S. 120,4 S.Ct. 291,28 L.Ed. 371,802 F.3d 1283,767 F.3d 1328,...,,,,,,,,,,809 F.3d 1295
709 F.3d 1124,2011 WL 1990748,2012 WL 254026,2012 WL 555092,558 F.3d 1368,641 F.3d 1352,628 F.3d 1143,960 F.2d 1020,839 F.2d 1544,960 F.2d 1042,90 F.3d 479,...,,,,,,,,,,709 F.3d 1124


In [43]:
# Reshape the wide table into one (citing case, cited case) pair per row.
melted_citations = pd.melt(cite_df, id_vars=['file'])
# The 'variable' column only records the source column position; the rows
# with a missing value come from padding of shorter citation lists.
edge_pairs = melted_citations.drop(['variable'], axis=1).dropna()
citation_df = edge_pairs.rename(
    index=str,
    columns={"file": "from", "value": "to"},
)

In [48]:
# Write the edge list as a comma-separated file without the index column.
# NOTE(review): the file name is spelled "egde"; left unchanged here because
# downstream consumers may already depend on it -- confirm before renaming.
citation_df.to_csv("cite_egde.csv", sep=',', encoding='utf-8', index=False)

## 3. Create a node list containing the case code, name, and court

In [45]:
# Assemble the node table column by column; the four lists are aligned by
# case position.
node_columns = {
    'case_code': case_number,
    'case_name': file_name,
    'court_from': court_list,
    'court_type': court_type,
}
Nodes = pd.DataFrame(node_columns)


In [46]:
# Preview the node table.
Nodes.head()

Unnamed: 0,case_code,case_name,court_from,court_type
0,725 F.3d 1356,Apple Inc v International Trade Com'n,International Trade Commission,Federal Court
1,878 F.3d 1041,Regeneron Pharmaceuticals Inc v Merus NV,Southern District of New York,Federal District Court
2,857 F.3d 1323,Joseph Phelps Vineyards LLC v Fairmont Holding...,Trademark Trial and Appeal Board,Federal Court
3,809 F.3d 1295,Commonwealth Scientific and Indus Research Org...,Eastern District of Texas,Federal District Court
4,709 F.3d 1124,Radio Systems Corp v Lalor,Western District of Washington,Federal District Court


In [47]:
# Write the node list without the index column.
# NOTE(review): this file is tab-separated despite its .csv extension, whereas
# the edge list above is comma-separated -- confirm which the consumers expect.
Nodes.to_csv("cite_node.csv", sep='\t', encoding='utf-8',index=False)