In [1]:
import bs4
import lxml
import pandas as pd
from flashtext import KeywordProcessor

In [2]:
def rule_to_dict(rule,_type):
    '''
    Flatten one parsed rule element into a plain dictionary.

    rule  -- parsed RULE / PRORULE element; it must expose PREAMB (with
             AGENCY and SUBJECT children), FRDOC and a .text attribute
    _type -- label stored under 'type' (callers pass 'rule' or 'prorule')

    Returns a dict with the keys 'type', 'agency', 'title', 'doc_id' and
    'paragraphs'. 'paragraphs' lists the newline-separated pieces of the
    rule's text that are longer than five characters.
    '''
    header = rule.PREAMB
    lines = rule.text.split('\n')
    # Drop blank lines and fragments of five characters or fewer
    long_lines = [line for line in lines if len(line) > 5]
    return dict(
        type=_type,
        agency=header.AGENCY.text,
        title=header.SUBJECT.text,
        doc_id=rule.FRDOC.text,  # Use this as an aggregation code
        paragraphs=long_lines,
    )

In [3]:

def rule_results_to_examples(rr):
    '''
    Given a rule dictionary, return a list of examples (in lighttag format).

    rr -- dict holding a 'paragraphs' list plus document-level metadata
          (every other key)

    Returns one dict per paragraph with 'content' (the paragraph text),
    'order_code' (the paragraph's position within the document) and a copy
    of every metadata entry of rr. rr itself is not modified, so the
    function can safely be called more than once on the same dictionary.

    Raises KeyError if rr has no 'paragraphs' key.
    '''
    paragraphs = rr['paragraphs']
    # Everything except the paragraphs is metadata repeated on each example;
    # building a filtered copy avoids popping the key from the caller's dict.
    metadata = {key: value for key, value in rr.items() if key != 'paragraphs'}
    # Metadata is unpacked last, so its keys win over 'content'/'order_code'
    # exactly as a trailing dict.update(metadata) would.
    return [
        {'content': paragraph, 'order_code': position, **metadata}
        for position, paragraph in enumerate(paragraphs)
    ]
        
        

In [4]:
def fr_file_to_example_list(file_path):
    '''
    Parse one XML file with a FEDREG root into a flat list of examples.

    file_path -- path of the XML file to read

    Every RULE element under FEDREG.RULES and every PRORULE element under
    FEDREG.PRORULES is converted with rule_to_dict and then expanded into
    per-paragraph examples with rule_results_to_examples. Rules come first,
    proposed rules second.

    Returns the concatenated list of example dicts.
    Errors from opening or parsing the file, or from a document lacking the
    expected sections, propagate to the caller.
    '''
    # The context manager closes the handle as soon as the document is parsed
    with open(file_path) as handle:
        soup = bs4.BeautifulSoup(handle,"lxml-xml")
    rules = soup.FEDREG.RULES.find_all('RULE')
    proposedRules = soup.FEDREG.PRORULES.find_all("PRORULE")
    results = [rule_to_dict(rule,_type='rule') for rule in rules]
    results += [rule_to_dict(rule,_type='prorule') for rule in proposedRules]
    # Flatten in a single pass; sum(lists, []) re-copies the accumulated
    # list on every addition and is quadratic in the number of examples.
    examples = []
    for result in results:
        examples.extend(rule_results_to_examples(result))
    return examples



In [5]:
import os
data =[]
base ='./data/bulk-fr/'
error_paths = []
# Walks base/yr/mnth/*.xml, where only two-character mnth directories are read.
for yr in os.listdir(base):
    yrdir = os.path.join(base,yr)
    if not os.path.isdir(yrdir):
        continue  # stray files next to the per-year folders cannot be listed
    for mnth in os.listdir(yrdir):
        if len(mnth)==2:
            pb = os.path.join(yrdir,mnth)
            if not os.path.isdir(pb):
                continue  # a two-character file name is not a month folder
            print(mnth)

            for f in os.listdir(pb):
                if f.endswith('xml'):
                    path = os.path.join(pb,f)
                    try:
                        data += fr_file_to_example_list(path)
                    except Exception:
                        # Best effort: remember files that failed to parse and
                        # keep going. Catching Exception (not a bare except)
                        # lets Ctrl-C / kernel interrupt still stop the load.
                        error_paths.append(path)
                

11
10
02
09
06
07
08
04
01
05
12
03
11
10
02
09
06
07
08
04
01
05
12
03
11
10
02
09
06
07
08
04
01
05
12
03
10
02
09
06
07
08
04
01
05
03


In [6]:
import re
# Three or more consecutive Capitalized words, each followed by one whitespace
# character except the last (e.g. "Pacific Salmon Treaty"). Written as a raw
# string so that \s reaches the regex engine instead of being treated as an
# (invalid) string escape sequence.
captializedTermReg = re.compile(r'([A-Z][a-z]+?\s){2,}([A-Z][a-z]+)')
m = captializedTermReg.match("Tal Pery Ate Big Food In")  # quick smoke test of the pattern
# KW is run on capitalized phrases only; KWG is run on the full paragraph text.
KW = KeywordProcessor(case_sensitive=False)
KWG =KeywordProcessor(case_sensitive=False)
phrease_terms = ['treaty','convention','agreement',]
other_terms =['bilateral agreement','international agreement','executive agreement',
        'multilateral agreement','law of nations','law of armed conflict','international humanitarian law',
         'international custom','state practice','opinio juris','sense of legal obligation'
        ]

KW.add_keywords_from_list(phrease_terms,)
KWG.add_keywords_from_list(other_terms)
def find_all(text):
    '''
    Collect keyword hits found in text.

    Two passes are made:
      1. every capitalized multi-word phrase matched by captializedTermReg
         is kept when KW finds a keyword in its lower-cased form; the hit
         records the phrase as written, the first keyword found and the
         phrase's span in text
      2. every keyword KWG finds anywhere in text is recorded with the span
         reported by the keyword processor

    Returns a list of dicts with the keys 'phrase', 'key', 'start', 'end'
    (capitalized-phrase hits first); the list is empty when nothing matches.
    '''
    hits = []
    for cap_match in captializedTermReg.finditer(text):
        phrase = cap_match.group(0)
        keys = KW.extract_keywords(phrase.lower())
        if not keys:
            continue
        hits.append({'phrase':phrase,'key':keys[0],'start':cap_match.start(),'end':cap_match.end()})
    hits.extend(
        {'phrase':term,'key':term,'start':start,'end':end}
        for term,start,end in KWG.extract_keywords(text,span_info=True)
    )
    return hits
    
    


    

In [7]:
# Sanity check: with span_info=True each hit is returned together with its start/end offsets
KW.extract_keywords("My treaty is here",span_info=True)

[('treaty', 3, 9)]

In [8]:
def attach_or_none(item):
    '''
    Run find_all on item['content'] and merge the item into every hit.

    item -- example dict; must contain a 'content' key

    Returns the list of hit dicts, each updated in place with all of the
    item's key/value pairs, or None when the content produced no hits.
    '''
    found = find_all(item['content'])
    if not found:
        return None
    for hit in found:
        hit.update(item)
    return found
# Flatten the per-example hit lists in a single pass, skipping examples with
# no hits (None). sum(lists, []) would copy the accumulated list on every
# addition, which is quadratic in the number of hits.
matches = [hit for hits in map(attach_or_none,data) if hits for hit in hits]
len(data),len(matches)

(2369805, 3198)

In [9]:
# Preview only the first rows instead of rendering the whole table of hits
pd.DataFrame(matches).head(10)

Unnamed: 0,agency,content,doc_id,end,key,order_code,phrase,start,title,type
0,DEPARTMENT OF COMMERCE,The Treaty between the Government of the Unite...,[FR Doc. 2017-24715 Filed 11-14-17; 8:45 am],250,treaty,16,Pacific Salmon Treaty Act,225,Fraser River Sockeye and Pink Salmon Fisheries...,rule
1,DEPARTMENT OF COMMERCE,Treaty Indian Fishery,[FR Doc. 2017-24715 Filed 11-14-17; 8:45 am],21,treaty,22,Treaty Indian Fishery,0,Fraser River Sockeye and Pink Salmon Fisheries...,rule
2,DEPARTMENT OF COMMERCE,Treaty Indian Fishery,[FR Doc. 2017-24715 Filed 11-14-17; 8:45 am],21,treaty,26,Treaty Indian Fishery,0,Fraser River Sockeye and Pink Salmon Fisheries...,rule
3,DEPARTMENT OF COMMERCE,Treaty Indian Fishery,[FR Doc. 2017-24715 Filed 11-14-17; 8:45 am],21,treaty,32,Treaty Indian Fishery,0,Fraser River Sockeye and Pink Salmon Fisheries...,rule
4,DEPARTMENT OF COMMERCE,Treaty Indian Fishery,[FR Doc. 2017-24715 Filed 11-14-17; 8:45 am],21,treaty,41,Treaty Indian Fishery,0,Fraser River Sockeye and Pink Salmon Fisheries...,rule
5,DEPARTMENT OF COMMERCE,Treaty Indian Fishery,[FR Doc. 2017-24715 Filed 11-14-17; 8:45 am],21,treaty,52,Treaty Indian Fishery,0,Fraser River Sockeye and Pink Salmon Fisheries...,rule
6,DEPARTMENT OF THE INTERIOR,The Migratory Bird Treaty Act of 1918 (Act) (1...,[FR Doc. 2017-24117 Filed 11-3-17; 8:45 am],29,treaty,22,The Migratory Bird Treaty Act,0,Migratory Bird Hunting; Approval of Corrosion-...,rule
7,DEPARTMENT OF THE INTERIOR,"Service Response: As we stated above, the Migr...",[FR Doc. 2017-24117 Filed 11-3-17; 8:45 am],67,treaty,49,Migratory Bird Treaty Act,42,Migratory Bird Hunting; Approval of Corrosion-...,rule
8,DEPARTMENT OF TRANSPORTATION,This product has been approved by the aviation...,[FR Doc. 2017-23990 Filed 11-3-17; 8:45 am],164,bilateral agreement,40,bilateral agreement,145,Airworthiness Directives; Fokker Services B.V....,prorule
9,DEPARTMENT OF TRANSPORTATION,This product has been approved by the aviation...,[FR Doc. 2017-23808 Filed 11-3-17; 8:45 am],164,bilateral agreement,38,bilateral agreement,145,Airworthiness Directives; Viking Air Limited A...,prorule


In [10]:
# Tabulate every hit, then count how many distinct paragraph texts were matched
S = pd.DataFrame(matches)
S.drop_duplicates(subset='content').shape[0]

1818

In [11]:
# Export one row per distinct paragraph text as a JSON array of records.
# NOTE(review): the target path is specific to one user's home directory; consider a configurable output location.
S.drop_duplicates('content').to_json('~/dev/lighttag/sample_data/fedreg.json',orient='records')

In [45]:
# Which search keys actually produced at least one hit
S['key'].unique()

array(['treaty', 'bilateral agreement', 'agreement', 'convention',
       'international agreement', 'multilateral agreement',
       'executive agreement', 'state practice',
       'international humanitarian law'], dtype=object)