In [1]:
# Import libraries
import re
import urllib.error
import urllib.request
from collections import Counter

import bs4
import numpy as np
import spacy

In [2]:
# Open the url using urllib.request
# The response is used as a context manager so the HTTP connection is closed
# as soon as the body has been read, instead of being left to the garbage
# collector.
with urllib.request.urlopen('https://www.ecfr.gov/cgi-bin/text-idx?SID=9ad314bba8aea555bcc632cc09a9c728&mc=true&tpl=/ecfrbrowse/Title17/17cfrv1_02.tpl#0') as response:
    data = response.read()
soup = bs4.BeautifulSoup(data, 'html.parser')
# All <a class="tpl"> anchors of the directory page, in document order.
table = soup.find_all("a", {'class': 'tpl'})

In [3]:
# get url links from directory
# Build the list of absolute page URLs from the directory anchors, keeping the
# first occurrence of each URL and preserving document order.
href_tags = []
seen_urls = set()
# NOTE(review): the start offset 4 and step 2 presumably skip leading
# navigation anchors and a second anchor per directory row -- confirm against
# the page markup.
for anchor in table[4::2]:
    href = anchor.get("href")
    if href is None:
        continue
    url = 'https://www.ecfr.gov' + href
    # Compare the absolute URL (what is actually stored); comparing the raw
    # relative href against the stored absolute URLs can never match, so
    # duplicates would slip through.
    if url not in seen_urls:
        seen_urls.add(url)
        href_tags.append(url)

In [4]:
# Retrieve all the headings and text from urls
# For every page in href_tags, collect one list of <h2> headings and one list
# of the paragraph text that follows each heading. idx_total[k] and
# text_total[k] are parallel lists for the k-th URL.
text_total = []
idx_total = []
for url in href_tags:
    # Close the HTTP response as soon as the page body has been parsed.
    with urllib.request.urlopen(url) as response:
        page = bs4.BeautifulSoup(response.read(), 'html.parser')

    text = []
    idx = []
    for h2 in page.find_all('h2'):
        # Drop the run of three non-breaking spaces embedded in the heading.
        heading = h2.get_text().replace('\xa0\xa0\xa0', '')
        paragraphs = []
        for elem in h2.next_siblings:
            # Stop at the next sibling tag whose name starts with 'h'
            # (text nodes have no tag name and are skipped by the first test).
            if elem.name and elem.name.startswith('h'):
                break
            if elem.name == 'p':
                paragraph = elem.get_text()
                if paragraph:
                    paragraphs.append(paragraph + '\n')
        text_j = ''.join(paragraphs)
        # Headings with no paragraph text are left out of both lists.
        if text_j:
            text.append(text_j)
            idx.append(heading)
    idx_total.append(idx)
    text_total.append(text)

In [66]:
# sample texts
# Display the list of section texts collected for the fifth crawled URL
# (index 4 of text_total); each element is the newline-joined paragraph text
# found under one <h2> heading.
text_total[4]

["(a) Affiliated person of a futures commission merchant means a person described in section 2(c)(2)(B)(i)(II)(cc)(BB) of the Act;\n(b) Aggregate retail forex assets means an amount of liquid assets held in accordance with §5.8 of this part;\n(c) Associated person of an affiliated person of a futures commission merchant means any natural person associated with an affiliated person of a futures commission merchant as a partner, officer or employee (or any natural person occupying a similar status or performing similar functions), in any capacity which involves:\n(1) The solicitation or acceptance of retail forex customers' orders (other than in a clerical capacity); or\n(2) The supervision of any person or persons so engaged;\n(d)(1) Commodity pool operator, for purposes of this part, means any person who operates or solicits funds, securities, or property for a pooled investment vehicle that is not an eligible contract participant as defined in section 1a(18) of the Act, and that engag

In [65]:
# sample headings
# Display the <h2> heading strings collected for the fifth crawled URL
# (index 4 of idx_total); entry j is the heading for text_total[4][j].
idx_total[4]

['§5.1Definitions.',
 '§5.2Prohibited transactions.',
 '§5.3Registration of persons engaged in retail forex transactions.',
 '§5.4Applicability of part 4 of this chapter to commodity pool operators and commodity trading advisors.',
 '§5.5Distribution of “Risk Disclosure Statement” by retail foreign exchange dealers, futures commission merchants and introducing brokers regarding retail forex transactions.',
 '§5.6Maintenance of minimum financial requirements by retail foreign exchange dealers and futures commission merchants offering or engaging in retail forex transactions.',
 '§5.7Minimum financial requirements for retail foreign exchange dealers and futures commission merchants offering or engaging in retail forex transactions.',
 '§5.8Aggregate retail forex assets.',
 '§5.9Security deposits for retail forex transactions.',
 '§5.10Risk assessment recordkeeping requirements for retail foreign exchange dealers.',
 '§5.11Risk assessment reporting requirements for retail foreign exchange

In [38]:
# parse the document using nlp
# Concatenate every section text from every page, in crawl order, into one
# string. A single join avoids rebuilding the growing string on each append.
s = ''.join(
    section_text
    for page_texts in text_total
    for section_text in page_texts
)

# Load the English pipeline with the parser and tagger components disabled,
# then run it over the whole corpus.
nlp = spacy.load('en', disable=['parser', 'tagger'])
doc = nlp(s)

In [39]:
# calculate word counts in documents
# Count lowercased lemmas over the parsed corpus, skipping punctuation,
# whitespace, stop words and anything that is not purely ASCII-alphabetic,
# then print the 50 most frequent lemmas.
alpha_only = re.compile('[a-zA-Z]+$')  # compiled once, reused for every token
histogram = Counter()
for token in doc:
    # The pipeline is loaded with the tagger disabled, so token.pos_ carries
    # no part-of-speech tag to test against; the lexical flags is_punct /
    # is_space do not depend on the tagger.
    if token.is_punct or token.is_space:
        continue
    lemma = token.lemma_.lower()
    if nlp.vocab[lemma].is_stop:
        continue
    if alpha_only.match(lemma):
        histogram[lemma] += 1

# convert histogram into two parallel sequences `lemmas_unsorted` and
# `counts_unsorted`, where the i-th lemma in `lemmas_unsorted` has raw count
# `counts_unsorted[i]`
lemmas_unsorted = list(histogram.keys())
counts_unsorted = np.array(list(histogram.values()))  # array allows fancy indexing

# compute the indices that would sort `counts_unsorted` in decreasing order
# (ascending argsort, reversed with "::-1")
sort_indices = np.argsort(counts_unsorted)[::-1]
sorted_counts = counts_unsorted[sort_indices]  # actually for later problem subparts

# Size the lemma column to the longest lemma shown (never narrower than the
# original 10 characters) so the count column stays aligned.
top_indices = sort_indices[:50]
lemma_width = max([10] + [len(lemmas_unsorted[i]) for i in top_indices])

print('Rank'.ljust(5), 'Lemma'.ljust(lemma_width), 'Raw Count')
for rank, idx in enumerate(top_indices):
    print(str(rank + 1).ljust(5), lemmas_unsorted[idx].ljust(lemma_width), counts_unsorted[idx])
########################################################################

Rank  Lemma      Raw Count
1     commission 3111
2     swap       2640
3     section    2055
4     future     1883
5     shall      1638
6     person     1602
7     merchant   1489
8     paragraph  1304
9     commodity  1186
10    dealer     1114
11    trade      1078
12    pool       1044
13    customer   1035
14    act        1010
15    account    1009
16    exchange   935
17    require    857
18    include    854
19    organization 822
20    file       813
21    entity     740
22    provide    734
23    report     730
24    participant 723
25    clear      717
26    contract   709
27    transaction 694
28    registrant 681
29    pursuant   680
30    security   674
31    foreign    645
32    requirement 623
33    retail     612
34    b          610
35    market     603
36    fr         602
37    notice     593
38    major      592
39    rule       588
40    business   580
41    day        579
42    date       574
43    register   565
44    ii         564
45    information 562
46    r

In [59]:
# entities with ORG Label
# Tally the lemma of every ORG entity in the document; entities whose lemma
# is empty are ignored.
c = Counter(
    ent.lemma_
    for ent in doc.ents
    if ent.lemma_ and ent.label_ == 'ORG'
)

# Parallel views of the tally: lemmas[i] occurred counts[i] times.
lemmas = list(c.keys())
counts = list(c.values())
counts_unsorted = np.array(counts)  # array form so it can be indexed by an index array

# Index order that lists the counts from largest to smallest
# (ascending argsort, then reversed).
sort_indices = np.argsort(counts_unsorted)[::-1]
sorted_counts = counts_unsorted[sort_indices]  # kept for later problem subparts

# Print the 50 most frequent entries as a fixed-width table.
print('Rank'.ljust(5), 'Lemma'.ljust(50), 'Raw Count')
for rank, idx in enumerate(sort_indices[:50], start=1):
    print(str(rank).ljust(5), lemmas[idx].ljust(50), counts_unsorted[idx])
########################################################################

Rank  Lemma                                              Raw Count
1     Commission                                         1112
2     the National Futures Association                   249
3     the Securities and Exchange Commission             142
4     State                                              51
5     Cleared Swaps Customer Collateral                  48
6     the Disclosure Document                            46
7     NFA                                                42
8     the Division of Enforcement                        39
9     Special Entity                                     33
10    Cleared Swaps Customers                            32
11    the Division of Swap Dealer                        32
12    Federal                                            26
13    the Proceedings Clerk                              26
14    CFR                                                25
15    Cleared Swaps Customer                             24
16    the Office of Managemen

In [60]:
# entities with LAW Label
# Tally the lemma of every LAW entity in the document; entities whose lemma
# is empty are ignored.
c = Counter(
    ent.lemma_
    for ent in doc.ents
    if ent.lemma_ and ent.label_ == 'LAW'
)

# Parallel views of the tally: lemmas[i] occurred counts[i] times.
lemmas = list(c.keys())
counts = list(c.values())
counts_unsorted = np.array(counts)  # array form so it can be indexed by an index array

# Index order that lists the counts from largest to smallest
# (ascending argsort, then reversed).
sort_indices = np.argsort(counts_unsorted)[::-1]
sorted_counts = counts_unsorted[sort_indices]  # kept for later problem subparts

# Print the 50 most frequent entries as a fixed-width table.
print('Rank'.ljust(5), 'Lemma'.ljust(50), 'Raw Count')
for rank, idx in enumerate(sort_indices[:50], start=1):
    print(str(rank).ljust(5), lemmas[idx].ljust(50), counts_unsorted[idx])
########################################################################

Rank  Lemma                                              Raw Count
1     Act                                                156
2     the Securities Exchange Act                        84
3     the Commodity Exchange Act                         58
4     the Securities Act                                 27
5     the Administrative Law Judge                       18
6     the Employee Retirement Income Security Act        13
7     the Disclosure Document                            12
8     the Investment Company Act                         8
9     the Futures Trading Act                            6
10    the Investment Advisers Act                        5
11    Commission regulation                              4
12    Section 3 of                                       4
13    a Disclosure Document                              4
14    the Commodity Exchange Act and Commission regulation 4
15    section 5 of the Bank Holding Company Act          4
16    the Federal Reserve Act         

In [61]:
# entities with PERCENT label
# Tally the lemma of every PERCENT entity in the document; entities whose
# lemma is empty are ignored.
c = Counter(
    ent.lemma_
    for ent in doc.ents
    if ent.lemma_ and ent.label_ == 'PERCENT'
)

# Parallel views of the tally: lemmas[i] occurred counts[i] times.
lemmas = list(c.keys())
counts = list(c.values())
counts_unsorted = np.array(counts)  # array form so it can be indexed by an index array

# Index order that lists the counts from largest to smallest
# (ascending argsort, then reversed).
sort_indices = np.argsort(counts_unsorted)[::-1]
sorted_counts = counts_unsorted[sort_indices]  # kept for later problem subparts

# Print the 50 most frequent entries as a fixed-width table.
print('Rank'.ljust(5), 'Lemma'.ljust(50), 'Raw Count')
for rank, idx in enumerate(sort_indices[:50], start=1):
    print(str(rank).ljust(5), lemmas[idx].ljust(50), counts_unsorted[idx])
########################################################################

Rank  Lemma                                              Raw Count
1     ten percent                                        21
2     120 percent                                        19
3     50 percent                                         11
4     25 percent                                         7
5     20 percent                                         6
6     five percent                                       6
7     little than five percent                           6
8     10 percent                                         6
9     30 percent                                         4
10    much than 50 percent                               4
11    100 percent                                        4
12    110 percent                                        4
13    little than 10 percent                             3
14    200 percent                                        3
15    5 %                                                3
16    little than 30 percent                 

In [62]:
# entities with MONEY label
# Tally the lemma of every MONEY entity in the document; entities whose lemma
# is empty are ignored.
c = Counter(
    ent.lemma_
    for ent in doc.ents
    if ent.lemma_ and ent.label_ == 'MONEY'
)

# Parallel views of the tally: lemmas[i] occurred counts[i] times.
lemmas = list(c.keys())
counts = list(c.values())
counts_unsorted = np.array(counts)  # array form so it can be indexed by an index array

# Index order that lists the counts from largest to smallest
# (ascending argsort, then reversed).
sort_indices = np.argsort(counts_unsorted)[::-1]
sorted_counts = counts_unsorted[sort_indices]  # kept for later problem subparts

# Print the 50 most frequent entries as a fixed-width table.
print('Rank'.ljust(5), 'Lemma'.ljust(50), 'Raw Count')
for rank, idx in enumerate(sort_indices[:50], start=1):
    print(str(rank).ljust(5), lemmas[idx].ljust(50), counts_unsorted[idx])
########################################################################

Rank  Lemma                                              Raw Count
1     5,000,000                                          9
2     $ 1 billion                                        7
3     500,000                                            5
4     31 +                                               4
5     $ 2 billion                                        4
6     20,000,000                                         3
7     31 + 28 +                                          3
8     little than $ 1 billion                            3
9     little than $ 5,000,000                            2
10    little than $ 6,250,000                            2
11    $ 5 million                                        2
12    $ 8 billion                                        2
13    50,000                                             2
14    $ 700 million                                      2
15    at little $ 1 billion                              2
16    1,000,000                                 

In [63]:
# entities with PERSON label
# Tally the lemma of every PERSON entity in the document; entities whose
# lemma is empty are ignored.
c = Counter(
    ent.lemma_
    for ent in doc.ents
    if ent.lemma_ and ent.label_ == 'PERSON'
)

# Parallel views of the tally: lemmas[i] occurred counts[i] times.
lemmas = list(c.keys())
counts = list(c.values())
counts_unsorted = np.array(counts)  # array form so it can be indexed by an index array

# Index order that lists the counts from largest to smallest
# (ascending argsort, then reversed).
sort_indices = np.argsort(counts_unsorted)[::-1]
sorted_counts = counts_unsorted[sort_indices]  # kept for later problem subparts

# Print the 50 most frequent entries as a fixed-width table.
print('Rank'.ljust(5), 'Lemma'.ljust(50), 'Raw Count')
for rank, idx in enumerate(sort_indices[:50], start=1):
    print(str(rank).ljust(5), lemmas[idx].ljust(50), counts_unsorted[idx])
########################################################################

Rank  Lemma                                              Raw Count
1     Provided                                           126
2     Intermediary Oversight                             37
3     c)(1                                               33
4     U.S.C.                                             28
5     a)(1                                               20
6     g)(1                                               13
7     Cleared Swaps                                      11
8     e)(1                                               10
9     April                                              9
10    Material Affiliated                                8
11    c)(5)(x                                            7
12    Rule 240.15c3 - 1(c)(2)(vi                         7
13    U.S.C. 78c(a)(12                                   6
14    2(a)(1                                             5
15    a)(2)(viii)(A)(1                                   4
16    b)(5                             

In [64]:
# entities with DATE label
# Tally the lemma of every DATE entity in the document; entities whose lemma
# is empty are ignored.
c = Counter(
    ent.lemma_
    for ent in doc.ents
    if ent.lemma_ and ent.label_ == 'DATE'
)

# Parallel views of the tally: lemmas[i] occurred counts[i] times.
lemmas = list(c.keys())
counts = list(c.values())
counts_unsorted = np.array(counts)  # array form so it can be indexed by an index array

# Index order that lists the counts from largest to smallest
# (ascending argsort, then reversed).
sort_indices = np.argsort(counts_unsorted)[::-1]
sorted_counts = counts_unsorted[sort_indices]  # kept for later problem subparts

# Print the 50 most frequent entries as a fixed-width table.
print('Rank'.ljust(5), 'Lemma'.ljust(50), 'Raw Count')
for rank, idx in enumerate(sort_indices[:50], start=1):
    print(str(rank).ljust(5), lemmas[idx].ljust(50), counts_unsorted[idx])
########################################################################

Rank  Lemma                                              Raw Count
1     annual                                             91
2     1934                                               84
3     daily                                              57
4     monthly                                            44
5     fiscal year                                        43
6     the day                                            37
7     November 2 , 2012                                  33
8     thirty day                                         30
9     month                                              30
10    September 10 , 2010                                25
11    day                                                25
12    1940                                               24
13    1933                                               23
14    1974                                               21
15    77                                                 18
16    2012                       

In [None]:
# text similarity
# Two sample phrases that differ only in the number (20 vs. 10). This cell
# only defines them; no similarity is computed in the lines shown here.
text1 = 'more than 20 percent'
text2 = 'more than 10 percent'
