Skip to content

Commit

Permalink
Feature/soa ayesha ah (#170)
Browse files Browse the repository at this point in the history
* Statements feature added with sql dump data

Co-authored-by: Ayesha Malika Khan <ayesha.malika.khan@gmail.com>
  • Loading branch information
aih and ayeshamk committed Feb 12, 2021
1 parent c248381 commit 8f6d08d
Show file tree
Hide file tree
Showing 682 changed files with 335 additions and 14 deletions.
40 changes: 39 additions & 1 deletion README.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -194,4 +194,42 @@ image::media/bill-similarity-by-section-115hr4733.png[SectionBySectionBills,300,
.Bill-to-bill Similarity
[#img-bill-to-bill-similarity]
[caption="Figure 3: " ]
image::media/bill-similarity-compare.png[SectionBySectionBills,300,200]
image::media/bill-similarity-compare.png[SectionBySectionBills,300,200]
## Statement of Administration Policy

The metadata for the Statement of Administration Policy section has been scraped and stored in JSON files. The PDFs are stored in the media directory.

### Load Statement of Administration Policy Data to Database

- Activate the virtualenv and go to `(flatgov) ~/.../FlatGov/server_py$`
```bash
$ cd ~/.../FlatGov/server_py
$ source .venv/bin/activate

```

- Go to `(flatgov) ~/.../FlatGov/server_py/flatgov$`
```bash

$ cd flatgov

```

- Apply all migrations
```bash

./manage.py makemigrations
./manage.py migrate

```

- Load Statement of Administration Policy data
```bash

./manage.py load_statements

```
66 changes: 66 additions & 0 deletions scrapers/scrape_statements.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from bs4 import BeautifulSoup as bs4
import requests
import re
import json

# Scrape the OMB "Statements of Administration Policy" index page and write
# one metadata record per linked statement to data.json.
url = 'https://www.whitehouse.gov/omb/statements-of-administration-policy/'

# Maps the last two characters of the scraped date text to the Congress that
# was in session. NOTE(review): assumes the date text ends in a two-digit
# year -- confirm against the live page.
CONGRESS_BY_YEAR_SUFFIX = {
    '15': '114', '16': '114',
    '17': '115', '18': '115',
    '19': '116', '20': '116',
    '21': '117', '22': '117',
}

response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()

soup = bs4(response.text, 'html.parser')

con = soup.find('div', {'class': 'page-content__content editor'})
if con is None:
    raise SystemExit(
        'Could not find the statements container; the page layout may have changed.'
    )

# The first paragraph after the container is skipped, as in the original script.
ps = con.findAllNext('p')[1:]
print(len(ps))
data = []
count = 0
# Working record; date_issued (and the congress derived from it) carries over
# from the most recent heading paragraph to every statement that follows it.
new = {
    'url': '',
    'link': '',
    'link_text': '',
    'bill_number': '',
    'date_issued': '',
    'congress': '',
}


for p in ps:
    if p.text.find('Administration Policy on') != -1:
        # Heading paragraph: remember the text after the last 'on'.
        # NOTE(review): split('on') also splits inside words such as
        # "Monday" -- confirm the heading format.
        new['date_issued'] = p.text.split('on')[-1]
        continue

    anchor = p.find('a', href=True)
    if anchor is None:
        # A paragraph without a link carries no statement; skip it rather than
        # crash on None['href'].
        continue

    count += 1
    new['link'] = anchor['href']
    a_text = anchor.text
    new['link_text'] = a_text
    # Bill number: the text before the first en dash, with whitespace and dots
    # removed, then cut at the first em dash.
    compact = re.sub(r'[\s.]', '', a_text.split('–')[0])
    new['bill_number'] = compact.split('—')[0]

    year_suffix = new['date_issued'][-2:]
    print(year_suffix)
    # An unrecognised suffix yields '' instead of silently keeping the
    # congress of the previous record.
    new['congress'] = CONGRESS_BY_YEAR_SUFFIX.get(year_suffix, '')
    new['url'] = url
    # Store a copy: `new` is mutated again on the next iteration.
    data.append(dict(new))

# Write a single JSON array so the file can be read back with json.load();
# appending one object per record produced an unparseable file.
with open('data.json', 'w') as meta_write_file:
    json.dump(data, meta_write_file, indent=4)


print(count)
print(new)
16 changes: 15 additions & 1 deletion server_py/flatgov/bills/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by Django 3.1 on 2021-02-11 18:07
# Generated by Django 3.1 on 2021-02-12 18:06

from django.db import migrations, models

Expand All @@ -24,6 +24,20 @@ class Migration(migrations.Migration):
('thomas_id', models.CharField(blank=True, max_length=50, null=True)),
],
),
migrations.CreateModel(
name='Statement',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('bill_number', models.CharField(max_length=127)),
('bill_id', models.CharField(blank=True, max_length=127, null=True)),
('bill_title', models.TextField(blank=True, null=True)),
('congress', models.CharField(max_length=10)),
('date_issued', models.CharField(max_length=35)),
('permanent_pdf_link', models.FileField(blank=True, null=True, upload_to='statements/')),
('original_pdf_link', models.CharField(blank=True, max_length=255, null=True)),
('date_fetched', models.DateTimeField(auto_now_add=True)),
],
),
migrations.CreateModel(
name='Cosponsor',
fields=[
Expand Down
16 changes: 16 additions & 0 deletions server_py/flatgov/bills/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,19 @@ class Sponsor(models.Model):
def __str__(self):
return self.name



class Statement(models.Model):
    """A Statement of Administration Policy record, keyed to a bill by number and congress."""

    # Identification of the bill the statement refers to.
    bill_number = models.CharField(max_length=127)
    bill_id = models.CharField(blank=True, null=True, max_length=127)
    bill_title = models.TextField(blank=True, null=True)
    congress = models.CharField(max_length=10)

    # Issue date kept as free text (not a DateField).
    date_issued = models.CharField(max_length=35)

    # Stored copy of the PDF (uploaded under 'statements/') and the link it came from.
    permanent_pdf_link = models.FileField(null=True, blank=True, upload_to='statements/')
    original_pdf_link = models.CharField(blank=True, null=True, max_length=255)

    # Set automatically when the row is first created.
    date_fetched = models.DateTimeField(auto_now_add=True)

    def __str__(self):
        return '{} - {}'.format(self.bill_number, self.permanent_pdf_link)
9 changes: 8 additions & 1 deletion server_py/flatgov/bills/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@

from common.elastic_load import getSimilarSections, moreLikeThis, getResultBillnumbers, getInnerResults

from bills.models import Bill, Cosponsor
from bills.models import Bill, Cosponsor, Statement
from bills.tables import RelatedBillTable

from bills.serializers import RelatedBillSerializer, CosponsorSerializer

def deep_get(dictionary: Dict, *keys):
Expand Down Expand Up @@ -256,11 +257,17 @@ def get_qs_related_bill(self):
def get_context_data(self, **kwargs):
    """
    Extend the template context with the data shown on the bill detail page.

    Adds the cosponsors, related statements, related bills, similar bills and
    Elasticsearch similarity for the current bill.

    Returns:
        dict: the context built by the parent class plus the keys above
    """
    context = super().get_context_data(**kwargs)
    context['cosponsors'] = self.get_cosponsors()
    # get_related_statements is defined on this view (it reads the URL slug),
    # not on the model instance, so it must be called on self.
    context['statements'] = self.get_related_statements()
    context['related_bills'] = self.get_related_bills()
    context['similar_bills'] = self.object.get_similar_bills
    context['es_similarity'] = self.object.es_similarity
    return context


def get_related_statements(self, **kwargs):
    """
    Return the Statement rows that match the bill identified by the URL slug.

    The slug is the congress number followed by the bill number (for example
    116hr1234). It is split at the end of its leading digits, so congress
    numbers that are not exactly three digits long are handled as well.

    Args:
        **kwargs: accepted for compatibility; not used

    Returns:
        QuerySet of Statement filtered case-insensitively by bill number and congress
    """
    slug = self.kwargs['slug']
    # Everything after the leading digit run is the bill number.
    bill_number = slug.lstrip('0123456789')
    congress = slug[:len(slug) - len(bill_number)]
    return Statement.objects.filter(bill_number__iexact=bill_number).filter(congress__iexact=congress)

def get_related_bills(self):
qs = self.get_qs_related_bill()
serializer = RelatedBillSerializer(
Expand Down
112 changes: 112 additions & 0 deletions server_py/flatgov/common/cbo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
#
# Command line template from https://gist.githubusercontent.com/opie4624/3896526/raw/3aff2ad7030a74ce26f9fcf80791ae0396d84f18/commandline.py

import sys, os, argparse, logging, re, json, gzip
from typing import Dict
from functools import reduce

from common import constants, utils
from bills.models import Bill, Sponsor

import xmltodict


# Send log records at INFO level and above to billdata.log; filemode='w'
# opens the file for writing (truncating any existing content).
logging.basicConfig(filename='billdata.log', filemode='w', level='INFO')
logger = logging.getLogger(__name__)
# Also echo this module's log records to stdout.
logger.addHandler(logging.StreamHandler(sys.stdout))

def logName(dirName: str, fileName: str):
    """
    Log the directory and the file name of a file that is being processed.

    Args:
        dirName (str): directory that contains the file
        fileName (str): name of the file being processed
    """
    messages = (
        ('In directory: \t%s', dirName),
        ('Processing: \t%s', fileName),
    )
    for template, value in messages:
        logger.info(template % value)

def getBillFromDirname(dirName: str) -> str:
    """
    Build a bill identifier from a bill directory path.

    The path is matched against constants.BILL_DIR_REGEX_COMPILED and the
    captured groups are concatenated. A path of the form
    ../../congress/data/116/bills/s/s3583 is intended to give 116s3583.

    Args:
        dirName (str): path to match

    Returns:
        str: name of the bill (billCongressTypeNumber), or '' when the path
        does not match or the pattern captures no groups
    """
    matched = constants.BILL_DIR_REGEX_COMPILED.match(dirName)
    if not matched:
        return ''
    parts = matched.groups()
    if not parts:
        return ''
    return ''.join(parts)


def getTopBillLevel(dirName: str) -> bool:
    """
    Tell whether a path is the top level of a bill, e.g. ../../congress/data/116/bills/hr/hr1

    A path qualifies when its last component starts with lowercase letters
    followed by digits and the component two levels above it is 'bills'.

    Args:
        dirName (str): '/'-separated path to test

    Returns:
        bool: True if path is a top level (which will contain data.json); False otherwise
    """
    dirName_parts = dirName.split('/')
    # A top-level bill path needs at least bills/<type>/<type+number>; shorter
    # paths cannot qualify and would otherwise raise IndexError below.
    if len(dirName_parts) < 3:
        return False
    return (
        re.match(r'[a-z]+[0-9]+', dirName_parts[-1]) is not None
        and dirName_parts[-3] == 'bills'
    )

def isDataXML(fileName: str) -> bool:
    """Return True only when fileName is exactly 'fdsys_billstatus.xml'."""
    expected_name = 'fdsys_billstatus.xml'
    return fileName == expected_name

def rec_cbo_item(cbo_items):
    """
    Recursively print the contents of a CBO cost-estimate structure.

    A separator line is printed for every container visited. Iterating the
    container, each element that is a plain str is treated as a key of
    cbo_items and printed together with its value; any other element is
    descended into.

    Args:
        cbo_items: a mapping of item fields, or an iterable of such mappings
    """
    print('-----')
    for entry in cbo_items:
        if type(entry) is str:
            # entry is a key of cbo_items: show the key, then its value.
            print(entry)
            print(cbo_items[entry])
            continue
        rec_cbo_item(entry)


def collect_cbo_data_into_json(rootDir = constants.PATH_TO_CONGRESSDATA_DIR, processFile = logName, dirMatch = getTopBillLevel, fileMatch = isDataXML):
    """
    Walk the congress data tree and print the CBO cost estimates of every bill.

    For each directory accepted by dirMatch, every file accepted by fileMatch
    is parsed with xmltodict and the 'item' entry under
    billStatus -> bill -> cboCostEstimates is printed via rec_cbo_item.
    Nothing is written to disk or returned.

    Args:
        rootDir: directory to walk
        processFile: accepted for interface compatibility; currently not called
        dirMatch: predicate taking a directory path
        fileMatch: predicate taking a file name
    """
    for dirName, _subdirs, fileList in os.walk(rootDir):
        if not dirMatch(dirName):
            continue
        matching_files = list(filter(fileMatch, fileList))
        for fname in matching_files:
            cbo_file = f'{dirName}/{fname}'
            with open(cbo_file) as xml_file:
                parsed = xmltodict.parse(xml_file.read())
            estimates = parsed['billStatus']['bill']['cboCostEstimates']
            if not estimates:
                # No estimates recorded for this bill.
                continue
            rec_cbo_item(estimates['item'])


def cbo():
    """Run the CBO cost-estimate collection with its default arguments."""
    collect_cbo_data_into_json()




Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from django.core.management.base import BaseCommand
from common.statements import load_statements


class Command(BaseCommand):
    """Management command that reloads the Statement of Administration Policy data."""

    # Shown in the manage.py help output for this command. The previous text
    # described the bill-data command this file was copied from.
    help = 'Load Statement of Administration Policy data via common.statements.load_statements'

    def handle(self, *args, **options):
        """Run the statements loader; no command-specific options are read."""
        load_statements()
13 changes: 13 additions & 0 deletions server_py/flatgov/common/statements.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import json
import requests
from bills.models import Statement
import os

def load_statements():
    """
    Replace every Statement row with the contents of dumped_statements.json.

    Prints the current row count and the number of records in the fixture,
    deletes the existing rows, then loads the fixture with
    `manage.py loaddata`. Both the fixture and manage.py are looked up
    relative to the current working directory.

    Raises:
        OSError: if the fixture cannot be opened
        json.JSONDecodeError: if the fixture is not valid JSON
        subprocess.CalledProcessError: if the loaddata command fails
    """
    # Local imports keep this block self-contained.
    import subprocess
    import sys

    fixture = 'dumped_statements.json'

    print(Statement.objects.all().count())

    # Read and parse the fixture before deleting anything, so a missing or
    # corrupt file aborts while the existing rows are still intact.
    with open(fixture, 'r') as f:
        statements_data = json.load(f)
    print(len(statements_data))

    Statement.objects.all().delete()

    # Run loaddata with the current interpreter and an argument list (no
    # shell, no dependence on manage.py being executable); check=True turns a
    # failed load into an exception instead of a silently ignored exit code.
    subprocess.run([sys.executable, 'manage.py', 'loaddata', fixture], check=True)
Binary file added server_py/flatgov/dump_statement.json.zip
Binary file not shown.
1 change: 1 addition & 0 deletions server_py/flatgov/dumped_statements.json

Large diffs are not rendered by default.

15 changes: 10 additions & 5 deletions server_py/flatgov/flatgov/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,18 @@
'django.contrib.staticfiles.finders.AppDirectoriesFinder',
]

STATIC_ROOT = os.path.join(BASE_DIR, 'staticfiles')

STATIC_URL = '/static/'

STATICFILES_DIRS = (
os.path.join(BASE_DIR, 'static'),
)

if not DEBUG:
STATIC_ROOT = os.path.join(BASE_DIR, 'static')
else:
STATICFILES_DIRS = [
os.path.join(BASE_DIR, 'static')
]

MEDIA_URL = '/media/'
MEDIA_ROOT = BASE_DIR / "media"


DJANGO_TABLES2_TEMPLATE = os.path.join(BASE_DIR, 'templates/django_tables2/table.html')
Expand Down
5 changes: 3 additions & 2 deletions server_py/flatgov/flatgov/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@
from django.contrib import admin
from django.views.generic import RedirectView
from django.urls import include, path, re_path

from django.conf import settings
from django.conf.urls.static import static
urlpatterns = [
re_path(r'^$', RedirectView.as_view(url='home/', permanent=False), name='index'),
path('bills/', include('bills.urls')),
path('home/', include('home.urls')),
path('uscongress/debug/', include('uscongress.urls')),
path('admin/', admin.site.urls),
]
]+ static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit 8f6d08d

Please sign in to comment.