Skip to content

Commit

Permalink
Docs/celery update (#325)
Browse files Browse the repository at this point in the history
* Update celery tasks to use Go billmeta when available

* Document update to celery

* Index and calculate bill similarity from current congress
  • Loading branch information
aih committed Apr 23, 2021
1 parent e15411b commit b1dd627
Show file tree
Hide file tree
Showing 9 changed files with 133 additions and 53 deletions.
28 changes: 28 additions & 0 deletions README.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,34 @@ $ cd /path/to/FlatGov/server_py
$ pip install -r requirements.txt
```

#### Installing pypy as virtualenv

The application has been tested and works with pypy on ubuntu:

1. Install pypy as a pyenv virtualenv, for example
```
pyenv install pypy3.7-7.3.4
pyenv virtualenv pypy3.7-7.3.4 pypy37flat
pyenv activate pypy37flat
```

2. Upgrade pip, if appropriate
`/home/ubuntu/.pyenv/versions/pypy3.7-7.3.4/envs/pypy37flat/bin/pypy3 -m pip install --upgrade pip`

3. It may be necessary to install C libraries to build lxml
`sudo apt-get install libxml2-dev libxslt-dev python-dev`


4. Install requirements
```
cd /path/to/FlatGov/server_py
pip install -r requirements.txt
```

### Create .env file

Copy `server_py/flatgov/.env-sample` to `server_py/flatgov/.env`, and change the `SECRET_KEY` defined in that file.
Expand Down
22 changes: 20 additions & 2 deletions UPDATES_CELERY.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,25 @@ The fields in the `UscongressUpdateJob`:
+
2. Create `billList.json` and `billsMeta.json`

Once the celery task `update_bill_task` is finished (complete the download bill text and metadata), the other celery task named `bill_data_task` runs.
Once the celery task `update_bill_task` is finished (complete the download bill text and metadata), the other celery task named `bill_data_task` runs. This is triggered in the `save` function of the `uscongress.models.py`:

```python
def save(self, *args, **kwargs):
super().save(*args, **kwargs)
if self.pk and self.data_status == self.SUCCESS and self.bill_status == self.PENDING:
current_app.send_task(
'uscongress.tasks.bill_data_task',
args=(self.pk, ),
queue='bill'
)
if self.pk and self.bill_status == self.SUCCESS and self.meta_status == self.PENDING:
current_app.send_task(
'uscongress.tasks.process_bill_meta_task',
args=(self.pk, ),
queue='bill'
)
...
```

As mentioned above, whenever the `update_bill_task` runs, it creates `UscongressUpdateJob` table record in the database.

Expand All @@ -275,7 +293,7 @@ The task `bill_data_task` creates billList.json and `billsMeta.json` file with t

data.json files at the top level (not the data.json in the text versions) are used to create metadata.

We will proivde an option, which will be the default, to get bill XML from the directory structure that is created by uscongress open source scraper. (In other case, flat structure could be used to get bill XML)
We will provide an option, which will be the default, to get bill XML from the directory structure that is created by uscongress open source scraper. (In other case, flat structure could be used to get bill XML)

3. Process bill meta data.

Expand Down
6 changes: 3 additions & 3 deletions server_py/flatgov/common/bill_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ def getBillPath(billnumber: str) -> str:
billMatch = constants.BILL_NUMBER_REGEX_COMPILED.match(billnumber)
if billMatch and billMatch.groupdict():
billMatchGroups = billMatch.groupdict()
xmlDir = getXMLDirByCongress(congress=billMatchGroups.get('congress'))
bill_path = os.path.join(xmlDir, billMatchGroups.get('stage'), billMatchGroups.get('stage') + billMatchGroups.get('number'))
xmlDir = getXMLDirByCongress(congress=billMatchGroups.get('congress', ''))
bill_path = os.path.join(xmlDir, billMatchGroups.get('stage', ''), billMatchGroups.get('stage', '') + billMatchGroups.get('number', ''))
else:
raise Exception(billnumber + ': billnumber is not of the expected form')
return bill_path
Expand Down Expand Up @@ -318,7 +318,7 @@ def filterLatestVersionOnly(billFiles: List[str]):

return billFilesFiltered

CONGRESS_LIST_DEFAULT = [str(congressNum) for congressNum in range(115, 118)]
CONGRESS_LIST_DEFAULT = [str(congressNum) for congressNum in range(constants.CURRENT_CONGRESS, (constants.CURRENT_CONGRESS-2), -1)]
def processBills(congresses: list=CONGRESS_LIST_DEFAULT, docType: str='dtd', uscongress: bool=False):
number_of_bills_total = 0
for congress in congresses:
Expand Down
1 change: 1 addition & 0 deletions server_py/flatgov/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
BILL_FULL_MAPPING = json.load(f)

PATH_TO_BILLS_META = settings.PATH_TO_BILLS_META
BILLMETA_GO_CMD = settings.BILLMETA_GO_CMD
PATH_TO_BILLS_META_GO = settings.PATH_TO_BILLS_META_GO
PATH_TO_CONGRESSDATA_DIR = settings.CONGRESS_DATA_PATH
PATH_TO_DATA_DIR = settings.PATH_TO_DATA_DIR
Expand Down
8 changes: 4 additions & 4 deletions server_py/flatgov/common/elastic_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ def indexBill(bill_path: str=PATH_BILL, index_types: list=['sections']):
billnumber = ''
if billMatch:
billMatchGroup = billMatch.groupdict()
billnumber = billMatchGroup.get('congress') + billMatchGroup.get('stage') + billMatchGroup.get('number')
billversion = billMatchGroup.get('version')
billnumber = billMatchGroup.get('congress', '') + billMatchGroup.get('stage', '') + billMatchGroup.get('number', '')
billversion = billMatchGroup.get('version', '')
sections = billTree.xpath('//section')
headers = billTree.xpath('//header')
from collections import OrderedDict
Expand Down Expand Up @@ -163,14 +163,14 @@ def get_bill_xml(congressDir: str, uscongress: bool = True) -> list:

xml_files = list()
USCONGRESS_XML_FILE = settings.USCONGRESS_XML_FILE
for root, dirs, files in os.walk(congressDir):
for root, _, files in os.walk(congressDir):
if USCONGRESS_XML_FILE in files:
xml_path = os.path.join(root, USCONGRESS_XML_FILE)
xml_files.append(xml_path)
return xml_files


CONGRESS_LIST_DEFAULT = [str(congressNum) for congressNum in range(115, 118)]
CONGRESS_LIST_DEFAULT = [str(congressNum) for congressNum in range(constants.CURRENT_CONGRESS, (constants.CURRENT_CONGRESS-2), -1)]
def indexBills(congresses: list=CONGRESS_LIST_DEFAULT, docType: str='dtd', uscongress: bool=False, index_types: list=['sections']):
number_of_bills_total = 0
for congress in congresses:
Expand Down
1 change: 1 addition & 0 deletions server_py/flatgov/flatgov/dev.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@

PATH_TO_BILLS_META = os.path.join(BASE_DIR, 'billsMeta.json')
PATH_TO_BILLS_META_GO = os.path.join(BASE_DIR, 'billMetaGo.json')
BILLMETA_GO_CMD = 'billmeta'
PATH_TO_CONGRESSDATA_DIR = CONGRESS_DATA_PATH
PATH_TO_DATA_DIR = os.getenv('PATH_TO_DATA_DIR', os.path.join('/', *"/usr/local/share/xcential/public/data".split('/')))
PATH_TO_CONGRESSDATA_XML_DIR = os.getenv('PATH_TO_CONGRESSDATA_XML_DIR', os.path.join('/', *"/usr/local/share/xcential/public/data/116/dtd".split('/')))
Expand Down
7 changes: 4 additions & 3 deletions server_py/flatgov/uscongress/helper.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from typing import Tuple
from django.conf import settings
from elasticsearch import exceptions, Elasticsearch
from common import constants
Expand Down Expand Up @@ -65,12 +66,12 @@ def add_bill_meta(dirName: str, fileName: str):
return bill_congress_type_number, related_dict


def update_bills_meta(bill):
def update_bills_meta(bill: str) -> Tuple[str, dict, bool]:
bill_dir = get_bill_dir(bill)
if not validate_bill_dir(bill_dir, BILL_JSON):
return False
return '', {}, True
bill_congress_type_number, related_dict = add_bill_meta(bill_dir, BILL_JSON)
return bill_congress_type_number, related_dict
return bill_congress_type_number, related_dict, False


def create_es_index(index: str='billsections', body: dict=constants.BILLSECTION_MAPPING):
Expand Down
69 changes: 37 additions & 32 deletions server_py/flatgov/uscongress/models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from celery import current_app
from django.db import models


class UscongressUpdateJob(models.Model):
PENDING = 'pending'
SUCCESS = 'success'
Expand All @@ -12,13 +13,27 @@ class UscongressUpdateJob(models.Model):
)

job_id = models.CharField(max_length=50, blank=True, null=True)
fdsys_status = models.CharField(choices=STATUS, default=PENDING, max_length=20)
data_status = models.CharField(choices=STATUS, default=PENDING, max_length=20)
bill_status = models.CharField(choices=STATUS, default=PENDING, max_length=20)
meta_status = models.CharField(choices=STATUS, default=PENDING, max_length=20)
related_status = models.CharField(choices=STATUS, default=PENDING, max_length=20)
elastic_status = models.CharField(choices=STATUS, default=PENDING, max_length=20)
similarity_status = models.CharField(choices=STATUS, default=PENDING, max_length=20)
fdsys_status = models.CharField(choices=STATUS,
default=PENDING,
max_length=20)
data_status = models.CharField(choices=STATUS,
default=PENDING,
max_length=20)
bill_status = models.CharField(choices=STATUS,
default=PENDING,
max_length=20)
meta_status = models.CharField(choices=STATUS,
default=PENDING,
max_length=20)
related_status = models.CharField(choices=STATUS,
default=PENDING,
max_length=20)
elastic_status = models.CharField(choices=STATUS,
default=PENDING,
max_length=20)
similarity_status = models.CharField(choices=STATUS,
default=PENDING,
max_length=20)

saved = models.JSONField(default=list, blank=True, null=True)
skips = models.JSONField(default=list, blank=True, null=True)
Expand All @@ -34,38 +49,28 @@ def __str__(self):
def save(self, *args, **kwargs):
super().save(*args, **kwargs)
if self.pk and self.data_status == self.SUCCESS and self.bill_status == self.PENDING:
current_app.send_task(
'uscongress.tasks.bill_data_task',
args=(self.pk, ),
queue='bill'
)
current_app.send_task('uscongress.tasks.bill_data_task',
args=(self.pk, ),
queue='bill')
if self.pk and self.bill_status == self.SUCCESS and self.meta_status == self.PENDING:
current_app.send_task(
'uscongress.tasks.process_bill_meta_task',
args=(self.pk, ),
queue='bill'
)
current_app.send_task('uscongress.tasks.process_bill_meta_task',
args=(self.pk, ),
queue='bill')

if self.pk and self.meta_status == self.SUCCESS and self.related_status == self.PENDING:
current_app.send_task(
'uscongress.tasks.related_bill_task',
args=(self.pk, ),
queue='bill'
)
current_app.send_task('uscongress.tasks.related_bill_task',
args=(self.pk, ),
queue='bill')

if self.pk and self.related_status == self.SUCCESS and self.elastic_status == self.PENDING:
current_app.send_task(
'uscongress.tasks.elastic_load_task',
args=(self.pk, ),
queue='bill'
)
current_app.send_task('uscongress.tasks.elastic_load_task',
args=(self.pk, ),
queue='bill')

if self.pk and self.elastic_status == self.SUCCESS and self.similarity_status == self.PENDING:
current_app.send_task(
'uscongress.tasks.bill_similarity_task',
args=(self.pk, ),
queue='bill'
)
current_app.send_task('uscongress.tasks.bill_similarity_task',
args=(self.pk, ),
queue='bill')

@property
def get_saved_bill_list(self):
Expand Down
44 changes: 35 additions & 9 deletions server_py/flatgov/uscongress/tasks.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os
import subprocess
import shutil
from celery import shared_task, current_app
from uscongress.models import UscongressUpdateJob
from uscongress.handlers import govinfo, bills
Expand All @@ -7,7 +10,7 @@
update_bills_meta,
es_similarity_bill,
)
from common.billdata import saveBillsMeta
from common.billdata import saveBillsMeta, saveBillsMetaToDb
from common.process_bill_meta import makeAndSaveTitlesIndex
from common.elastic_load import (
refreshIndices,
Expand All @@ -16,6 +19,9 @@
getInnerResults,
)

from django.conf import settings
from common.constants import BILLMETA_GO_CMD, PATH_TO_BILLS_META_GO

GOVINFO_OPTIONS = {
'collections': 'BILLS',
'bulkdata': 'BILLSTATUS',
Expand All @@ -30,13 +36,17 @@
def update_bill_task(self):
history = UscongressUpdateJob.objects.create(job_id=self.request.id)
try:
# Downloads files from Govinfo
# The govinfo.py file is copied from the uscongress repository
govinfo.run(GOVINFO_OPTIONS)
history.fdsys_status = UscongressUpdateJob.SUCCESS
history.save(update_fields=['fdsys_status'])
except Exception as e:
history.fdsys_status = UscongressUpdateJob.FAILED
history.save(update_fields=['fdsys_status'])
try:
# Creates data.json from the downloaded files
# The bills.py file is copied from the uscongress repository
processed = bills.run(BILLS_OPTIONS)
history.data_status = UscongressUpdateJob.SUCCESS
history.saved = processed.get('saved')
Expand All @@ -47,16 +57,24 @@ def update_bill_task(self):
history.save(update_fields=['data_status'])
return history.id

def update_bills_meta_go():
subprocess.run([BILLMETA_GO_CMD, '-p', settings.BASE_DIR])
saveBillsMetaToDb()

@shared_task(bind=True)
def bill_data_task(self, pk):
bills_meta = dict()
history = UscongressUpdateJob.objects.get(pk=pk)
try:
for bill_id in history.saved:
bill_congress_type_number, related_dict = update_bills_meta(bill_id)
bills_meta[bill_congress_type_number] = related_dict
saveBillsMeta(bills_meta)
if shutil.which(BILLMETA_GO_CMD) is not None:
update_bills_meta_go()
else:
for bill_id in history.saved:
bill_congress_type_number, related_dict, err = update_bills_meta(bill_id)
if err:
continue
bills_meta[bill_congress_type_number] = related_dict
saveBillsMeta(bills_meta)
history.bill_status = UscongressUpdateJob.SUCCESS
history.save(update_fields=['bill_status'])
except Exception as e:
Expand All @@ -68,7 +86,11 @@ def bill_data_task(self, pk):
def process_bill_meta_task(self, pk):
history = UscongressUpdateJob.objects.get(pk=pk)
try:
makeAndSaveTitlesIndex()
# The Go version of update_bills_meta includes this task
if shutil.which(BILLMETA_GO_CMD) is not None:
pass
else:
makeAndSaveTitlesIndex()
history.meta_status = UscongressUpdateJob.SUCCESS
history.save(update_fields=['meta_status'])
except Exception as e:
Expand All @@ -81,7 +103,11 @@ def related_bill_task(self, pk):
from common.relatedBills import makeAndSaveRelatedBills
history = UscongressUpdateJob.objects.get(pk=pk)
try:
makeAndSaveRelatedBills()
# The Go version of update_bills_meta includes this task
if not os.path.exists(PATH_TO_BILLS_META_GO):
makeAndSaveRelatedBills()
else:
pass
history.related_status = UscongressUpdateJob.SUCCESS
history.save(update_fields=['related_status'])
except Exception as e:
Expand All @@ -91,8 +117,8 @@ def related_bill_task(self, pk):

@shared_task(bind=True)
def elastic_load_task(self, pk):
history = UscongressUpdateJob.objects.get(pk=pk)
try:
history = UscongressUpdateJob.objects.get(pk=pk)
created = create_es_index()
for bill_id in history.saved:
res = es_index_bill(bill_id)
Expand All @@ -109,8 +135,8 @@ def elastic_load_task(self, pk):

@shared_task(bind=True)
def bill_similarity_task(self, pk):
history = UscongressUpdateJob.objects.get(pk=pk)
try:
history = UscongressUpdateJob.objects.get(pk=pk)
for bill_id in history.saved:
res = es_similarity_bill(bill_id)
history.similarity_status = UscongressUpdateJob.SUCCESS
Expand Down

0 comments on commit b1dd627

Please sign in to comment.