From b1dd62701cea2360ece4462fe129bb9449ba9332 Mon Sep 17 00:00:00 2001 From: Ari Hershowitz Date: Thu, 22 Apr 2021 20:25:14 -0700 Subject: [PATCH] Docs/celery update (#325) * Update celery tasks to use Go billmeta when available * Document update to celery * Index and calculate bill similarity from current congress --- README.adoc | 28 +++++++++ UPDATES_CELERY.adoc | 22 ++++++- server_py/flatgov/common/bill_similarity.py | 6 +- server_py/flatgov/common/constants.py | 1 + server_py/flatgov/common/elastic_load.py | 8 +-- server_py/flatgov/flatgov/dev.py | 1 + server_py/flatgov/uscongress/helper.py | 7 ++- server_py/flatgov/uscongress/models.py | 69 +++++++++++---------- server_py/flatgov/uscongress/tasks.py | 44 ++++++++++--- 9 files changed, 133 insertions(+), 53 deletions(-) diff --git a/README.adoc b/README.adoc index 7b87698..e4ad66a 100644 --- a/README.adoc +++ b/README.adoc @@ -60,6 +60,34 @@ $ cd /path/to/FlatGov/server_py $ pip install -r requirements.txt ``` +#### Installing pypy as virtualenv + +The application has been tested and works with pypy on ubuntu: + +1. Install pypy as a pyenv virtualenv, for example + +``` +pyenv install pypy3.7-7.3.4 +pyenv virtualenv pypy3.7-7.3.4 pypy37flat +pyenv activate pypy37flat +``` + +2. Upgrade pip, if appropriate + +`/home/ubuntu/.pyenv/versions/pypy3.7-7.3.4/envs/pypy37flat/bin/pypy3 -m pip install --upgrade pip` + +3. It may be necessary to install C libraries to build lxml + +`sudo apt-get install libxml2-dev libxslt-dev python-dev` + + +4. Install requirements + +``` +cd /path/to/FlatGov/server_py +pip install -r requirements.txt +``` + ### Create .env file Copy `server_py/flatgov/.env-sample` to `server_py/flatgov/.env`, and change the `SECRET_KEY` defined in that file. diff --git a/UPDATES_CELERY.adoc b/UPDATES_CELERY.adoc index c60206c..94dbf35 100644 --- a/UPDATES_CELERY.adoc +++ b/UPDATES_CELERY.adoc @@ -265,7 +265,25 @@ The fields in the `UscongressUpdateJob`: + 2. Create `billList.json` and `billsMeta.json` -Once the celery task `update_bill_task` is finished (complete the download bill text and metadata), the other celery task named `bill_data_task` runs. +Once the celery task `update_bill_task` is finished (complete the download bill text and metadata), the other celery task named `bill_data_task` runs. This is triggered in the `save` function of the `uscongress.models.py`: + +```python + def save(self, *args, **kwargs): + super().save(*args, **kwargs) + if self.pk and self.data_status == self.SUCCESS and self.bill_status == self.PENDING: + current_app.send_task( + 'uscongress.tasks.bill_data_task', + args=(self.pk, ), + queue='bill' + ) + if self.pk and self.bill_status == self.SUCCESS and self.meta_status == self.PENDING: + current_app.send_task( + 'uscongress.tasks.process_bill_meta_task', + args=(self.pk, ), + queue='bill' + ) + ... +``` As mentioned above, whenever the `update_bill_task` runs, it creates `UscongressUpdateJob` table record in the database. @@ -275,7 +293,7 @@ The task `bill_data_task` creates billList.json and `billsMeta.json` file with t data.json files at the top level (not the data.json in the text versions) are used to create metadata. -We will proivde an option, which will be the default, to get bill XML from the directory structure that is created by uscongress open source scraper. (In other case, flat structure could be used to get bill XML) +We will provide an option, which will be the default, to get bill XML from the directory structure that is created by uscongress open source scraper. (In other case, flat structure could be used to get bill XML) 3. Process bill meta data. diff --git a/server_py/flatgov/common/bill_similarity.py b/server_py/flatgov/common/bill_similarity.py index 45e1e90..b9b14d0 100644 --- a/server_py/flatgov/common/bill_similarity.py +++ b/server_py/flatgov/common/bill_similarity.py @@ -101,8 +101,8 @@ def getBillPath(billnumber: str) -> str: billMatch = constants.BILL_NUMBER_REGEX_COMPILED.match(billnumber) if billMatch and billMatch.groupdict(): billMatchGroups = billMatch.groupdict() - xmlDir = getXMLDirByCongress(congress=billMatchGroups.get('congress')) - bill_path = os.path.join(xmlDir, billMatchGroups.get('stage'), billMatchGroups.get('stage') + billMatchGroups.get('number')) + xmlDir = getXMLDirByCongress(congress=billMatchGroups.get('congress', '')) + bill_path = os.path.join(xmlDir, billMatchGroups.get('stage', ''), billMatchGroups.get('stage', '') + billMatchGroups.get('number', '')) else: raise Exception(billnumber + ': billnumber is not of the expected form') return bill_path @@ -318,7 +318,7 @@ def filterLatestVersionOnly(billFiles: List[str]): return billFilesFiltered -CONGRESS_LIST_DEFAULT = [str(congressNum) for congressNum in range(115, 118)] +CONGRESS_LIST_DEFAULT = [str(congressNum) for congressNum in range(constants.CURRENT_CONGRESS, (constants.CURRENT_CONGRESS-2), -1)] def processBills(congresses: list=CONGRESS_LIST_DEFAULT, docType: str='dtd', uscongress: bool=False): number_of_bills_total = 0 for congress in congresses: diff --git a/server_py/flatgov/common/constants.py b/server_py/flatgov/common/constants.py index a5410ec..5c385a3 100644 --- a/server_py/flatgov/common/constants.py +++ b/server_py/flatgov/common/constants.py @@ -35,6 +35,7 @@ BILL_FULL_MAPPING = json.load(f) PATH_TO_BILLS_META = settings.PATH_TO_BILLS_META +BILLMETA_GO_CMD = settings.BILLMETA_GO_CMD PATH_TO_BILLS_META_GO = settings.PATH_TO_BILLS_META_GO PATH_TO_CONGRESSDATA_DIR = settings.CONGRESS_DATA_PATH PATH_TO_DATA_DIR = settings.PATH_TO_DATA_DIR diff --git a/server_py/flatgov/common/elastic_load.py b/server_py/flatgov/common/elastic_load.py index 4db2ebe..e911bcd 100644 --- a/server_py/flatgov/common/elastic_load.py +++ b/server_py/flatgov/common/elastic_load.py @@ -92,8 +92,8 @@ def indexBill(bill_path: str=PATH_BILL, index_types: list=['sections']): billnumber = '' if billMatch: billMatchGroup = billMatch.groupdict() - billnumber = billMatchGroup.get('congress') + billMatchGroup.get('stage') + billMatchGroup.get('number') - billversion = billMatchGroup.get('version') + billnumber = billMatchGroup.get('congress', '') + billMatchGroup.get('stage', '') + billMatchGroup.get('number', '') + billversion = billMatchGroup.get('version', '') sections = billTree.xpath('//section') headers = billTree.xpath('//header') from collections import OrderedDict @@ -163,14 +163,14 @@ def get_bill_xml(congressDir: str, uscongress: bool = True) -> list: xml_files = list() USCONGRESS_XML_FILE = settings.USCONGRESS_XML_FILE - for root, dirs, files in os.walk(congressDir): + for root, _, files in os.walk(congressDir): if USCONGRESS_XML_FILE in files: xml_path = os.path.join(root, USCONGRESS_XML_FILE) xml_files.append(xml_path) return xml_files -CONGRESS_LIST_DEFAULT = [str(congressNum) for congressNum in range(115, 118)] +CONGRESS_LIST_DEFAULT = [str(congressNum) for congressNum in range(constants.CURRENT_CONGRESS, (constants.CURRENT_CONGRESS-2), -1)] def indexBills(congresses: list=CONGRESS_LIST_DEFAULT, docType: str='dtd', uscongress: bool=False, index_types: list=['sections']): number_of_bills_total = 0 for congress in congresses: diff --git a/server_py/flatgov/flatgov/dev.py b/server_py/flatgov/flatgov/dev.py index fbd00ed..2c58bb0 100644 --- a/server_py/flatgov/flatgov/dev.py +++ b/server_py/flatgov/flatgov/dev.py @@ -41,6 +41,7 @@ PATH_TO_BILLS_META = os.path.join(BASE_DIR, 'billsMeta.json') PATH_TO_BILLS_META_GO = os.path.join(BASE_DIR, 'billMetaGo.json') +BILLMETA_GO_CMD = 'billmeta' PATH_TO_CONGRESSDATA_DIR = CONGRESS_DATA_PATH PATH_TO_DATA_DIR = os.getenv('PATH_TO_DATA_DIR', os.path.join('/', *"/usr/local/share/xcential/public/data".split('/'))) PATH_TO_CONGRESSDATA_XML_DIR = os.getenv('PATH_TO_CONGRESSDATA_XML_DIR', os.path.join('/', *"/usr/local/share/xcential/public/data/116/dtd".split('/'))) diff --git a/server_py/flatgov/uscongress/helper.py b/server_py/flatgov/uscongress/helper.py index 4e26217..a1f2596 100644 --- a/server_py/flatgov/uscongress/helper.py +++ b/server_py/flatgov/uscongress/helper.py @@ -1,4 +1,5 @@ import os +from typing import Tuple from django.conf import settings from elasticsearch import exceptions, Elasticsearch from common import constants @@ -65,12 +66,12 @@ def add_bill_meta(dirName: str, fileName: str): return bill_congress_type_number, related_dict -def update_bills_meta(bill): +def update_bills_meta(bill: str) -> Tuple[str, dict, bool]: bill_dir = get_bill_dir(bill) if not validate_bill_dir(bill_dir, BILL_JSON): - return False + return '', {}, True bill_congress_type_number, related_dict = add_bill_meta(bill_dir, BILL_JSON) - return bill_congress_type_number, related_dict + return bill_congress_type_number, related_dict, False def create_es_index(index: str='billsections', body: dict=constants.BILLSECTION_MAPPING): diff --git a/server_py/flatgov/uscongress/models.py b/server_py/flatgov/uscongress/models.py index 68d80f4..19d2f2c 100644 --- a/server_py/flatgov/uscongress/models.py +++ b/server_py/flatgov/uscongress/models.py @@ -1,6 +1,7 @@ from celery import current_app from django.db import models + class UscongressUpdateJob(models.Model): PENDING = 'pending' SUCCESS = 'success' @@ -12,13 +13,27 @@ class UscongressUpdateJob(models.Model): ) job_id = models.CharField(max_length=50, blank=True, null=True) - fdsys_status = models.CharField(choices=STATUS, default=PENDING, max_length=20) - data_status = models.CharField(choices=STATUS, default=PENDING, max_length=20) - bill_status = models.CharField(choices=STATUS, default=PENDING, max_length=20) - meta_status = models.CharField(choices=STATUS, default=PENDING, max_length=20) - related_status = models.CharField(choices=STATUS, default=PENDING, max_length=20) - elastic_status = models.CharField(choices=STATUS, default=PENDING, max_length=20) - similarity_status = models.CharField(choices=STATUS, default=PENDING, max_length=20) + fdsys_status = models.CharField(choices=STATUS, + default=PENDING, + max_length=20) + data_status = models.CharField(choices=STATUS, + default=PENDING, + max_length=20) + bill_status = models.CharField(choices=STATUS, + default=PENDING, + max_length=20) + meta_status = models.CharField(choices=STATUS, + default=PENDING, + max_length=20) + related_status = models.CharField(choices=STATUS, + default=PENDING, + max_length=20) + elastic_status = models.CharField(choices=STATUS, + default=PENDING, + max_length=20) + similarity_status = models.CharField(choices=STATUS, + default=PENDING, + max_length=20) saved = models.JSONField(default=list, blank=True, null=True) skips = models.JSONField(default=list, blank=True, null=True) @@ -34,38 +49,28 @@ def __str__(self): def save(self, *args, **kwargs): super().save(*args, **kwargs) if self.pk and self.data_status == self.SUCCESS and self.bill_status == self.PENDING: - current_app.send_task( - 'uscongress.tasks.bill_data_task', - args=(self.pk, ), - queue='bill' - ) + current_app.send_task('uscongress.tasks.bill_data_task', + args=(self.pk, ), + queue='bill') if self.pk and self.bill_status == self.SUCCESS and self.meta_status == self.PENDING: - current_app.send_task( - 'uscongress.tasks.process_bill_meta_task', - args=(self.pk, ), - queue='bill' - ) + current_app.send_task('uscongress.tasks.process_bill_meta_task', + args=(self.pk, ), + queue='bill') if self.pk and self.meta_status == self.SUCCESS and self.related_status == self.PENDING: - current_app.send_task( - 'uscongress.tasks.related_bill_task', - args=(self.pk, ), - queue='bill' - ) + current_app.send_task('uscongress.tasks.related_bill_task', + args=(self.pk, ), + queue='bill') if self.pk and self.related_status == self.SUCCESS and self.elastic_status == self.PENDING: - current_app.send_task( - 'uscongress.tasks.elastic_load_task', - args=(self.pk, ), - queue='bill' - ) + current_app.send_task('uscongress.tasks.elastic_load_task', + args=(self.pk, ), + queue='bill') if self.pk and self.elastic_status == self.SUCCESS and self.similarity_status == self.PENDING: - current_app.send_task( - 'uscongress.tasks.bill_similarity_task', - args=(self.pk, ), - queue='bill' - ) + current_app.send_task('uscongress.tasks.bill_similarity_task', + args=(self.pk, ), + queue='bill') @property def get_saved_bill_list(self): diff --git a/server_py/flatgov/uscongress/tasks.py b/server_py/flatgov/uscongress/tasks.py index 78c8df7..a219bf0 100644 --- a/server_py/flatgov/uscongress/tasks.py +++ b/server_py/flatgov/uscongress/tasks.py @@ -1,3 +1,6 @@ +import os +import subprocess +import shutil from celery import shared_task, current_app from uscongress.models import UscongressUpdateJob from uscongress.handlers import govinfo, bills @@ -7,7 +10,7 @@ update_bills_meta, es_similarity_bill, ) -from common.billdata import saveBillsMeta +from common.billdata import saveBillsMeta, saveBillsMetaToDb from common.process_bill_meta import makeAndSaveTitlesIndex from common.elastic_load import ( refreshIndices, @@ -16,6 +19,9 @@ getInnerResults, ) +from django.conf import settings +from common.constants import BILLMETA_GO_CMD, PATH_TO_BILLS_META_GO + GOVINFO_OPTIONS = { 'collections': 'BILLS', 'bulkdata': 'BILLSTATUS', @@ -30,6 +36,8 @@ def update_bill_task(self): history = UscongressUpdateJob.objects.create(job_id=self.request.id) try: + # Downloads files from Govinfo + # The govinfo.py file is copied from the uscongress repository govinfo.run(GOVINFO_OPTIONS) history.fdsys_status = UscongressUpdateJob.SUCCESS history.save(update_fields=['fdsys_status']) @@ -37,6 +45,8 @@ def update_bill_task(self): history.fdsys_status = UscongressUpdateJob.FAILED history.save(update_fields=['fdsys_status']) try: + # Creates data.json from the downloaded files + # The bills.py file is copied from the uscongress repository processed = bills.run(BILLS_OPTIONS) history.data_status = UscongressUpdateJob.SUCCESS history.saved = processed.get('saved') @@ -47,16 +57,24 @@ def update_bill_task(self): history.save(update_fields=['data_status']) return history.id +def update_bills_meta_go(): + subprocess.run([BILLMETA_GO_CMD, '-p', settings.BASE_DIR]) + saveBillsMetaToDb() @shared_task(bind=True) def bill_data_task(self, pk): bills_meta = dict() history = UscongressUpdateJob.objects.get(pk=pk) try: - for bill_id in history.saved: - bill_congress_type_number, related_dict = update_bills_meta(bill_id) - bills_meta[bill_congress_type_number] = related_dict - saveBillsMeta(bills_meta) + if shutil.which(BILLMETA_GO_CMD) is not None: + update_bills_meta_go() + else: + for bill_id in history.saved: + bill_congress_type_number, related_dict, err = update_bills_meta(bill_id) + if err: + continue + bills_meta[bill_congress_type_number] = related_dict + saveBillsMeta(bills_meta) history.bill_status = UscongressUpdateJob.SUCCESS history.save(update_fields=['bill_status']) except Exception as e: @@ -68,7 +86,11 @@ def bill_data_task(self, pk): def process_bill_meta_task(self, pk): history = UscongressUpdateJob.objects.get(pk=pk) try: - makeAndSaveTitlesIndex() + # The Go version of update_bills_meta includes this task + if shutil.which(BILLMETA_GO_CMD) is not None: + pass + else: + makeAndSaveTitlesIndex() history.meta_status = UscongressUpdateJob.SUCCESS history.save(update_fields=['meta_status']) except Exception as e: @@ -81,7 +103,11 @@ def related_bill_task(self, pk): from common.relatedBills import makeAndSaveRelatedBills history = UscongressUpdateJob.objects.get(pk=pk) try: - makeAndSaveRelatedBills() + # The Go version of update_bills_meta includes this task + if not os.path.exists(PATH_TO_BILLS_META_GO): + makeAndSaveRelatedBills() + else: + pass history.related_status = UscongressUpdateJob.SUCCESS history.save(update_fields=['related_status']) except Exception as e: @@ -91,8 +117,8 @@ def related_bill_task(self, pk): @shared_task(bind=True) def elastic_load_task(self, pk): + history = UscongressUpdateJob.objects.get(pk=pk) try: - history = UscongressUpdateJob.objects.get(pk=pk) created = create_es_index() for bill_id in history.saved: res = es_index_bill(bill_id) @@ -109,8 +135,8 @@ def elastic_load_task(self, pk): @shared_task(bind=True) def bill_similarity_task(self, pk): + history = UscongressUpdateJob.objects.get(pk=pk) try: - history = UscongressUpdateJob.objects.get(pk=pk) for bill_id in history.saved: res = es_similarity_bill(bill_id) history.similarity_status = UscongressUpdateJob.SUCCESS