From c201952ad7fca49c1a6958a1d9f3e83ffd031674 Mon Sep 17 00:00:00 2001 From: Tyler Danstrom Date: Wed, 16 May 2018 13:18:40 -0500 Subject: [PATCH] added OLERecordFinder class and corresponding test of functionality --- README.md | 468 ++++++++++++++++---------------- marcextraction/interfaces.py | 499 +++++++++++++++++++---------------- requirements.txt | 1 + tests/test_spec.py | 246 +++++++++-------- 4 files changed, 640 insertions(+), 574 deletions(-) diff --git a/README.md b/README.md index 2989e9a..209e489 100644 --- a/README.md +++ b/README.md @@ -1,234 +1,234 @@ - -# marcExtraction - -This is a Python library that allows a consumer to extract MARC records from - -1. a file exported to disk -1. a VuFind API - -# Quick start - -1. ```git cone git@github.com:uchicago-library/extract_marc_from_vufind``` -1. ```cd extract_marc_from_vufind``` -1. ```python -m venv venv``` -1. ```source venv/bin/activate``` -1. ```pip install -r requirements.txt``` -1. ```python setup.py develop``` - -And you are ready to start hacking new functionality to the code base. Don't forget to follow good branching and open source citizen etiquette when you're doing it though! - -# How to Use the Library - -To see the field and subfield labels to use when searching for a particular MARC field, follow the instructions below. - -```python ->>> from marcextraction import MarcFieldLookup ->>> mf = MarcFieldLookup ->>> mf.show_valid_lookups() -MARC Field Number: Main Entry - Personal name - Subfield: Personal name - Subfield: Numeration - Subfield: Title and words associated with name - Subfield: Dates associated with name - Subfield: Relator term - Subfield: Date of a work - Subfield: Miscellaneous information - Subfield: Attribution qualifier - Subfield: Form subheading - Subfield: Language of a work - Subfield: Number of part/section of a work - Subfield: Name of part/section work - Subfield: Fuller form of name - Subfield: Title of a work - Subfield: Affiliation - Subfield: Authority record control number or standard number - Subfield: REal World Object URI - Subfield: Relationship - Subfield: Linkage - Subfield: Field link and sequence number -MARC Field Number: Main Entry - Corporate Name - Subfield: Corporate name or jurisdiction name as entry element - Subfield: Subordinate unit - Subfield: Location of meeting - Subfield: Date of meeting or treaty signing - Subfield: Relator term - Subfield: Date of a work - Subfield: Miscellaneous information - Subfield: Language of a work - Subfield: Number of part/section/meeting - Subfield: Name of part/section of a work - Subfield: Title of a work - Subfield: Affiliation - Subfield: Authority record control number or standard number - Subfield: Real World Object URI - Subfield: Linkage - Subfield: Field link and sequence identifier -MARC Field Number: Main Entry - Meeting Name - Subfield: Meeting name or jurisdiction name as entry element - Subfield: Location of meeting - Subfield: Date of meeting or treaty signing - Subfield: Subordinate unit - Subfield: Date of a work - Subfield: Miscellaneous information - Subfield: Form subheading - Subfield: Language of a work - Subfield: Number of part/section/meeting - Subfield: Name of a part/section of a work - Subfield: Name of meeting following jurisdiction name entry element - Subfield: Title of a work - Subfield: Affiliation - Subfield: Authority record control number or standard number - Subfield: Real World Object URI - Subfield: Linkage - Subfield: Field link and sequence number -MARC Field Number: Main Entry - Uniform Title - Subfield: Uniform title - Subfield: Date of treaty signing - Subfield: Date of work - Subfield: Miscellaneous information - Subfield: Medium - Subfield: Form subheading - Subfield: Language of a work - Subfield: Medium of performance for music - Subfield: Number of part/section of a work - Subfield: Arranged statement for music - Subfield: Name of part/section of a work - Subfield: Key for music - Subfield: Version - Subfield: Title of a work - Subfield: Authority record control number or standard number - Subfield: Real World Object URI - Subfield: Linkage - Subfield: Field link and sequence number -MARC Field Number: Abbreviated Title - Subfield: Abbreviated title - Subfield: Qualifying information -MARC Field Number: Key Title - Subfield: Key title - Subfield: Qualifying information - Subfield: Linkage - Subfield: Field lnk and sequence number -MARC Field Number: Uniform Title - Subfield: Uniform title - Subfield: Date of treaty signing - Subfield: Date of work - Subfield: Miscellaneous information - Subfield: Medium - Subfield: Form subheading - Subfield: Language of a work - Subfield: Medium of performance for music - Subfield: Number of part/section of a work - Subfield: Arranged statement for music - Subfield: Name of part/section of a work - Subfield: Key for music - Subfield: Version - Subfield: Authority record control number or standard number - Subfield: Real World Object URI - Subfield: Linkage - Subfield: Field link and sequence number -MARC Field Number: Translation of Titlte by Cataloging Agency - Subfield: Title - Subfield: Remainder of title - Subfield: Statement of responsibility - Subfield: Medium - Subfield: Number of part/section of a work - Subfield: Name of part/section of a work - Subfield: Language of code translated title - Subfield: Linkage - Subfield: Field link and sequence number -MARC Field Number: Collection Uniform Title - Subfield: Uniform title - Subfield: Date of treaty signing - Subfield: Date of a work - Subfield: Miscellaneous information - Subfield: Medium - Subfield: Form subheading - Subfield: Language of a work - Subfield: Medium of performance music - Subfield: Number of part/section of a work - Subfield: Arrange statement for music - Subfield: Name of part/section of a work - Subfield: Key for music - Subfield: Version - Subfield: Linkage - Subfield: Field link and sequence number -MARC Field Number: Title Statement - Subfield: Title - Subfield: Remainder of title - Subfield: Statement of responsibility - Subfield: Inclusive dates - Subfield: Buk dates - Subfield: Medium - Subfield: Form - Subfield: Number of part/section of a work - Subfield: Name of part/section of a work - Subfield: Version - Subfield: Linkage - Subfield: Field link and sequence number -MARC Field Number: Varying Form of Title - Subfield: Title proper/short title - Subfield: Remainder of title - Subfield: Date or sequential designation - Subfield: Miscellaneous information - Subfield: Medium - Subfield: Display text - Subfield: Number of part/section of a work - Subfield: Name of part/section of a work - Subfield: Institution to which field applies - Subfield: Linkage - Subfield: Field link and sequence number -MARC Field Number: Former Title -``` - -In order to get the index field name for Uniform Title: - -```python ->>> from marcextraction.lookup import MarcFieldLookup ->>> mf = MarcFieldLookup("Main Entry - Uniform Title", "Uniform title") ->>> mf.show_index_field() -130a -``` - -If on the other hand you are looking to find some particular subset of a bunch of MARC records that you have on-disk, you can do something like the following. - -```python ->>> from marcextraction.interfaces import OnDiskSearcher ->>> searcher = OnDiskSearcher(location='/path/to/a/bunch/of/marc/record/files') ->>> results = searcher.search('banana', 'Title Statement', 'Title') -``` - -This example will do the following - -1. Instantiate an instance of OnDiskSearcher with a list of valid MARC records at /path/to/a/bunch/of/marc/record/files -1. Perform a search on the MARC records for any record with banana in MARC field '245', subfield 'a'. - -Still, you might be in an organization using OLE. In which case, you could do something like this. - -```python ->>> from marcextraction.interfaces import VuFindSearcher ->>> searcher = SolrIndexSearcher('http://your.domain/path/to/index', create_ole_index_field, create_ole_query) ->>> results = searcher.search('banana', 'Title Statement', 'Title') -``` -This example does the same thing as the earlier example except this time it's searching a SOLR index. - -If you want get the bib numbers for a particular set of results from am OLE index search, you should do the following. - -```python ->>> from marcextraction.interfaces import VuFindSearcher ->>> searcher = SolrIndexSearcher('http://your.domain/path/to/index', create_ole_index_field, create_ole_query) ->>> results = searcher.search('banana', 'Title Statement', 'Title') ->>> results = find_ole_bib_numbers(results) -``` - -## Internal Project Management - -- [Brainstorming document](https://docs.google.com/document/d/18leMBOiPCnQujR2gOBjDCPajI7-t_AzWJxglH34QjFw/edit?usp=sharing) - -## Additional Links - -- [MARC21 Bibliographic Data]()https://www.loc.gov/marc/bibliographic/) for the field and subfield labels to use when looking up a particular field -- [readthedocs documentation](http://extract-marc-from-vufind.readthedocs.io/en/latest/index.html) - -## Author - -- verbalhanglider (tdanstrom@uchicago.edu) + +# marcExtraction + +This is a Python library that allows a consumer to extract MARC records from + +1. a file exported to disk +1. a VuFind API + +# Quick start + +1. ```git cone git@github.com:uchicago-library/extract_marc_from_vufind``` +1. ```cd extract_marc_from_vufind``` +1. ```python -m venv venv``` +1. ```source venv/bin/activate``` +1. ```pip install -r requirements.txt``` +1. ```python setup.py develop``` + +And you are ready to start hacking new functionality to the code base. Don't forget to follow good branching and open source citizen etiquette when you're doing it though! + +# How to Use the Library + +To see the field and subfield labels to use when searching for a particular MARC field, follow the instructions below. + +```python +>>> from marcextraction import MarcFieldLookup +>>> mf = MarcFieldLookup +>>> mf.show_valid_lookups() +MARC Field Number: Main Entry - Personal name + Subfield: Personal name + Subfield: Numeration + Subfield: Title and words associated with name + Subfield: Dates associated with name + Subfield: Relator term + Subfield: Date of a work + Subfield: Miscellaneous information + Subfield: Attribution qualifier + Subfield: Form subheading + Subfield: Language of a work + Subfield: Number of part/section of a work + Subfield: Name of part/section work + Subfield: Fuller form of name + Subfield: Title of a work + Subfield: Affiliation + Subfield: Authority record control number or standard number + Subfield: REal World Object URI + Subfield: Relationship + Subfield: Linkage + Subfield: Field link and sequence number +MARC Field Number: Main Entry - Corporate Name + Subfield: Corporate name or jurisdiction name as entry element + Subfield: Subordinate unit + Subfield: Location of meeting + Subfield: Date of meeting or treaty signing + Subfield: Relator term + Subfield: Date of a work + Subfield: Miscellaneous information + Subfield: Language of a work + Subfield: Number of part/section/meeting + Subfield: Name of part/section of a work + Subfield: Title of a work + Subfield: Affiliation + Subfield: Authority record control number or standard number + Subfield: Real World Object URI + Subfield: Linkage + Subfield: Field link and sequence identifier +MARC Field Number: Main Entry - Meeting Name + Subfield: Meeting name or jurisdiction name as entry element + Subfield: Location of meeting + Subfield: Date of meeting or treaty signing + Subfield: Subordinate unit + Subfield: Date of a work + Subfield: Miscellaneous information + Subfield: Form subheading + Subfield: Language of a work + Subfield: Number of part/section/meeting + Subfield: Name of a part/section of a work + Subfield: Name of meeting following jurisdiction name entry element + Subfield: Title of a work + Subfield: Affiliation + Subfield: Authority record control number or standard number + Subfield: Real World Object URI + Subfield: Linkage + Subfield: Field link and sequence number +MARC Field Number: Main Entry - Uniform Title + Subfield: Uniform title + Subfield: Date of treaty signing + Subfield: Date of work + Subfield: Miscellaneous information + Subfield: Medium + Subfield: Form subheading + Subfield: Language of a work + Subfield: Medium of performance for music + Subfield: Number of part/section of a work + Subfield: Arranged statement for music + Subfield: Name of part/section of a work + Subfield: Key for music + Subfield: Version + Subfield: Title of a work + Subfield: Authority record control number or standard number + Subfield: Real World Object URI + Subfield: Linkage + Subfield: Field link and sequence number +MARC Field Number: Abbreviated Title + Subfield: Abbreviated title + Subfield: Qualifying information +MARC Field Number: Key Title + Subfield: Key title + Subfield: Qualifying information + Subfield: Linkage + Subfield: Field lnk and sequence number +MARC Field Number: Uniform Title + Subfield: Uniform title + Subfield: Date of treaty signing + Subfield: Date of work + Subfield: Miscellaneous information + Subfield: Medium + Subfield: Form subheading + Subfield: Language of a work + Subfield: Medium of performance for music + Subfield: Number of part/section of a work + Subfield: Arranged statement for music + Subfield: Name of part/section of a work + Subfield: Key for music + Subfield: Version + Subfield: Authority record control number or standard number + Subfield: Real World Object URI + Subfield: Linkage + Subfield: Field link and sequence number +MARC Field Number: Translation of Titlte by Cataloging Agency + Subfield: Title + Subfield: Remainder of title + Subfield: Statement of responsibility + Subfield: Medium + Subfield: Number of part/section of a work + Subfield: Name of part/section of a work + Subfield: Language of code translated title + Subfield: Linkage + Subfield: Field link and sequence number +MARC Field Number: Collection Uniform Title + Subfield: Uniform title + Subfield: Date of treaty signing + Subfield: Date of a work + Subfield: Miscellaneous information + Subfield: Medium + Subfield: Form subheading + Subfield: Language of a work + Subfield: Medium of performance music + Subfield: Number of part/section of a work + Subfield: Arrange statement for music + Subfield: Name of part/section of a work + Subfield: Key for music + Subfield: Version + Subfield: Linkage + Subfield: Field link and sequence number +MARC Field Number: Title Statement + Subfield: Title + Subfield: Remainder of title + Subfield: Statement of responsibility + Subfield: Inclusive dates + Subfield: Buk dates + Subfield: Medium + Subfield: Form + Subfield: Number of part/section of a work + Subfield: Name of part/section of a work + Subfield: Version + Subfield: Linkage + Subfield: Field link and sequence number +MARC Field Number: Varying Form of Title + Subfield: Title proper/short title + Subfield: Remainder of title + Subfield: Date or sequential designation + Subfield: Miscellaneous information + Subfield: Medium + Subfield: Display text + Subfield: Number of part/section of a work + Subfield: Name of part/section of a work + Subfield: Institution to which field applies + Subfield: Linkage + Subfield: Field link and sequence number +MARC Field Number: Former Title +``` + +In order to get the index field name for Uniform Title: + +```python +>>> from marcextraction.lookup import MarcFieldLookup +>>> mf = MarcFieldLookup("Main Entry - Uniform Title", "Uniform title") +>>> mf.show_index_field() +130a +``` + +If on the other hand you are looking to find some particular subset of a bunch of MARC records that you have on-disk, you can do something like the following. + +```python +>>> from marcextraction.interfaces import OnDiskSearcher +>>> searcher = OnDiskSearcher(location='/path/to/a/bunch/of/marc/record/files') +>>> results = searcher.search('banana', 'Title Statement', 'Title') +``` + +This example will do the following + +1. Instantiate an instance of OnDiskSearcher with a list of valid MARC records at /path/to/a/bunch/of/marc/record/files +1. Perform a search on the MARC records for any record with banana in MARC field '245', subfield 'a'. + +Still, you might be in an organization using OLE. In which case, you could do something like this. + +```python +>>> from marcextraction.interfaces import VuFindSearcher +>>> searcher = SolrIndexSearcher('http://your.domain/path/to/index', create_ole_index_field, create_ole_query) +>>> results = searcher.search('banana', 'Title Statement', 'Title') +``` +This example does the same thing as the earlier example except this time it's searching a SOLR index. + +If you want get the bib numbers for a particular set of results from am OLE index search, you should do the following. + +```python +>>> from marcextraction.interfaces import VuFindSearcher +>>> searcher = SolrIndexSearcher('http://your.domain/path/to/index', create_ole_index_field, create_ole_query) +>>> results = searcher.search('banana', 'Title Statement', 'Title') +>>> results = find_ole_bib_numbers(results) +``` + +## Internal Project Management + +- [Brainstorming document](https://docs.google.com/document/d/18leMBOiPCnQujR2gOBjDCPajI7-t_AzWJxglH34QjFw/edit?usp=sharing) + +## Additional Links + +- [MARC21 Bibliographic Data]()https://www.loc.gov/marc/bibliographic/) for the field and subfield labels to use when looking up a particular field +- [readthedocs documentation](http://extract-marc-from-vufind.readthedocs.io/en/latest/index.html) + +## Author + +- verbalhanglider (tdanstrom@uchicago.edu) diff --git a/marcextraction/interfaces.py b/marcextraction/interfaces.py index 20da05a..a60efb0 100644 --- a/marcextraction/interfaces.py +++ b/marcextraction/interfaces.py @@ -1,224 +1,275 @@ -"""the interface classes to allow for building a list of records and/or searching for relevant records -""" - -from abc import ABCMeta, abstractclassmethod, abstractmethod, abstractproperty -from io import BytesIO -from os import scandir, stat -from os.path import exists, isfile, isdir -from pymarc import MARCReader -from pymarc.exceptions import RecordLengthInvalid -from pysolr import Solr -from requests import get -from requests.exceptions import ConnectionError - -from .constants import LOOKUP -from .lookup import MarcFieldLookup -from .utils import create_ole_index_field, create_ole_query - -class SolrIndexSearcher: - """a class to be used to search a Solr index for a query - """ - def __init__(self, index_url, index_type): - """initializes an instance of the class SolrIndexSearcher - - Args: - index_url (str): the URL to the SOLR index that will be queried. - index_type (str): a flag indicating which type of index is being used. - Needed for being able to generate the correct index field name - - """ - try: # have to check if index_url inputted is a resolveable URL - get(index_url, "head") - self.solr_index = Solr(index_url) - self.index_url = self.solr_index.url - self.field_creator = self._build_field_definer(index_type) - self.query_creator = self._set_query_creator(index_type) - except ConnectionError: - pass # not sure what to do if ConnectionError does happen. - # it's a deal breaker. should be logged somehow. - - def _build_field_definer(self, flag): - """a private method to build the field definition - - Args - flag (str): an indicate of what the field is in the index - """ - if flag == 'ole': - return create_ole_index_field - else: - raise ValueError("invalid index_type {}".format(flag)) - - def _set_query_creator(self, flag): - """a private method to set the query_creator function of the instance - - Args - flag (str): an indicato of what kind of query construction needs to be done - """ - if flag == 'ole': - return create_ole_query - else: - raise ValueError("invalid index type '{}' for query creation".format(flag)) - - def search(self, query_term, query_field, query_subfield): - """a method to run a search on the index for a particular value in a particular field - - Args: - query_term (str): the string to be searched. This string will be stemmed in Solr searches. - query_field: the label for the MARC21 field that you want to search in. - query_subfield: the label for the relevant subfield of the MARC21 field that you want to search. - - Returns: - list. An iterable containing dictionaries for each matching record in the Solr index - for the query_term, query_field, and query_subfield. - """ - query = None - result = [] - field_name = MarcFieldLookup( - query_field, query_subfield).show_index_field() - if field_name: - query = self.query_creator( - self.field_creator(field_name), query_term) - result = self.solr_index.search(q=query) - else: - query = query_term - result = self.solr_index.search(query) - if result.hits > 0: - self.total = result.hits - print(self.total) - self.records = [x for x in result] - return self.records - else: - return [] - - -class OnDiskSearcher: - """a class to use for building up a list of exported MARC files at a particular location on-disk - - Useage: - searcher = OnDiskSeacher(location='/path/to/marc/records') - searcher.search('Cartographic Mathematical Data', 'Spatial coordinates') - - - """ - def __init__(self, writeable_object=None, location=None): - if location and exists(location): - self.records = self._build_list_of_records(location) - self.total = len(self.records) - elif writeable_object: - validity, records = self._check_if_real_marc_record( - writeable_object.read()) - self.records = records if validity else [] - self.total = len(records) if validity else 0 - self.errors = [] - - def _check_if_real_marc_record(self, some_bytes): - """a method to check of a chunk of bytes is in fact a MARC record - - Returns a tuple, first element is True|False evaluating whether it was a MARC record and - second element is either None or a list of MARC records as dictionaries if first element is True - - :param some_bytes: a chunk of binary data - - :rtype tuple - """ - try: - with BytesIO(some_bytes) as read_file: - reader = MARCReader(read_file) - return (True, [record for record in reader]) - except RecordLengthInvalid: - msg = "not a valid MARC record" - self.errors.append(msg) - return (False, None) - - def count(self): - """a method to return the total number of records extracted - - Returns: - int. total records found on-disk - """ - return self.total - - def _find_marc_files(self, path): - """a generator function to return a list of valid MARC records found from a particular location on-disk - - Args: - path (str): a location on disk to a file or a directory - - Returns: - generator. an interable containing MARC record objects - """ - for n_thing in scandir(path): - if n_thing.is_dir(): - yield from self._find_marc_files(n_thing.path) - elif n_thing.is_file(): - bytes_file = open(n_thing.path, 'rb') - bytes_data = bytes_file.read() - bytes_file.close() - validity, data_package = self._check_if_real_marc_record( - bytes_data) - if validity: - yield data_package - - def _build_list_of_records(self, path_on_disk): - """a method to get a list of MARC records transformed to dictionaries to allow for searching - - Args: - path_on_disk (str): a particular location on-disk - - Returns: - list. an iterable containing dictionaries representing MARC records - """ - records = [] - if isdir(path_on_disk): - for n_package in self._find_marc_files(path_on_disk): - records += [x.as_dict() for x in n_package] - elif isfile(path_on_disk): - bytes_file = open(path_on_disk, 'rb') - bytes_data = bytes_file.close() - bytes_file.close() - validity, data_package = self._check_if_real_marc_record( - bytes_data) - if validity: - records += [record.as_dict() for record in data_package] - return records - - def search(self, query_term, query_field, query_subfield): - """a method to search for records matching query term and field lookup - - Args: - query term (str): a word or phrase that should be present in relevant MARC records - query_field (str): a valid MARC field label - query_subfield (str): a valid MARC subfield label for a subfield associated with the MARC field entered - - Returns: - list. an iterable contianing dicitonaries - - :rtype list - """ - output = [] - field_name = MarcFieldLookup( - query_field, query_subfield).show_index_field() - if field_name: - counter = 0 - for record in self.records: - counter += 1 - for field in record.get("fields"): - if field.get(field_name[0:3]): - subfields = field.get(field_name[0:3]).get("subfields") - for subfield in subfields: - if subfield.get(field_name[-1]): - if query_term in subfield.get(field_name[-1]): - output.append(record) - return output - - @classmethod - def from_flo(cls, flo): - """a method to instantiate an instance of OnDiskExtractor from a file-like object - - Args: - flo (File Object): a file-like object with read, write methods - - Returns: - OnDiskSearcher - """ - return cls(writeable_object=flo) +"""the interface classes to allow for building a list of records and/or searching for relevant records +""" + +from abc import ABCMeta, abstractclassmethod, abstractmethod, abstractproperty +from io import BytesIO +from lxml.etree import XMLParser, XML, tostring as XML_to_string +from os import scandir, stat +from os.path import exists, isfile, isdir +from pymarc import MARCReader +from pymarc.exceptions import RecordLengthInvalid +from pysolr import Solr +from requests import get +from requests.exceptions import ConnectionError +from urllib.parse import ParseResult, quote, unquote +from xml.etree import ElementTree + +from .constants import LOOKUP +from .lookup import MarcFieldLookup +from .utils import create_ole_index_field, create_ole_query + +class SolrIndexSearcher: + """a class to be used to search a Solr index for a query + """ + def __init__(self, index_url, index_type): + """initializes an instance of the class SolrIndexSearcher + + Args: + index_url (str): the URL to the SOLR index that will be queried. + index_type (str): a flag indicating which type of index is being used. + Needed for being able to generate the correct index field name + + """ + try: # have to check if index_url inputted is a resolveable URL + get(index_url, "head") + self.solr_index = Solr(index_url) + self.index_url = self.solr_index.url + self.field_creator = self._build_field_definer(index_type) + self.query_creator = self._set_query_creator(index_type) + except ConnectionError: + pass # not sure what to do if ConnectionError does happen. + # it's a deal breaker. should be logged somehow. + + def _build_field_definer(self, flag): + """a private method to build the field definition + + Args + flag (str): an indicate of what the field is in the index + """ + if flag == 'ole': + return create_ole_index_field + else: + raise ValueError("invalid index_type {}".format(flag)) + + def _set_query_creator(self, flag): + """a private method to set the query_creator function of the instance + + Args + flag (str): an indicato of what kind of query construction needs to be done + """ + if flag == 'ole': + return create_ole_query + else: + raise ValueError("invalid index type '{}' for query creation".format(flag)) + + def search(self, query_term, query_field, query_subfield): + """a method to run a search on the index for a particular value in a particular field + + Args: + query_term (str): the string to be searched. This string will be stemmed in Solr searches. + query_field: the label for the MARC21 field that you want to search in. + query_subfield: the label for the relevant subfield of the MARC21 field that you want to search. + + Returns: + list. An iterable containing dictionaries for each matching record in the Solr index + for the query_term, query_field, and query_subfield. + """ + query = None + result = [] + field_name = MarcFieldLookup( + query_field, query_subfield).show_index_field() + if field_name: + query = self.query_creator( + self.field_creator(field_name), query_term) + result = self.solr_index.search(q=query) + else: + query = query_term + result = self.solr_index.search(query) + if result.hits > 0: + self.total = result.hits + print(self.total) + self.records = [x for x in result] + return self.records + else: + return [] + + +class OnDiskSearcher: + """a class to use for building up a list of exported MARC files at a particular location on-disk + + Useage: + searcher = OnDiskSeacher(location='/path/to/marc/records') + searcher.search('Cartographic Mathematical Data', 'Spatial coordinates') + + + """ + def __init__(self, writeable_object=None, location=None): + if location and exists(location): + self.records = self._build_list_of_records(location) + self.total = len(self.records) + elif writeable_object: + validity, records = self._check_if_real_marc_record( + writeable_object.read()) + self.records = records if validity else [] + self.total = len(records) if validity else 0 + self.errors = [] + + def _check_if_real_marc_record(self, some_bytes): + """a method to check of a chunk of bytes is in fact a MARC record + + Returns a tuple, first element is True|False evaluating whether it was a MARC record and + second element is either None or a list of MARC records as dictionaries if first element is True + + :param some_bytes: a chunk of binary data + + :rtype tuple + """ + try: + with BytesIO(some_bytes) as read_file: + reader = MARCReader(read_file) + return (True, [record for record in reader]) + except RecordLengthInvalid: + msg = "not a valid MARC record" + self.errors.append(msg) + return (False, None) + + def count(self): + """a method to return the total number of records extracted + + Returns: + int. total records found on-disk + """ + return self.total + + def _find_marc_files(self, path): + """a generator function to return a list of valid MARC records found from a particular location on-disk + + Args: + path (str): a location on disk to a file or a directory + + Returns: + generator. an interable containing MARC record objects + """ + for n_thing in scandir(path): + if n_thing.is_dir(): + yield from self._find_marc_files(n_thing.path) + elif n_thing.is_file(): + bytes_file = open(n_thing.path, 'rb') + bytes_data = bytes_file.read() + bytes_file.close() + validity, data_package = self._check_if_real_marc_record( + bytes_data) + if validity: + yield data_package + + def _build_list_of_records(self, path_on_disk): + """a method to get a list of MARC records transformed to dictionaries to allow for searching + + Args: + path_on_disk (str): a particular location on-disk + + Returns: + list. an iterable containing dictionaries representing MARC records + """ + records = [] + if isdir(path_on_disk): + for n_package in self._find_marc_files(path_on_disk): + records += [x.as_dict() for x in n_package] + elif isfile(path_on_disk): + bytes_file = open(path_on_disk, 'rb') + bytes_data = bytes_file.close() + bytes_file.close() + validity, data_package = self._check_if_real_marc_record( + bytes_data) + if validity: + records += [record.as_dict() for record in data_package] + return records + + def search(self, query_term, query_field, query_subfield): + """a method to search for records matching query term and field lookup + + Args: + query term (str): a word or phrase that should be present in relevant MARC records + query_field (str): a valid MARC field label + query_subfield (str): a valid MARC subfield label for a subfield associated with the MARC field entered + + Returns: + list. an iterable contianing dicitonaries + + :rtype list + """ + output = [] + field_name = MarcFieldLookup( + query_field, query_subfield).show_index_field() + if field_name: + counter = 0 + for record in self.records: + counter += 1 + for field in record.get("fields"): + if field.get(field_name[0:3]): + subfields = field.get(field_name[0:3]).get("subfields") + for subfield in subfields: + if subfield.get(field_name[-1]): + if query_term in subfield.get(field_name[-1]): + output.append(record) + return output + + @classmethod + def from_flo(cls, flo): + """a method to instantiate an instance of OnDiskExtractor from a file-like object + + Args: + flo (File Object): a file-like object with read, write methods + + Returns: + OnDiskSearcher + """ + return cls(writeable_object=flo) + +class OLERecordFinder: + """a class to use for finding a particular MARC record from the OLE API + + Useage: + finder = OLERecordFinder("1003495521", "https://example.com/oledocstore") + is_it_there, data = finder.get_record() + if is_it_there: + return data + """ + def __init__(self, bibnumber, ole_domain, ole_scheme, ole_path): + self.identifier = bibnumber + self.record = self._find_record(ole_domain, ole_scheme, ole_path, bibnumber) + + def _find_record(self, ole_domain, ole_scheme, ole_path, bibnumber): + query_param_value = quote("id={}".format(self.identifier)) + query_string = "version=1.2&operation=searchRetrieve&query={}&startRecord=1&maximumRecords=1".format( + query_param_value) + url_object = ParseResult(scheme=ole_scheme, netloc=ole_domain, + path=ole_path, query=query_string, params="", fragment="") + url = url_object.geturl() + data = get(url) + parser = XMLParser(remove_blank_text=True) + if data.status_code == 200: + # handy code for cleaning up newlines and spaces from XML output taken from + # https://stackoverflow.com/questions/3310614/remove-whitespaces-in-xml-string + + xml_doc = ElementTree.fromstring(data.content) + found_records = xml_doc.findall("{http://www.loc.gov/zing/srw/}records/{http://www.loc.gov/zing/srw/}record/{http://www.loc.gov/zing/srw/}recordData/record") + found_records = [ElementTree.tostring(x) for x in found_records] + found_records = [XML_to_string(XML(x, parser=parser)) for x in found_records] + if len(found_records) > 1 or len(found_records) == 0: + raise ValueError("something went wrong: there are multiple records for {}".format(bibnumber)) + else: + return found_records[0] + else: + return None + + def get_record(self): + """a public method to get the matching record (if one was found for the inputted bibnumber) + + Returns: + tuple. first element is boolean result + """ + if self.record: + return (True, self.record) + else: + return (False, None) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 9c7eb9f..38fdd75 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +lxml pysolr pymarc requests \ No newline at end of file diff --git a/tests/test_spec.py b/tests/test_spec.py index 2e8622e..083d7db 100644 --- a/tests/test_spec.py +++ b/tests/test_spec.py @@ -1,116 +1,130 @@ - -from os import remove, rmdir, getlogin, listdir -from os.path import join -from pymarc import Record, Field -import unittest -from six import BytesIO -from tempfile import TemporaryFile, TemporaryDirectory - -from marcextraction.interfaces import SolrIndexSearcher, OnDiskSearcher -from marcextraction.lookup import MarcFieldLookup -from marcextraction.utils import create_ole_index_field, create_ole_query - - -class Tests(unittest.TestCase): - def setUp(self): - pass - - def testLookupMainEntryPersonalNameIndexField(self): - query = MarcFieldLookup("Main Entry - Personal name", "Personal name") - self.assertEqual(query.show_index_field(), '100a') - - def testLookupCartographicMathDataCoordinates(self): - query = MarcFieldLookup( - "Cartographic Mathematical Data", "Statement of coordinates") - self.assertEqual(query.show_index_field(), '255c') - - def testExtractFromWriteableObject(self): - """ - tests whether the following happens - a.) the extractory builds the right extractor - b.) the built extractor is able to extract the data from a file like object - c.) the right available methods gets returned for retrieving the data - """ - - record = Record() - record.add_field(Field(tag='245', indicators=['0', '1'], - subfields=[ - 'a', 'Test book :', - 'b', 'a simple test object /', - 'c', 'John Doe']) - ) - flo = BytesIO(record.as_marc()) - extracted_data = OnDiskSearcher(writeable_object=flo) - self.assertEqual(extracted_data.count(), 1) - - def testExtractFromDirectoryLocation(self): - tempdir = TemporaryDirectory() - - record1 = Record() - record1.add_field(Field(tag='245', indicators=['0', '1'], - subfields=[ - 'a', 'Test book :', - 'b', 'a simple test object /', - 'c', 'John Doe']) - ) - - record2 = Record() - - record2.add_field(Field(tag='245', indicators=['0', '1'], - subfields=[ - 'a', 'Another test book :', - 'b', 'a second test object /', - 'c', 'Jane Doe']) - ) - with open(join(tempdir.name, 'file1.mrc'), 'wb') as write_file: - write_file.write(record1.as_marc()) - write_file.seek(0) - with open(join(tempdir.name, 'file2.mrc'), 'wb') as write_file: - write_file.write(record2.as_marc()) - write_file.seek(0) - searcher = OnDiskSearcher(location=tempdir.name) - self.assertEqual(searcher.count(), 2) - - def testCreatingSolrIndexSearcher(self): - searcher = SolrIndexSearcher( - "http://olereport02.uchicago.edu:8180/solr/bib/", create_ole_index_field, create_ole_query) - self.assertEqual(searcher.index_url, - 'http://olereport02.uchicago.edu:8180/solr/bib/') - - def testSearchingOnDiscRecords(self): - tempdir = TemporaryDirectory() - - record1 = Record() - record1.add_field(Field(tag='245', indicators=['0', '1'], - subfields=[ - 'a', 'Test book :', - 'b', 'a simple test object /', - 'c', 'John Doe']) - ) - - record2 = Record() - - record2.add_field(Field(tag='245', indicators=['0', '1'], - subfields=[ - 'a', 'Another test book :', - 'b', 'a second test object /', - 'c', 'Jane Doe']) - ) - with open(join(tempdir.name, 'file1.mrc'), 'wb') as write_file: - write_file.write(record1.as_marc()) - write_file.seek(0) - with open(join(tempdir.name, 'file2.mrc'), 'wb') as write_file: - write_file.write(record2.as_marc()) - write_file.seek(0) - - searcher = OnDiskSearcher(location=tempdir.name) - result = searcher.search( - 'John', 'Title Statement', 'Statement of responsibility, etc.') - tempdir.cleanup() - self.assertEqual(len(result), 1) - - def testSearchingVuFind(self): - searcher = SolrIndexSearcher( - "http://olereport02.uchicago.edu:8180/solr/bib/", create_ole_index_field, create_ole_query) - results = searcher.search('Banana', "Title Statement", "Title") - self.assertEqual(len(results), 10) + +from os import remove, rmdir, getlogin, listdir, environ +from os.path import join +from pymarc import Record, Field +import unittest +from six import BytesIO +from tempfile import TemporaryFile, TemporaryDirectory +from urllib.parse import urlparse + +from marcextraction.interfaces import SolrIndexSearcher, OnDiskSearcher, OLERecordFinder +from marcextraction.lookup import MarcFieldLookup +from marcextraction.utils import create_ole_index_field, create_ole_query + +# in order to run tests need to run locally from a computer on the uchicago library subnet to test against library OLE indexes +# in linux/unix issue the following command +# SOLR_INDEX="[uchicago solr index]" OLE_INDEX="[uchicago sru api]" pytest tests/ + +SOLR_INDEX = environ["SOLR_INDEX"] +OLE_INDEX = environ["OLE_INDEX"] + +class Tests(unittest.TestCase): + def setUp(self): + pass + + def testLookupMainEntryPersonalNameIndexField(self): + query = MarcFieldLookup("Main Entry - Personal name", "Personal name") + self.assertEqual(query.show_index_field(), '100a') + + def testLookupCartographicMathDataCoordinates(self): + query = MarcFieldLookup( + "Cartographic Mathematical Data", "Statement of coordinates") + self.assertEqual(query.show_index_field(), '255c') + + def testExtractFromWriteableObject(self): + """ + tests whether the following happens + a.) the extractory builds the right extractor + b.) the built extractor is able to extract the data from a file like object + c.) the right available methods gets returned for retrieving the data + """ + + record = Record() + record.add_field(Field(tag='245', indicators=['0', '1'], + subfields=[ + 'a', 'Test book :', + 'b', 'a simple test object /', + 'c', 'John Doe']) + ) + flo = BytesIO(record.as_marc()) + extracted_data = OnDiskSearcher(writeable_object=flo) + self.assertEqual(extracted_data.count(), 1) + + def testExtractFromDirectoryLocation(self): + tempdir = TemporaryDirectory() + + record1 = Record() + record1.add_field(Field(tag='245', indicators=['0', '1'], + subfields=[ + 'a', 'Test book :', + 'b', 'a simple test object /', + 'c', 'John Doe']) + ) + + record2 = Record() + + record2.add_field(Field(tag='245', indicators=['0', '1'], + subfields=[ + 'a', 'Another test book :', + 'b', 'a second test object /', + 'c', 'Jane Doe']) + ) + with open(join(tempdir.name, 'file1.mrc'), 'wb') as write_file: + write_file.write(record1.as_marc()) + write_file.seek(0) + with open(join(tempdir.name, 'file2.mrc'), 'wb') as write_file: + write_file.write(record2.as_marc()) + write_file.seek(0) + searcher = OnDiskSearcher(location=tempdir.name) + self.assertEqual(searcher.count(), 2) + + def testCreatingSolrIndexSearcher(self): + searcher = SolrIndexSearcher( + SOLR_INDEX, "ole") + self.assertEqual(searcher.index_url, + SOLR_INDEX) + + def testSearchingOnDiscRecords(self): + tempdir = TemporaryDirectory() + + record1 = Record() + record1.add_field(Field(tag='245', indicators=['0', '1'], + subfields=[ + 'a', 'Test book :', + 'b', 'a simple test object /', + 'c', 'John Doe']) + ) + + record2 = Record() + + record2.add_field(Field(tag='245', indicators=['0', '1'], + subfields=[ + 'a', 'Another test book :', + 'b', 'a second test object /', + 'c', 'Jane Doe']) + ) + with open(join(tempdir.name, 'file1.mrc'), 'wb') as write_file: + write_file.write(record1.as_marc()) + write_file.seek(0) + with open(join(tempdir.name, 'file2.mrc'), 'wb') as write_file: + write_file.write(record2.as_marc()) + write_file.seek(0) + + searcher = OnDiskSearcher(location=tempdir.name) + result = searcher.search( + 'John', 'Title Statement', 'Statement of responsibility, etc.') + tempdir.cleanup() + self.assertEqual(len(result), 1) + + def testSearchingVuFind(self): + searcher = SolrIndexSearcher( + SOLR_INDEX, 'ole') + results = searcher.search('Banana', "Title Statement", "Title") + self.assertEqual(len(results), 10) + + def testSearchingOleIndex(self): + url_object = urlparse(OLE_INDEX) + finder = OLERecordFinder("4270571", url_object.netloc, url_object.scheme, url_object.path) + check = finder.get_record() + self.assertEqual(check[0], True) + \ No newline at end of file