In [None]:
#Application of Frictionless Data principles to the 'Texas_NewspaperTitleList_final' dataset.
#The output is a JSON file describing the newspaper title list CSV file.

#Please download frictionless at frictionlessdata.io/software/#software-toolkit

In [30]:
#import frictionless packages
import frictionless
from frictionless import describe

import pprint
#shared pretty-printer used by every cell below; containers nested deeper than
#MAX_PRINT_DEPTH levels are elided as '...' in the printed output
MAX_PRINT_DEPTH = 4
pp = pprint.PrettyPrinter(depth=MAX_PRINT_DEPTH)

In [31]:
#infer the resource metadata (format, encoding, schema, ...) from the remote CSV, then display it
NP_TITLES_CSV_URL = 'https://raw.githubusercontent.com/unt-libraries/portal-leading/main/data/TX_Newspaper-Titles/TX_Newspaper-Titles.csv'
NP_Titles = describe(NP_TITLES_CSV_URL)

pp.pprint(NP_Titles)

{'encoding': 'utf-8',
 'format': 'csv',
 'hashing': 'md5',
 'name': 'tx_newspaper-titles',
 'path': 'https://raw.githubusercontent.com/unt-libraries/portal-leading/main/data/TX_Newspaper-Titles/TX_Newspaper-Titles.csv',
 'profile': 'tabular-data-resource',
 'schema': {'fields': [{'name': 'title', 'type': 'string'},
                       {'name': 'lccn', 'type': 'string'},
                       {'name': 'start_year', 'type': 'integer'},
                       {'name': 'end_year', 'type': 'integer'},
                       {'name': 'frequency', 'type': 'string'},
                       {'name': 'frequency_normalized', 'type': 'string'},
                       {'name': 'language', 'type': 'string'},
                       {'name': 'county', 'type': 'string'},
                       {'name': 'place', 'type': 'string'},
                       {'name': 'place_of_publication', 'type': 'string'}]},
 'scheme': 'https'}


In [32]:
#applying more detail to each column: a human-readable title and a description per field.
#The details live in one table (field name -> (title, description)) and are applied in a
#single loop, so each field is looked up exactly once and no field can be targeted twice
#or skipped by a copy-paste slip.
FIELD_DETAILS = {
    "title": (
        "Newspaper Title",
        "Titles of newspapers as documented by Chronicling America",
    ),
    "lccn": (
        "LCCN",
        "Library of Congress Control Number",
    ),
    "start_year": (
        "Start Year",
        "Year of first publication",
    ),
    "end_year": (
        "End Year",
        "Year of last publication",
    ),
    "frequency": (
        "Frequency of Publication - Original",
        "Frequency of publication as originally documented by Chronicling America",
    ),
    "frequency_normalized": (
        "Frequency of Publication - Normalized",
        "Frequency of publication manually normalized to include only primary headings (e.g., daily, weekly)",
    ),
    "language": (
        "Language of Publication",
        "Language or languages of publication as originally documented by Chronicling America",
    ),
    "county": (
        "Distribution County or Counties",
        "The Texas county or counties in which each newspaper distributes publications - extracted via excel from original 'place' variable",
    ),
    "place": (
        "Place - Original",
        "Place of distribution as originally documented by Chronicling America",
    ),
    "place_of_publication": (
        "Place of Publication",
        "Place(s) where newspapers are published as originally documented by Chronicling America",
    ),
}

for field_name, (field_title, field_description) in FIELD_DETAILS.items():
    #get_field returns the field object held by the schema, so these assignments
    #update the resource descriptor in place
    field = NP_Titles.schema.get_field(field_name)
    field.title = field_title
    field.description = field_description

pp.pprint(NP_Titles)

{'encoding': 'utf-8',
 'format': 'csv',
 'hashing': 'md5',
 'name': 'tx_newspaper-titles',
 'path': 'https://raw.githubusercontent.com/unt-libraries/portal-leading/main/data/TX_Newspaper-Titles/TX_Newspaper-Titles.csv',
 'profile': 'tabular-data-resource',
 'schema': {'fields': [{'description': 'Titles of newspapers as documented by '
                                       'Chronicaling America',
                        'name': 'title',
                        'title': 'Newspaper Title',
                        'type': 'string'},
                       {'description': 'Library of Congress Control Number',
                        'name': 'lccn',
                        'title': 'LCCN',
                        'type': 'string'},
                       {'description': 'Year of first publication',
                        'name': 'start_year',
                        'title': 'Start Year',
                        'type': 'integer'},
                       {'description': 'Year of last publi

In [33]:
#restrict 'frequency_normalized' to a fixed vocabulary by attaching an 'enum' constraint to the field
frequency_field = NP_Titles.schema.get_field("frequency_normalized")
frequency_field.constraints["enum"] = [
    'Daily',
    'Semiweekly',
    'Weekly',
    'Semimonthly',
    'Monthly',
    'Bimonthly',
    'Quarterly',
    'Annual',
    'Irregular',
    'Missing',
]

pp.pprint(NP_Titles)

{'encoding': 'utf-8',
 'format': 'csv',
 'hashing': 'md5',
 'name': 'tx_newspaper-titles',
 'path': 'https://raw.githubusercontent.com/unt-libraries/portal-leading/main/data/TX_Newspaper-Titles/TX_Newspaper-Titles.csv',
 'profile': 'tabular-data-resource',
 'schema': {'fields': [{'description': 'Titles of newspapers as documented by '
                                       'Chronicaling America',
                        'name': 'title',
                        'title': 'Newspaper Title',
                        'type': 'string'},
                       {'description': 'Library of Congress Control Number',
                        'name': 'lccn',
                        'title': 'LCCN',
                        'type': 'string'},
                       {'description': 'Year of first publication',
                        'name': 'start_year',
                        'title': 'Start Year',
                        'type': 'integer'},
                       {'description': 'Year of last publi

In [34]:
#declare which raw cell values the schema should treat as missing data
#NOTE(review): "1000" and "9999" look like placeholder years for the year columns - confirm against the CSV
missing_value_markers = ["", "1000", "9999", "Missing"]
NP_Titles.schema.missing_values = missing_value_markers
pp.pprint(NP_Titles)

{'encoding': 'utf-8',
 'format': 'csv',
 'hashing': 'md5',
 'name': 'tx_newspaper-titles',
 'path': 'https://raw.githubusercontent.com/unt-libraries/portal-leading/main/data/TX_Newspaper-Titles/TX_Newspaper-Titles.csv',
 'profile': 'tabular-data-resource',
 'schema': {'fields': [{'description': 'Titles of newspapers as documented by '
                                       'Chronicaling America',
                        'name': 'title',
                        'title': 'Newspaper Title',
                        'type': 'string'},
                       {'description': 'Library of Congress Control Number',
                        'name': 'lccn',
                        'title': 'LCCN',
                        'type': 'string'},
                       {'description': 'Year of first publication',
                        'name': 'start_year',
                        'title': 'Start Year',
                        'type': 'integer'},
                       {'description': 'Year of last publi

In [35]:
#add table description (stored as the resource-level 'description' property of the descriptor)
NP_Titles.description = "The titles, lccn, years of activity, frequency and place of distribution, and place of publication of Texas newspapers throughout history as documented by Chronicling America https://chroniclingamerica.loc.gov/"
pp.pprint(NP_Titles)

{'description': 'The titles, lccn, years of activity, frequency and place of '
                'distribution, and place of publication of Texas nespapers '
                'throughout history as documented by Chronicling America '
                'https://chroniclingamerica.loc.gov/',
 'encoding': 'utf-8',
 'format': 'csv',
 'hashing': 'md5',
 'name': 'tx_newspaper-titles',
 'path': 'https://raw.githubusercontent.com/unt-libraries/portal-leading/main/data/TX_Newspaper-Titles/TX_Newspaper-Titles.csv',
 'profile': 'tabular-data-resource',
 'schema': {'fields': [{'description': 'Titles of newspapers as documented by '
                                       'Chronicaling America',
                        'name': 'title',
                        'title': 'Newspaper Title',
                        'type': 'string'},
                       {'description': 'Library of Congress Control Number',
                        'name': 'lccn',
                        'title': 'LCCN',
                    

In [36]:
#write the resource descriptor to a JSON file; the call is left as the last expression
#so the notebook also echoes the JSON text it returns
descriptor_path = "NP_Titles.json"
NP_Titles.to_json(descriptor_path)

'{\n  "path": "https://raw.githubusercontent.com/unt-libraries/portal-leading/main/data/TX_Newspaper-Titles/TX_Newspaper-Titles.csv",\n  "name": "tx_newspaper-titles",\n  "profile": "tabular-data-resource",\n  "scheme": "https",\n  "format": "csv",\n  "hashing": "md5",\n  "encoding": "utf-8",\n  "schema": {\n    "fields": [\n      {\n        "name": "title",\n        "type": "string",\n        "title": "Newspaper Title",\n        "description": "Titles of newspapers as documented by Chronicaling America"\n      },\n      {\n        "name": "lccn",\n        "type": "string",\n        "title": "LCCN",\n        "description": "Library of Congress Control Number"\n      },\n      {\n        "name": "start_year",\n        "type": "integer",\n        "title": "Start Year",\n        "description": "Year of first publication"\n      },\n      {\n        "name": "end_year",\n        "type": "integer",\n        "title": "End Year",\n        "description": "Year of last publication"\n      },\n  

In [37]:
#list the current working directory (presumably to confirm NP_Titles.json was written - TODO confirm)
#NOTE(review): bare 'ls' relies on IPython automagic, and the recorded output is a Windows
#directory listing of a user's home folder - consider clearing that output before sharing
ls

 Volume in drive C has no label.
 Volume Serial Number is 3A43-426E

 Directory of C:\Users\Dream Machine

10/18/2021  09:33 PM    <DIR>          .
10/18/2021  09:33 PM    <DIR>          ..
02/16/2020  06:16 PM    <DIR>          .anaconda
07/18/2021  02:01 PM    <DIR>          .atom
05/25/2021  05:27 PM    <DIR>          .conda
10/17/2021  10:42 PM                60 .condarc
02/16/2020  06:16 PM    <DIR>          .config
09/02/2021  10:07 PM               177 .gitconfig
12/13/2019  10:57 AM    <DIR>          .idlerc
10/18/2021  11:12 AM    <DIR>          .ipynb_checkpoints
02/16/2020  06:16 PM    <DIR>          .ipython
10/18/2021  11:03 AM    <DIR>          .jupyter
10/18/2021  07:45 PM    <DIR>          .librarymanager
02/16/2020  06:16 PM    <DIR>          .matplotlib
05/18/2019  10:54 AM    <DIR>          .Origin
10/17/2021  08:48 PM    <DIR>          .PyCharmCE2019.3
05/18/2019  10:54 AM    <DIR>          .QtWebEngineProcess
02/19/2020  09:40 PM    <DIR>          .spyder-py3
02/09