Here are the bash commands I used to pull the relevant data out of the large dump files. The resulting author data file seems too small. Not sure what happened with that.
```
# Collect the unique author IDs referenced by the AI works.
# `sort -u` is used because plain `uniq` only collapses *adjacent* duplicates.
grep -Po 'OL[0-9]+A' artificial_intelligence_works.txt | sort -u > ai-authors-id.txt
# Look the IDs up in the authors dump, reading the patterns from the ID file
# written by the previous command (ai-authors-id.txt, not ai-authors.txt).
# Searching the editions dump here is the likely reason the author file came
# out too small. Assumes the authors dump is saved as
# ol_dump_authors_2019-01-31.txt -- adjust if it is named differently.
# -F treats each ID as a literal string rather than a regex.
grep -F -f ai-authors-id.txt ol_dump_authors_2019-01-31.txt > ai-authors.tsv
# Collect the unique work IDs and pull every edition line that mentions one.
grep -Po 'OL[0-9]+W' artificial_intelligence_works.txt | sort -u > ai-works-id.txt
grep -F -f ai-works-id.txt ol_dump_editions_2019-01-31.txt > ai-editions.tsv
```

## Overview
Data source: https://openlibrary.org/developers/dumps
The data dumps from openlibrary.org are huge, so to make the project manageable we limited the content to books on artificial intelligence. To accomplish this, all three dump files (ol_dump_works_2019_01-31.txt, ol_dump_authors_2019-01-31.txt, and ol_dump_editions_2019-01-31.txt) were downloaded from openlibrary.org, then processed as follows:

```
# Keep only the works whose record mentions "artificial intelligence"
# (case-insensitive).
grep -i 'artificial intelligence' ol_dump_works_2019_01-31.txt > artificial_intelligence_works.txt
# Collect the unique author IDs referenced by those works.
# `sort -u` is used because plain `uniq` only collapses *adjacent* duplicates.
grep -Po 'OL[0-9]+A' artificial_intelligence_works.txt | sort -u > ai-authors-id.txt
# Look the IDs up in the authors dump, reading the patterns from the ID file
# written by the previous command (ai-authors-id.txt, not ai-authors.txt).
# Assumes the authors dump is saved as ol_dump_authors_2019-01-31.txt --
# adjust if it is named differently.
# -F treats each ID as a literal string rather than a regex.
grep -F -f ai-authors-id.txt ol_dump_authors_2019-01-31.txt > ai-authors.tsv
# Collect the unique work IDs and pull every edition line that mentions one.
grep -Po 'OL[0-9]+W' artificial_intelligence_works.txt | sort -u > ai-works-id.txt
grep -F -f ai-works-id.txt ol_dump_editions_2019-01-31.txt > ai-editions.tsv
```

The code below parses these files and extracts the data needed for the database. 

### Step 1: Process 'works' data
Inputs: 
../data_output/artificial_intelligence_works.txt

Outputs: 
../data_output/works.csv
../data_output/author_works.csv
../data_output/work_subjects.csv
../data_output/subjects.csv

In [None]:
import csv
import json
import pandas as pd
# Step 1: read the filtered 'works' dump and split it into four CSV tables:
#   works.csv          [work_id, title]
#   author_works.csv   [work_id, author_id]
#   work_subjects.csv  [work_id, subject_id]
#   subjects.csv       [subject_id, subject]


def _write_csv(path, rows):
    """Write `rows` to `path` as a fully quoted, UTF-8 encoded CSV file."""
    # newline='' lets the csv module control line endings itself; without it,
    # platforms that translate '\n' on write end up with blank rows.
    with open(path, 'w', encoding='utf-8', newline='') as myfile:
        csv.writer(myfile, quoting=csv.QUOTE_ALL).writerows(rows)


doc = "../data_output/artificial_intelligence_works.txt"
df = pd.read_csv(doc, sep='\t', header=None)

# add column names
df.columns = ['type', 'path', 'revisions', 'timestamp', 'details']

# get 'work_id' from 'path' by dropping its first 7 characters
# (presumably the '/works/' prefix -- confirm against the dump)
df['work_id'] = df.path.str[7:]

# create a list containing only the id and json data
works = df[['work_id', 'details']].values.tolist()

# lists holding the rows that will be saved in the csv files
titles = []        # [work_id, title]
work_authors = []  # [work_id, author_id]
subjects = []      # temporary storage for [work_id, subject]

# Loop through 'works' and pull out the pieces of data we want in the database
for work_id, raw_details in works:
    details = json.loads(raw_details)

    titles.append([work_id, details["title"]])

    # Not all works have authors. A missing or empty list must not stop the
    # subjects of the same work from being collected below.
    for entry in details.get("authors") or []:
        try:
            author_key = entry["author"]["key"]
        except (KeyError, TypeError):
            # author entry without the expected {"author": {"key": ...}} shape
            continue
        # drop the first 9 characters (presumably the '/authors/' prefix)
        work_authors.append([work_id, author_key[9:]])

    # Not all works have subjects either.
    for subject in details.get("subjects") or []:
        subjects.append([work_id, subject])

# Distinct subject strings across all works
unique_subjects = {subject for _, subject in subjects}

# Generate a subject id (1, 2, 3, ... as a string) for each subject, assigned
# in sorted order so the numbering is reproducible between runs.
subject_tbl = [
    [str(n), subject]
    for n, subject in enumerate(sorted(unique_subjects), start=1)
]

# Dictionary to allow subject -> subject id lookups
subject_id = {subject: sid for sid, subject in subject_tbl}

# List of [work_id, subject_id] pairs
work_subject = [[work_id, subject_id[subject]] for work_id, subject in subjects]

_write_csv("../data_output/works.csv", titles)
_write_csv("../data_output/author_works.csv", work_authors)
_write_csv("../data_output/work_subjects.csv", work_subject)
_write_csv("../data_output/subjects.csv", subject_tbl)

### Step 2: Process 'authors' data
Inputs: 
../data_output/ai-authors.tsv

Outputs:
../data_output/authors.csv

In [43]:
# Step 2: read the filtered 'authors' dump and write authors.csv as
# [author_id, name] rows.
doc = "../data_output/ai-authors.tsv"
df = pd.read_csv(doc, sep='\t', header=None)

# add column names
df.columns = ['type', 'path', 'revisions', 'timestamp', 'details']

# get 'author_id' from 'path' by dropping its first 9 characters
# (presumably the '/authors/' prefix -- confirm against the dump)
df['author_id'] = df.path.str[9:]

# create a list of lists containing only the id and json data
authors = df[['author_id', 'details']].values.tolist()

# pair each author id with the "name" field of its JSON details
author_ids = [
    [author_id, json.loads(raw_details)["name"]]
    for author_id, raw_details in authors
]

# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write end up with blank rows.
with open("../data_output/authors.csv", 'w', encoding='utf-8', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerows(author_ids)
       

In [41]:
# NOTE(review): `author_tbl` is not assigned in any cell visible here (the cell
# above builds `author_ids`) -- presumably left over from an earlier run; confirm.
author_tbl

[['OL1031112A', 'Dietmar Zaefferer'],
 ['OL2766532A', 'Patrick Blackburn'],
 ['OL2783933A', 'Terry Dartnall'],
 ['OL325790A',
  'Australian Joint Conference on Artificial Intelligence (8th 1995 Canberra A.C.T.)'],
 ['OL393180A', 'Irwin R. Goodman'],
 ['OL422488A', 'Bremer KI-Pfingstworkshop (2nd 1996 Bremen, Germany)'],
 ['OL4433951A', 'Martyn Thomas Quigley'],
 ['OL444439A', 'Philip J. Hilts'],
 ['OL4865567A', 'James G. Allen'],
 ['OL492713A',
  'DFKI Workshop on Taxonomic Reasoning (1992 Saarbrücken, Germany)'],
 ['OL5254836A',
  'International Conference on Systems Research, Informatics, and Cybernetics (6th 1992 Baden-Baden, Germany)'],
 ['OL580058A', 'Jerry M. Mendel'],
 ['OL6115415A',
  'International Workshop on Temporal Representation and Reasoning (3rd 1996 Key West, Fla.)'],
 ['OL6300983A',
  'International Conference on Cognitive Technology (2nd 1997 Aizu-Wakamatsu City, Japan)'],
 ['OL6956313A', 'Yuri Iserlis'],
 ['OL6993062A', 'V. M. Kureĭchik'],
 ['OL7124964A', 'Allan Ha

### Step 3: Process 'editions' data
Inputs: 
../data_output/ai-editions.tsv

Outputs:


In [92]:
# Step 3: read the filtered 'editions' dump and collect, per edition, the
# linked work id plus a handful of optional fields.
doc = "../data_output/ai-editions.tsv"
df = pd.read_csv(doc, sep='\t', header=None)

# add column names
df.columns = ['type', 'path', 'revisions', 'timestamp', 'details']

# get 'edition_id' from 'path' by dropping its first 7 characters
# (presumably the '/books/' prefix -- confirm against the dump)
df['edition_id'] = df.path.str[7:]

# create a list of lists containing only the id and json data
editions_master = df[['edition_id', 'details']].values.tolist()

edition_work = []  # [edition_id, work_id] pairs
edition_pages = []
edition_isbn10 = []
edition_isbn13 = []
edition_title = []
edition_physical_format = []

# Optional JSON fields and the list each one is collected into as
# [edition_id, value]; an edition contributes a row only when the field exists.
optional_fields = (
    ('number_of_pages', edition_pages),
    ('isbn_10', edition_isbn10),
    ('isbn_13', edition_isbn13),
    ('title', edition_title),
    ('physical_format', edition_physical_format),
)

for edition_id, raw_details in editions_master:
    details = json.loads(raw_details)

    # Link the edition to the first work it lists. An edition with no 'works'
    # entry is skipped here instead of aborting the whole run.
    linked_works = details.get('works') or []
    if linked_works:
        # drop the first 7 characters (presumably the '/works/' prefix)
        edition_work.append([edition_id, linked_works[0]['key'][7:]])

    for field, rows in optional_fields:
        if field in details:
            rows.append([edition_id, details[field]])
        
    
    
#with open("../data_output/edition_work.csv", 'w', encoding='utf-8') as myfile:
#    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#    wr.writerows(edition_work)

In [None]:
# Display the first [edition_id, details] pair from editions_master.
editions_master[0]

In [94]:
# Parse and display the JSON details of the record at index 700 of editions_master.
json.loads(editions_master[700][1])

{'subtitle': '13th International Conference, IPMU 2010, Dortmund, Germany, June 28–July 2, 2010. Proceedings, Part II',
 'links': [{'url': 'http://dx.doi.org/10.1007/978-3-642-14058-7'}],
 'series': ['Communications in Computer and Information Science -- 81'],
 'latest_revision': 1,
 'ocaid': 'informationproce00hlle',
 'contributions': ['Hoffmann, Frank',
  'Kruse, Rudolf',
  'SpringerLink (Online service)'],
 'source_records': ['ia:informationproce00hlle'],
 'title': 'Information Processing and Management of Uncertainty in Knowledge-Based Systems. Applications',
 'languages': [{'key': '/languages/eng'}],
 'subjects': ['Information storage and retrieval systems',
  'Information systems',
  'Computer science',
  'Artificial intelligence',
  'Database management'],
 'location': ['MiU'],
 'by_statement': 'edited by Eyke Hüllermeier, Rudolf Kruse, Frank Hoffmann',
 'type': {'key': '/type/edition'},
 'publish_country': 'gw ',
 'publishers': ['Springer-Verlag Berlin Heidelberg'],
 'physical_