![Mind Map of Hackathon Project Mashup](images/Mind%20map%20with%20lines.png)




# DATA REQUIREMENTS #

1. Utilize Hathi Trust data APIs including new FEATURES API

2. Mash up Open Syllabi data

3. Mash up Open Library data

4. Mash up Goodreads data


------



# OVERALL PROCESS #

1. First, we ensure we have all Python libraries necessary

2. We utilize a list from a GitHub project using data from Open Syllabus (https://raw.githubusercontent.com/mtdamir/open-syllabus/master/urls.csv) and grab the "Classical" entries.

3. We grab the JSON file from Open Syllabus for each entry

4. We follow the Open Library link found in the Open Syllabus JSON record and search the Open Library JSON record for either "id_hathi_trust" or "isbn"
    1. If there is no HATHI TRUST ID, we grab only the *first* ISBN as proof of concept

5. We query the HATHI TRUST metadata API using either the HATHI TRUST ID or the ISBN from step 4

6. We query the GOODREADS API (deprecated but still working) to get ranking of book if available using GOODREADS link from OPEN SYLLABUS json 

7. We count tokens, sentences, lines, and syllables per volume to calculate Flesch Reading Ease and Flesch–Kincaid Grade Level scores using HATHI TRUST FEATURES json

8. We visualize the academic domains where the volume has appeared using OPEN SYLLABUS json record


------

# OVERALL GOALS #

1. Determine how Hathi Trust books are used in Syllabi for educational purposes

2. Determine what academic domains are using the documents

3. Determine how difficult the documents are to read




In [None]:
!pip install htrc-feature-reader
!pip install syllaby
# !pip install htrc
# !pip install isbnlib

## STEP ONE ##

1. import libraries needed

2. write functions for assisting with processing

3. load TXT file (list of CLASSICS documents from Open Syllabus)
    1. count number of lines in file



In [None]:
# NOTE: the original import order is kept on purpose - the two star imports
# can shadow names, so regrouping could silently change which object a name
# refers to. `time` and `requests` are added because the notebook calls
# time.sleep(...) and requests.get(...) without importing either module.
from lxml import etree
import os
from urllib.request import urlopen
import json
from htrc_features import *
from htrc import workset, metadata
import time
from time import sleep
from tqdm import tqdm
from isbnlib import mask, to_isbn10, is_isbn10, is_isbn13
import re
import requests
import linecache
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import syllapy
from plotnine import *

def get_link(link, timeout=30):
    """ 
    Retrieve online document

    Args:
        link (STR): valid URL
        timeout (FLOAT or TUPLE): seconds to wait for the server before
            giving up; handed unchanged to requests.get

    Return:
        r (OBJ): requests Response object

    Raises:
        requests.exceptions.Timeout: the server did not answer in time
    """
    # requests waits indefinitely when no timeout is given, which would
    # stall the whole harvesting loop on a single unresponsive server.
    return requests.get(link, timeout=timeout)

def get_htid(link):
    """
    Grab htids, year, and title from JSON record

    Args:
        link (STR): link to HATHI TRUST API

    Return:
        A 4-tuple, in this order (callers index it positionally):
        htids (LIST): every 'htid' found under "items"; empty list if none
        year (LIST or 0): 'publishDates' of the first record, 0 if no match
        title (LIST or ""): 'titles' of the first record, "" if no match
        matched (BOOL): did we find the document in the HATHI TRUST dataset
    """
    r = get_link(link)
    data = r.json()

    # A response without "items" (or with an empty list) counts as no match.
    items = data.get("items") or []
    if not items:
        return ([], 0, "", False)

    htids = [item['htid'] for item in items]

    # Only the first record is used; take it directly instead of building
    # a list of every record's value just to read element 0.
    records = data.get("records") or {}
    first_record = next(iter(records.values()), {})
    year = first_record.get('publishDates', 0)
    title = first_record.get('titles', "")

    return (htids, year, title, True)

def get_subject(link):
    """
    Grab subject headings from the MARC-XML embedded in a HATHI TRUST
    metadata API response

    Only the first record of the response is inspected. Within it, the 6XX
    datafields listed in `fields` are scanned, entries whose second
    indicator is "0" are kept, and the text of every subfield with code "a"
    is collected.

    Args:
        link (STR): HATHI TRUST metadata API link

    Return:
        A two-element list [subject_list, subject_no]:
        subject_list (LIST): text of each matching subfield "a"
        subject_no (INT): number of headings in subject_list
    """
    marc_ns = '{http://www.loc.gov/MARC21/slim}'
    fields = ['600', '610', '611', '630', '650', '651', '655']

    r = get_link(link)
    data = r.json()

    # Nothing to parse when the API returned no records.
    records = data.get("records") or {}
    first_record = next(iter(records.values()), None)
    if first_record is None:
        return [[], 0]

    # The encoding declaration is removed before parsing because the XML is
    # passed to lxml as an already-decoded str.
    result = first_record['marc-xml'].replace('encoding="UTF-8"', "")
    root = etree.fromstring(result)

    subject_list = []
    for record in root:
        for field in fields:
            query = f'{marc_ns}datafield[@tag="{field}"]'
            for datafield in record.findall(query):
                # keep only headings flagged with second indicator "0"
                if datafield.get("ind2") != "0":
                    continue
                for subfield in datafield.findall(f'{marc_ns}subfield'):
                    if subfield.get('code') == 'a':
                        subject_list.append(subfield.text)

    # The counter used to be reset to 0 for every record and never
    # incremented; report the number of headings actually collected.
    subject_no = len(subject_list)

    return([subject_list, subject_no])


# Calculate Flesch reading-ease and Flesch–Kincaid grade-level scores
def calculate_readability(num_words, num_sentences, num_syllables):
    """
    Calculate the Flesch reading-ease score and Flesch–Kincaid grade level

    Args:
        num_words (INT): number of tokens for volume
        num_sentences (INT): number of sentences for volume
        num_syllables (INT): number of syllables for volume

    Returns:
        readability_score (FLOAT): Flesch reading-ease score
        grade_level (FLOAT): Flesch–Kincaid grade level score
        Both values are NaN when num_words or num_sentences is not
        positive, because the ratios are undefined for an empty volume.
    """

    # Volumes without any counted words or sentences would otherwise raise
    # ZeroDivisionError and abort a batch run; NaN keeps them identifiable.
    if num_words <= 0 or num_sentences <= 0:
        return float("nan"), float("nan")

    words_per_sentence = num_words / num_sentences
    syllables_per_word = num_syllables / num_words

    readability_score = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
    grade_level = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59

    return readability_score, grade_level


In [None]:
### GET OUR STARTING FILE ###

# Count the lines of the input list. The context manager closes the handle
# as soon as counting is done; the name `file1` stays bound, and calling
# close() on it again later is a harmless no-op.
with open('/content/classics_list.txt', 'r') as file1:
    count = sum(1 for line in file1)
#print(count)

In [None]:
# set up blank variables
data_summary = []
htid_lists = []
subject_lists = []


def _save_json(path, payload):
    """Write payload as JSON to path, creating the parent folder if needed."""
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    with open(path, 'w') as handle:
        json.dump(payload, handle)


# linecache numbers lines from 1, so the final line is number `count`
# (range(1, count) silently skipped it).
for i in tqdm(range(1, count + 1)):

    # Get next line; linecache keeps the trailing newline, which must not
    # end up in the URL or in the derived file name.
    url = linecache.getline('/content/classics_list.txt', i).strip()

    # skip blank lines
    if not url:
        continue

    # base name for the JSON files saved below: sixth "/"-separated piece
    # of the URL with its extension removed
    json_file_name = url.split("/")[5].split(".")[0]

    # open url and store the JSON response (the Open Syllabus record);
    # it stays in data_json and is no longer overwritten further down
    with urlopen(url) as response:
        data_json = json.loads(response.read())

    # sleep because of Open Syllabus API - done right after the request so
    # entries that are skipped below are throttled as well
    sleep(2)

    # grab open library URL; entries without one are skipped
    open_lib_url = (data_json.get('availability') or {}).get('Open Library')
    if not open_lib_url:
        continue

    # change URL to give us JSON record
    new_open_lib_url = open_lib_url.replace('search?', 'search.json?')

    # request Open Library data once and reuse it for saving and lookup
    ol_data = get_link(new_open_lib_url).json()

    # store file
    ol_filename = "".join(['./open_library_json/', json_file_name, '.json'])
    _save_json(ol_filename, ol_data)

    # see what we have (HATHI TRUST ID -or- ISBNS) in the first search hit
    docs = ol_data.get('docs') or []
    first_doc = docs[0] if docs else {}
    rnumbers = first_doc.get('id_hathi_trust') or []
    isbns = first_doc.get('isbn') or []

    # HATHI TRUST ID
    if rnumbers:
        print(rnumbers[0])
        link = "".join(["https://catalog.hathitrust.org/api/volumes/full/recordnumber/",
                        rnumbers[0], ".json"])
    # ISBNS
    elif isbns:
        link = "".join(["https://catalog.hathitrust.org/api/volumes/full/isbn/",
                        isbns[0], ".json"])
    # no identifier: nothing to query (previously the link of the prior
    # entry was reused, or the name was undefined on the first entry)
    else:
        continue

    # Grab htids, year, and title from JSON record using our function
    ht_ids, ht_year, ht_title, ht_matched = get_htid(link)

    # grab and save the HATHI TRUST JSON file
    data = get_link(link).json()
    ht_filename = "".join(['./hathitrust_json/', json_file_name, '.json'])
    _save_json(ht_filename, data)

    # Get SUBJECTS before summarizing, so the summary row describes this
    # entry rather than a leftover from an earlier iteration
    if ht_matched:
        subjects = get_subject(link)
    else:
        subjects = [[], 0]

    # just to summarize our data
    data_summary.append({
        "osurl": url,
        "ht_link": link,
        "ht_matched": ht_matched,
        "htid_count": len(ht_ids),
        "year": ht_year,
        'title': ht_title,
        "subject_no": subjects[1]
    })

    # both lists are empty when nothing matched, so these loops do nothing
    for item in ht_ids:
        htid_lists.append({
            "osurl": url,
            "ht_link": link,
            "htid": item
        })
    for item in subjects[0]:
        subject_lists.append({
            "osurl": url,
            "subject": item
        })

# close our file
file1.close()


### SUMMARIZE WHAT WE'VE ACCOMPLISHED ###
# one pandas dataframe per collected list, starting with the summary rows
data_summary_df = pd.DataFrame(data_summary)

# entries processed, entries matched, htids found, subject rows found
print(len(data_summary))
print(sum(data_summary_df['ht_matched']))
print(len(htid_lists))
print(len(subject_lists))

data_summary_df.head()

htid_df = pd.DataFrame(htid_lists)
htid_df.head()

subject_df = pd.DataFrame(subject_lists)
subject_df.head()

# write every dataframe to its CSV file
for frame, csv_name in (
    (data_summary_df, "data_summary.csv"),
    (htid_df, "htid_list.csv"),
    (subject_df, "subject_list.csv"),
):
    frame.to_csv(csv_name)

## STEP TWO ##

1. Utilize JSON files saved in STEP ONE to analyze our dataset

2. Determine total number of academic domains where each document has appeared and graph

3. Determine readability score and grade level score for all documents

4. Determine subject lists for each book and see how subjects map to academic domains





In [None]:

# BAR CHART: number of syllabi per academic domain (OPEN SYLLABUS API data)
data = data_json['syllabiByField']
keys = [entry['name'] for entry in data]
vals = [entry['count'] for entry in data]

fig = plt.figure(figsize=(10, 5))

# one maroon bar per domain
plt.bar(keys, vals, color='maroon', width=0.4)

plt.xlabel("Domain")
plt.ylabel("No. of syllabi")
plt.title(f"Use of {book_title} across domains in syllabi")

# write each bar's value just above it
for position, height in enumerate(vals):
    plt.text(position, height + 5, str(height))

# domain names are long, so turn the tick labels sideways
plt.xticks(rotation=90)
plt.show()

![Open Syllabus areas for document](images/bar_chart_openSyllabus.png "Open Syllabus areas for document")

In [None]:
#VISUALIZE GOODREADS STAR RANKING#
def draw_stars(n):
    """
    Plot a row of star markers, one per point of the rounded ranking

    Args:
        n (FLOAT): Goodreads score ranking
    """
    star_style = {'marker': '*', 'markersize': 20, 'color': 'magenta'}
    # stars sit on y = 0 at x = 0, 1, 2, ...
    for x_position in range(round(n)):
        plt.plot(x_position, 0, **star_style)


# read the average rating of the first book in the Goodreads response
goodreads_ranking = float(data_goodreads_json['books'][0]['average_rating'])
draw_stars(goodreads_ranking)

# title shows the numeric ranking
plt.title(f"Goodreads Ranking {goodreads_ranking}")
ax = plt.gca()

# no frame box: hide all four spines
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)

# no tick labels on either axis
ax.xaxis.set_tick_params(labelbottom=False)
ax.yaxis.set_tick_params(labelleft=False)

# no tick marks on either axis
ax.set_xticks([])
ax.set_yticks([])

# fixed axes limits so the stars keep the same size and position
plt.xlim(-1, 15)
plt.ylim(-1, 2)

# render the figure
plt.show()

![alt text](images/goodreads_rating.png "Goodreads Rating for document")

In [None]:
# Count all tokens, sentences, lines for each document
# Count syllables for each token occurrence and add them up
# Calculate the Flesch reading-ease and Flesch–Kincaid grade-level scores

syllapy_count, token_count, sentence_count, syllable_count, line_count = 0,0,0,0,0
for page in data_hathi_ef_json['data']['features']['pages']:

    # pages without a body section carry nothing to count
    body = page.get('body')
    if not isinstance(body, dict):
        continue

    # a missing or empty counter contributes 0 instead of raising KeyError
    token_count += int(body.get('tokenCount') or 0)
    sentence_count += int(body.get('sentenceCount') or 0)
    line_count += int(body.get('lineCount') or 0)

    # assumes tokenPosCount maps token -> {part-of-speech tag: count}, as in
    # the HTRC Extracted Features format - TODO confirm against the data.
    # Each token's syllables are weighted by how often it occurs on the
    # page; counting every distinct token once would compare unique-token
    # syllables against the total token count and distort both scores.
    for token, pos_counts in (body.get('tokenPosCount') or {}).items():
        occurrences = sum(pos_counts.values())
        syllapy_count += syllapy.count(token) * occurrences


#print("\n"*4)
#print(token_count)
#print(sentence_count)
#print(line_count)
#print(syllapy_count)

readability, grade_level = calculate_readability(token_count, sentence_count, syllapy_count)        
print(f"READABILITY SCORE: {readability}")
print(f"GRADE LEVEL: {grade_level}")


![alt text](images/ReadingScores.png "Document reading score and grade level")

In [None]:
# Visualize Subjects from Hathi Trust MARC21 record #
subject = pd.read_csv("subject_list.csv")

# De-duplicate on the data columns only. The CSV also holds the saved
# dataframe index as an extra column, which makes every row unique, so a
# plain drop_duplicates() would never remove anything.
subject = subject.drop_duplicates(subset=['osurl', 'subject'])

# remove periods so that e.g. "History." and "History" are counted together
# (plain str.replace avoids the invalid '\.' escape of a non-raw pattern)
subject['subject_new'] = [str(x).replace('.', '') for x in subject['subject']]
subject['subject_new'].value_counts().reset_index().head(20)

![alt text](images/subject_across_all_matched.png "All LCSH for all matched documents")

In [None]:
# plot matched documents by year
# NOTE(review): `data` has to be a dataframe with a "year_new" column at this
# point; nothing visible in this notebook builds it - confirm where it is
# prepared before running this cell.
f1 = (
    # after_stat("count") is the supported spelling of the computed bar
    # height; the older "..count.." string form is deprecated in plotnine
    ggplot(data, aes(x="year_new", y=after_stat("count")))
    + geom_bar(width=1)
    + xlim(1900, 2020)
    + labs(x='Publication Year', y="Counts",
           title="Publication Years of Books Matched to HathiTrust")
    + theme_linedraw()
)
f1
# save through the plot's own method rather than the ggsave() wrapper
f1.save("f1.jpg", dpi=400, width=5, height=3.5)

![alt text](images/f1.jpg "All matched documents across years")