In [22]:
import os 
import re
from stemming.porter2 import stem
import pprint


In [23]:

# Break a book into paragraphs, turn each paragraph into words and consider only 
# those that contain more than 10 words
# and do not include any of the bad_chars 
def filterBook(filepath):   
#     introductions = ['Introduction','INTRODUCTION','Chapter 1','CHAPTER 1','PART I','Part I','ONE','One','PROLOGUE','Prologue']
    bad_chars = ['#','<','>','*','_',':','\n']

    filtered_pars = []
    quotes = []
    with open(filepath, 'r',encoding='utf-8') as read_file:
        
        text = read_file.read()
        paragraphs = text.split('\n\n')
        count=0
        for i,par in enumerate(paragraphs):
            if len(re.findall('\w+', par))>10 and not any(char in par for char in bad_chars):
                filtered_pars.append(par)

        for par in filtered_pars:
            quotes += [quote.strip() for quote in par.split('.')]
        
        print(f"Number of quotes in the book is {len(quotes)}")
    read_file.close()
#     print(quotes)
    return quotes


In [24]:
f=open("englishST.txt","r")
stopwords = [word for line in f for word in line.split()]
stopSet=set(stopwords)
f.close()

In [25]:
#Custom class to repressent quotes
# WIll probably contain more fields i.e a list of positions for a term?
class Quote:
    def __init__(self,id,length):
        self.id = id
        self.length = length
    
    def __str__(self):
        return f'Quote({self.id,self.length})'
    
    def __repr__(self):
        return str(self)


In [26]:

# Preprocessing and update of index

def preprocess(document):
    words = [stem(word.lower()) for word in re.findall('\w+', document) if not word.lower() in stopSet]
    return words

# Given the current index, the book and its quotes, update the collection index
def appendToIndex(index,bookname,quotes):
    # Index structure { term ->[docCount, {bookName -> [Quote]}, termFrequency] }
    for quoteId,quote in enumerate(quotes):
        terms=preprocess(quote)
        for term in terms: 
            if term in index:
                index[term][2] +=1 # update term freq

                # each term has a dictionary with books and quotes 
                if bookname not in index[term][1]:  # if bookname is not registered for this term 
                    index[term][0]+=1 # increase doc freq
                    index[term][1][bookname] = [Quote(quoteId,len(quote))]  # update set with docID
                else: # bookname exist in the dictionary
                    index[term][1][bookname].append(Quote(quoteId,len(quote))) # update the Quote list
                    
            else:
                index[term]=[]
                # doc frequency is 1. 
                index[term].append(1) 
                # The bookDictionary is initially empty. 
                index[term].append({})   
                # Initialize bookname entry 
                index[term][1][bookname] = [Quote(quoteId,len(quote))]
                # term frequency
                index[term].append(1)
    
    return index

In [29]:
def findISBN(filepath):
    f = open(filepath, "r",encoding='utf-8')
    text = f.read()

    # Try first with 'ISBN' included
    regex0 = re.compile("(?:ISBN(?:-1[03])?:? ?)(?:978(?:-|\s)?|979(?:-|\s)?)?(?:[0-9]{1,5}(?:-|\s)?)(?:[0-9]{1,7}(?:-|\s)?)(?:[0-9]{1,6}(?:-|\s)?)(?:[0-9X]{1}(?:-|\s)?)")    
    # Try without 'ISBN' and hyphen seperated
    regex1 = re.compile("(?:978(?:-)?|979(?:-)?)?(?:[0-9]{1,5}(?:-))(?:[0-9]{1,7}(?:-))(?:[0-9]{1,6}(?:-))(?:[0-9X]{1})")
    # Try without 'ISBN' and not seperated (must be of length 10 or 13; if 13, must start with 978 or 979)
    regex2 = re.compile("(?:978|979)?(?:[0-9]{9})(?:[0-9X]{1})")
    # Try without 'ISBN' and space seperated and ISBN-13 ONLY (sacrifice space seperated 10)
    regex3 = re.compile("(?:978\s|979\s)(?:[0-9]{1,5}\s)(?:[0-9]{1,7}\s)(?:[0-9]{1,6}\s)(?:[0-9X]{1})")

    if regex0.search(text):
        match = regex0.search(text).group(0)
        if ":" in match:
            return match.split(":")[1].strip()
        else:
            return match.split("ISBN")[1].strip()
    elif regex1.search(text):
        match = regex1.search(text).group(0)
        return match
    elif regex2.search(text):
        match = regex2.search(text).group(0)
        return match 
    elif regex3.search(text):
        match = regex3.search(text).group(0)
        return match
    return False


In [192]:

def readBooksDirectory():
    directory = r"C:/Users/Erodotos/Desktop/Year 4/TTDS/group-project/Book3"
#     folders = ['7', 'X', 'Y', 'Z']
    folders = ['Z']

    collection_idx = {}
    for folder in folders:
        subdir = directory + '/' + folder
        for filename in os.listdir(subdir):
            if filename.endswith(".txt"):
                filepath = subdir + '/' + filename
                authorIncluded = True if "-" in filename else False
                title = filename if not authorIncluded else filename.split("-")[0].strip()
                author = False if not authorIncluded else filename.split("-")[1].split(".")[0].strip()
                ISBN = findISBN(filepath)
                
                print(filename)
                print(ISBN)

                if ISBN and len(ISBN) >=10:
                    categories = getCategories(ISBN,filename) 
                    print(categories)
                else:
                    print('ERROR: No ISBN FOUND ')
                book_name = f"{title}-{author}"
                book_quotes = filterBook(filepath)
#                 collection_idx = appendToIndex(collection_idx,book_name,book_quotes)
                
                print('-'*100)
                

    
    pprint.pprint(collection_idx)


readBooksDirectory()

Z - Therese Anne Fowler.epub.txt
978-1-250-02865-5
https://www.googleapis.com/books/v1/volumes?q=isbn9781250028655
item idx 1
Title Z: A Novel of Zelda Fitzgerald
Authors ['Therese Fowler']
['Fiction']
Number of quotes in the book is 8915
----------------------------------------------------------------------------------------------------
Z Score Neurofeedback - Thatcher, Robert W., Lubar, Joe.epub.txt
978-0-12-801291-8
https://www.googleapis.com/books/v1/volumes?q=isbn9780128012918
No match for given ISBN!
No match for given ISBN!
[]
Number of quotes in the book is 3369
----------------------------------------------------------------------------------------------------
Zabor - Kamel Daoud.epub.txt
978-2-330-08644-2
https://www.googleapis.com/books/v1/volumes?q=isbn9782330086442
No match for given ISBN!
item idx 0
Title Zabor
Authors ['Kamel Daoud']
['Fiction']
Number of quotes in the book is 1995
------------------------------------------------------------------------------------------

item idx 0
Title Zagreb Noir
Authors ['Ivan Srsen']
['Croatia']
Number of quotes in the book is 4310
----------------------------------------------------------------------------------------------------
Zahav - Michael Solomonov.epub.txt
978-0-544-37328-0
https://www.googleapis.com/books/v1/volumes?q=isbn9780544373280
item idx 0
Title Zahav
Authors ['Michael Solomonov', 'Steven Cook']
['Cooking']
Number of quotes in the book is 4250
----------------------------------------------------------------------------------------------------
Zahn - Triplet.epub.txt
978-1-4532-7207-7
https://www.googleapis.com/books/v1/volumes?q=isbn9781453272077
item idx 0
Title Triplet
Authors ['Timothy Zahn']
['Fiction']
Number of quotes in the book is 9097
----------------------------------------------------------------------------------------------------
Zahn_Conquerors.Heritage.epub.txt
978-0-307-82242-0
https://www.googleapis.com/books/v1/volumes?q=isbn9780307822420
item idx 0
Title Conquerors' Heritage
Aut

Zanthodon Megapack, The - Lin Carter.epub.txt
False
ERROR: No ISBN FOUND 
Number of quotes in the book is 18331
----------------------------------------------------------------------------------------------------
Zapata - John Steinbeck.epub.txt
978-0-14-192351-2
https://www.googleapis.com/books/v1/volumes?q=isbn9780141923512
item idx 0
Title Zapata
Authors ['John Steinbeck']
['Fiction']
Number of quotes in the book is 2398
----------------------------------------------------------------------------------------------------
Zapatos italianos - Henning Mankell.epub.txt
978-84-8383-881-5
https://www.googleapis.com/books/v1/volumes?q=isbn9788483838815
item idx 0
Title Zapatos italianos
Authors ['Henning Mankell']
['Fiction']
Number of quotes in the book is 8143
----------------------------------------------------------------------------------------------------
Zapped - Ann Louise Gittleman (2010).epub.txt
9780062014436
https://www.googleapis.com/books/v1/volumes?q=isbn9780062014436
item id

item idx 0
Title Zen 24/7
Authors ['Philip T. Sudo']
['Religion']
Number of quotes in the book is 1230
----------------------------------------------------------------------------------------------------
Zen and Gone - Emily France.epub.txt
978-1-61695-857-2
https://www.googleapis.com/books/v1/volumes?q=isbn9781616958572
item idx 0
Title Zen and Gone
Authors ['Emily France']
['JUVENILE FICTION']
Number of quotes in the book is 9192
----------------------------------------------------------------------------------------------------
Zen and the Art of Creating Esc - Paula Y_.epub.txt
False
ERROR: No ISBN FOUND 
Number of quotes in the book is 160
----------------------------------------------------------------------------------------------------
Zen and the Art of Faking It - Jordan Sonnenblick.epub.txt
978-0-439-83707-1
https://www.googleapis.com/books/v1/volumes?q=isbn9780439837071
No match for given ISBN!
No match for given ISBN!
[]
Number of quotes in the book is 2795
---------------

item idx 0
Title Zen Essence
Authors ['Thomas Cleary']
['Religion']
Number of quotes in the book is 1989
----------------------------------------------------------------------------------------------------
Zen food - Laure Kie.epub.txt
9782317011269
https://www.googleapis.com/books/v1/volumes?q=isbn9782317011269
No match for given ISBN!
No match for given ISBN!
[]
Number of quotes in the book is 1246
----------------------------------------------------------------------------------------------------
Zen for Christians - Kim Boykin.epub.txt
9780486824406
https://www.googleapis.com/books/v1/volumes?q=isbn9780486824406
Title Zen for Christians
Authors ['Kim Boykin']
['Religion']
Number of quotes in the book is 1550
----------------------------------------------------------------------------------------------------
Zen Gardens - Mira Locher.epub.txt
978-1-4629-1049-6
https://www.googleapis.com/books/v1/volumes?q=isbn9781462910496
item idx 0
Title Zen Gardens
Authors ['Mira Locher']
['Garde

item idx 0
Title Zen Poems of China and Japan
Authors ['Lucien Stryk']
['Poetry']
Number of quotes in the book is 642
----------------------------------------------------------------------------------------------------
Zen Poetry - Lucien Stryk.epub.txt
978-0-8021-9824-2
https://www.googleapis.com/books/v1/volumes?q=isbn9780802198242
item idx 0
Title Zen Poetry
No authors found!
['Poetry']
Number of quotes in the book is 184
----------------------------------------------------------------------------------------------------
Zen Puppies - Gautama Buddha _ the editors of Mango Media.epub.txt
False
ERROR: No ISBN FOUND 
Number of quotes in the book is 0
----------------------------------------------------------------------------------------------------
Zen Putting - Joseph Parent.epub.txt
978-1-1012-1675-0
https://www.googleapis.com/books/v1/volumes?q=isbn9781101216750
item idx 0
Title Zen Putting
Authors ['Joseph Parent']
['Sports & Recreation']
Number of quotes in the book is 3310
-----

item idx 1
Title Zero Belly Diet
Authors ['David Zinczenko']
['Health & Fitness']
Number of quotes in the book is 2156
----------------------------------------------------------------------------------------------------
Zero Belly Smoothies - David Zinczenko.epub.txt
9780399178443
https://www.googleapis.com/books/v1/volumes?q=isbn9780399178443
Title Zero Belly Smoothies
Authors ['David Zinczenko']
['Health & Fitness']
Number of quotes in the book is 1181
----------------------------------------------------------------------------------------------------
Zero Bomb - M.T Hill.epub.txt
9781789090017
https://www.googleapis.com/books/v1/volumes?q=isbn9781789090017
No match for given ISBN!
No match for given ISBN!
[]
Number of quotes in the book is 5099
----------------------------------------------------------------------------------------------------
Zero Carbon Car - Brian Long.epub.txt
978 1 84797 514 0
https://www.googleapis.com/books/v1/volumes?q=isbn9781847975140
item idx 0
Title Zero

item idx 0
Title Zero to Zillionaire
Authors ['Chellie Campbell']
['Business & Economics']
Number of quotes in the book is 3562
----------------------------------------------------------------------------------------------------
Zero Tolerance - Claudia Mills.epub.txt
978-0-374-33312-6
https://www.googleapis.com/books/v1/volumes?q=isbn9780374333126
item idx 0
Title Zero Tolerance
Authors ['Claudia Mills']
['Juvenile Fiction']
Number of quotes in the book is 4272
----------------------------------------------------------------------------------------------------
Zero Waste Engineering - Khan and Islam (2016).epub.txt
978-1-119-18489-8
https://www.googleapis.com/books/v1/volumes?q=isbn9781119184898
Title Green Energy
Authors ['Suman Lata Tripathi', 'Sanjeevikumar Padmanaban']
['Science']
Number of quotes in the book is 8120
----------------------------------------------------------------------------------------------------
Zero World - Jason M. Hough.epub.txt
978-0-553-39126-8
https://ww

No match for given ISBN!
No match for given ISBN!
[]
Number of quotes in the book is 4575
----------------------------------------------------------------------------------------------------
Zhong Guo Shu Fa Quan Ji (Tao Z - Tang Shu Tong.epub.txt
False
ERROR: No ISBN FOUND 
Number of quotes in the book is 1147
----------------------------------------------------------------------------------------------------
Zhong Hua Ci Yuan (China_s Etymology) - Ya Se.epub.txt
9787510422096
https://www.googleapis.com/books/v1/volumes?q=isbn9787510422096
item idx 0
Title 中华词源
Authors ['雅瑟', '青萍']
['Chinese language']
Number of quotes in the book is 639
----------------------------------------------------------------------------------------------------
Zhong Hua Ying Yang Bai Wei Con - Yuan Tang Xin.epub.txt
False
ERROR: No ISBN FOUND 
Number of quotes in the book is 341
----------------------------------------------------------------------------------------------------
Zhong Yao Wang Guo De Ge Ming 

item idx 0
Title Zigzagging Down a Wild Trail
Authors ['Bobbie Ann Mason']
['Fiction']
Number of quotes in the book is 5523
----------------------------------------------------------------------------------------------------
Zika - Donald G. McNeil.epub.txt
978-0-393-60914-1
https://www.googleapis.com/books/v1/volumes?q=isbn9780393609141
Title Zika: The Emerging Epidemic
Authors ['Donald G. McNeil']
['Science']
Number of quotes in the book is 2907
----------------------------------------------------------------------------------------------------
Zilch - Nancy Lublin.epub.txt
978-1-4081-4615-6
https://www.googleapis.com/books/v1/volumes?q=isbn9781408146156
No match for given ISBN!
No match for given ISBN!
[]
Number of quotes in the book is 3282
----------------------------------------------------------------------------------------------------
Zillow Talk - Spencer Rascoff.epub.txt
978-1-4555-7476-6
https://www.googleapis.com/books/v1/volumes?q=isbn9781455574766
item idx 0
Title Zillow

item idx 0
Title Halo
Authors ['Zizou Corder']
['Juvenile Fiction']
Number of quotes in the book is 7885
----------------------------------------------------------------------------------------------------
Zlata_s Diary - Zlata Filipovic.epub.txt
9781101006979
https://www.googleapis.com/books/v1/volumes?q=isbn9781101006979
item idx 0
Title Zlata's Diary
Authors ['Zlata Filipovic']
['Biography & Autobiography']
Number of quotes in the book is 3106
----------------------------------------------------------------------------------------------------
Zodiac Cracked - Marianne Koerfer.epub.txt
978-0-7414-8092-7
https://www.googleapis.com/books/v1/volumes?q=isbn9780741480927
item idx 0
Title Zodiac Cracked
Authors ['Marianne Koerfer']
['Social Science']
Number of quotes in the book is 971
----------------------------------------------------------------------------------------------------
Zodiac Unmasked - Robert Graysmith.epub.txt
9781440678127
https://www.googleapis.com/books/v1/volumes?q=is

item idx 0
Title Zom-B
Authors ['Darren Shan']
['Young Adult Fiction']
Number of quotes in the book is 3818
----------------------------------------------------------------------------------------------------
ZOM-B Angels.epub.txt
9781443415194
https://www.googleapis.com/books/v1/volumes?q=isbn9781443415194
No match for given ISBN!
No match for given ISBN!
[]
Number of quotes in the book is 3218
----------------------------------------------------------------------------------------------------
ZOM-B Baby - Darren Shan.epub.txt
9781443415224
https://www.googleapis.com/books/v1/volumes?q=isbn9781443415224
No match for given ISBN!
No match for given ISBN!
[]
Number of quotes in the book is 2808
----------------------------------------------------------------------------------------------------
Zom-B Clans - Darren Shan.epub.txt
978-0-316-21430-8
https://www.googleapis.com/books/v1/volumes?q=isbn9780316214308
item idx 0
Title Zom-B Clans
Authors ['Darren Shan']
['Young Adult Fiction']
Num

item idx 0
Title Zombie Queen of Newbury High
Authors ['Amanda Ashby']
['Young Adult Fiction']
Number of quotes in the book is 4042
----------------------------------------------------------------------------------------------------
Zombie Spaceship Wasteland - Patton Oswalt.epub.txt
978-1-4391-4908-9
https://www.googleapis.com/books/v1/volumes?q=isbn9781439149089
No match for given ISBN!
No match for given ISBN!
[]
Number of quotes in the book is 2118
----------------------------------------------------------------------------------------------------
Zombie University - Sinead Murphy.epub.txt
978-1-910924-51-8
https://www.googleapis.com/books/v1/volumes?q=isbn9781910924518
No match for given ISBN!
No match for given ISBN!
[]
Number of quotes in the book is 406
----------------------------------------------------------------------------------------------------
Zombie Versus Fairy Featuring Albinos - James Marshall.epub.txt
978-1-92748-142-7
https://www.googleapis.com/books/v1/volumes?q

item idx 0
Title Zone
Authors ['Guillaume Apollinaire']
['Poetry']
Number of quotes in the book is 674
----------------------------------------------------------------------------------------------------
Zone - Mathias Enard.epub.txt
False
ERROR: No ISBN FOUND 
Number of quotes in the book is 794
----------------------------------------------------------------------------------------------------
Zone Journals - Charles Wright.epub.txt
9781429933568
https://www.googleapis.com/books/v1/volumes?q=isbn9781429933568
item idx 0
Title Zone Journals
Authors ['Charles Wright']
['Poetry']
Number of quotes in the book is 2
----------------------------------------------------------------------------------------------------
Zone Meals in Seconds - Barry Sears.epub.txt
9780061758058
https://www.googleapis.com/books/v1/volumes?q=isbn9780061758058
item idx 0
Title Zone Meals in Seconds
Authors ['Barry Sears']
['Health & Fitness']
Number of quotes in the book is 1955
-----------------------------------

item idx 1
Title Zoot-Suit Murders
Authors ['Thomas Sanchez']
['Fiction']
Number of quotes in the book is 3935
----------------------------------------------------------------------------------------------------
ZooZical - Judy Sierra.epub.txt
978-0-375-98473-0
https://www.googleapis.com/books/v1/volumes?q=isbn9780375984730
item idx 0
Title ZooZical
Authors ['Judy Sierra']
['Juvenile Fiction']
Number of quotes in the book is 0
----------------------------------------------------------------------------------------------------
Zora and Langston_A Story of Friendship and Betrayal (Norton) - Yuval Taylor (retail).epub.txt
9780393243918
https://www.googleapis.com/books/v1/volumes?q=isbn9780393243918
Title Zora and Langston: A Story of Friendship and Betrayal
Authors ['Yuval Taylor']
['Biography & Autobiography']
Number of quotes in the book is 4034
----------------------------------------------------------------------------------------------------
Zora Neale Hurston_ A Life in Letters (Anc

item idx 0
Title A Grain of Truth
Authors ['Zygmunt Miloszewski']
['Fiction']
Number of quotes in the book is 7179
----------------------------------------------------------------------------------------------------
ZZ Top - The Very Best of ZZ Top (Songbook).epub.txt
9781458485298
https://www.googleapis.com/books/v1/volumes?q=isbn9781458485298
item idx 0
Title The Very Best of ZZ Top (Songbook)
Authors ['ZZ Top']
['Music']
Number of quotes in the book is 0
----------------------------------------------------------------------------------------------------
ZZT - Anna Anthropy.epub.txt
978-1-940535-02-9
https://www.googleapis.com/books/v1/volumes?q=isbn9781940535029
item idx 0
Title ZZT
Authors ['Anna Anthropy']
['Computer games']
Number of quotes in the book is 1199
----------------------------------------------------------------------------------------------------
{}


In [None]:
https://www.googleapis.com/books/v1/volumes?q=isbn978-1-5040-4002-0

In [190]:
# importing the requests library 
import requests 

def getCategories(isbn,filename):

    # api-endpoint 
    URL = "https://www.googleapis.com/books/v1/volumes?q=isbn"
    ISBN = isbn.translate({ord(i): None for i in ' -:'}) # remove space, dashes or semicolons from isbn
    categories = []

    # location given here 
    request_url = URL + ISBN
    print(request_url)
    
    # Account for occasional failure of get request even though the book might exist
    for i in range(2):

        # sending get request and saving the response as response object 
        r = requests.get(url = request_url) 

        # extracting data in json format 
        data = r.json()
        items = 0
        
        # sometimes due to server error, the totalItems tag cannot be accessed
        # However, the normal case is that even if a book is not registered in the API, the
        # totalItems tag will be 0
        try:
            items = data['totalItems']
        except:
            print("No totalItems tag found, probably due to server error")
        
        if items>0:
            
            # Sometimes, the first item returned from the API is not the book with the given isbn
            # Search through the items list provided to find which item is the correct one 
            item_idx = 0
            for j in range(items):
                # industryIdentifiers tag might not exist
                try:
                    isbn10 = data['items'][j]['volumeInfo']['industryIdentifiers'][0]['identifier']
                    isbn13 = data['items'][j]['volumeInfo']['industryIdentifiers'][1]['identifier']
#                 
                    if (ISBN == isbn10 or ISBN == isbn13):
                        item_idx = j
                        print('item idx',item_idx)
                        break
                except:
                    print("No registered ISBNs found!")
            
            title = data['items'][item_idx]['volumeInfo']['title']
            print('Title',title)
            
            # author tag might not exist
            try:
                authors = data['items'][item_idx]['volumeInfo']['authors']
                print('Authors',authors)
            except:
                print("No authors found!") 
               
            # categories tag might not exist
            try:
                categories = data['items'][item_idx]['volumeInfo']['categories']
            except:
                print("No categories found!")
                
            break
        else:
            print("No match for given ISBN!")
        
    return categories

getCategories('978-1-84749-308-8',"")

https://www.googleapis.com/books/v1/volumes?q=isbn9781847493088
item idx 1
Title A Journey Around My Room
Authors ['Xavier de Maistre']


['Fiction']