In [1]:
import numpy as np
import pandas as pd
import os, sys
import glob

from collections import Counter
import matplotlib.pyplot as plt

In [2]:
## Locations of the two local repository checkouts used by this notebook.
## Set GUTENBERG_REPO_PATH / GUTENBERG_ANALYSIS_REPO in the environment to
## point at your own checkouts; the defaults are the original hard-coded paths.
gutenberg_repo_path = os.environ.get(
    'GUTENBERG_REPO_PATH',
    '/home/dean/Documents/gitRepos/gutenberg_p',
)
gutenberg_analysis_repo = os.environ.get(
    'GUTENBERG_ANALYSIS_REPO',
    '/home/dean/Documents/gitRepos/gutenberg-analysis',
)

In [3]:
## import internal helper functions
## The helper modules are imported from each repo's `src` directory, so that
## directory is put on sys.path first. The membership check keeps the cell
## idempotent: re-running it does not pile up duplicate sys.path entries.
analysis_src_dir = os.path.join(gutenberg_analysis_repo,'src')
if analysis_src_dir not in sys.path:
    sys.path.append(analysis_src_dir)
from data_io import get_book

gutenberg_src_dir = os.path.join(gutenberg_repo_path,'src')
if gutenberg_src_dir not in sys.path:
    sys.path.append(gutenberg_src_dir)

from metaquery import meta_query
from jsd import jsdalpha

# Read a book

We load a single Project Gutenberg (PG) book.
We can select one of three different levels of granularity:

- counts, gives a dictionary of {word: count}
- tokens, gives a list of tokens (str)
- text, gives the text as a single string

In [4]:
## select your favorite book (ids have the form 'PG<number>')
pg_id = 'PG2701' ## Moby Dick

In [5]:
## 'counts' level: the book as a dictionary of {word: count}
level = 'counts'
dict_word_count = get_book(
    pg_id,
    path_gutenberg=gutenberg_repo_path,
    level=level,
)
## last expression -> rendered as the cell output
dict_word_count

{'the': 14332,
 'of': 6592,
 'and': 6340,
 'a': 4608,
 'to': 4577,
 'in': 4131,
 'that': 2906,
 'his': 2514,
 'it': 2339,
 'i': 1842,
 'but': 1737,
 'he': 1721,
 'as': 1715,
 'with': 1715,
 'is': 1698,
 'was': 1635,
 'for': 1594,
 'all': 1462,
 'this': 1353,
 'at': 1307,
 'not': 1192,
 'by': 1190,
 'from': 1088,
 'on': 1039,
 'him': 1036,
 'be': 1034,
 'so': 1033,
 'whale': 936,
 'one': 886,
 'you': 846,
 'had': 777,
 'have': 763,
 'now': 753,
 'there': 740,
 'or': 710,
 'were': 680,
 'they': 651,
 'which': 637,
 'their': 619,
 'some': 610,
 'then': 608,
 'me': 607,
 'when': 592,
 'are': 591,
 'an': 588,
 'my': 574,
 'like': 572,
 'upon': 562,
 'no': 541,
 'into': 522,
 'out': 513,
 'up': 507,
 'more': 501,
 'what': 493,
 'if': 464,
 'them': 458,
 'old': 440,
 'we': 431,
 'would': 427,
 'man': 425,
 'ahab': 418,
 'been': 415,
 'ye': 411,
 'over': 402,
 'other': 397,
 'these': 395,
 'will': 383,
 'ship': 380,
 'sea': 374,
 'its': 373,
 'only': 368,
 'such': 366,
 'though': 361,
 'down':

In [6]:
## 'tokens' level: the book as a list of token strings
level = 'tokens'
list_tokens = get_book(
    pg_id,
    path_gutenberg=gutenberg_repo_path,
    level=level,
)
## last expression -> rendered as the cell output
list_tokens

['or',
 'the',
 'whale',
 'by',
 'herman',
 'melville',
 'contents',
 'etymology',
 'extracts',
 'supplied',
 'by',
 'a',
 'chapter',
 'loomings',
 'chapter',
 'the',
 'chapter',
 'the',
 'chapter',
 'the',
 'counterpane',
 'chapter',
 'breakfast',
 'chapter',
 'the',
 'street',
 'chapter',
 'the',
 'chapel',
 'chapter',
 'the',
 'pulpit',
 'chapter',
 'the',
 'sermon',
 'chapter',
 'a',
 'bosom',
 'friend',
 'chapter',
 'nightgown',
 'chapter',
 'biographical',
 'chapter',
 'wheelbarrow',
 'chapter',
 'nantucket',
 'chapter',
 'chowder',
 'chapter',
 'the',
 'ship',
 'chapter',
 'the',
 'ramadan',
 'chapter',
 'his',
 'mark',
 'chapter',
 'the',
 'prophet',
 'chapter',
 'all',
 'astir',
 'chapter',
 'going',
 'aboard',
 'chapter',
 'merry',
 'christmas',
 'chapter',
 'the',
 'lee',
 'shore',
 'chapter',
 'the',
 'advocate',
 'chapter',
 'postscript',
 'chapter',
 'knights',
 'and',
 'squires',
 'chapter',
 'knights',
 'and',
 'squires',
 'chapter',
 'ahab',
 'chapter',
 'enter',
 'aha

# Metadata

A short introduction to querying the metadata.
For this we use the `meta_query` class, which is part of the gutenberg repo.

- mq.df gives a pandas dataframe

In [7]:
## build the query object from the metadata table inside the gutenberg repo
mq = meta_query(
    path=os.path.join(
        gutenberg_repo_path,
        'metadata',
        'metadata.csv',
    )
)
## preview the first rows of the underlying pandas dataframe
mq.df.head()


Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
0,PG10000,The Magna Carta,Anonymous,,,['en'],418,"{'Magna Carta', 'Constitutional history -- Eng...",Text
1,PG10001,Apocolocyntosis,"Seneca, Lucius Annaeus",,65.0,['en'],767,"{'Claudius, Emperor of Rome, 10 B.C.-54 A.D. -...",Text
2,PG10002,The House on the Borderland,"Hodgson, William Hope",1877.0,1918.0,['en'],806,{'Science fiction'},Text
3,PG10003,"My First Years as a Frenchwoman, 1876-1879","Waddington, Mary King",1833.0,1923.0,['en'],94,"{'France -- History -- Third Republic, 1870-19...",Text
4,PG10004,The Warriors,"Lindsay, Anna Robertson Brown",1864.0,1948.0,['en'],65,{'Christianity'},Text


Get only English books

In [8]:
## Report the dataframe shape before and after filtering: filter_lang's return
## value is not used here, and mq.df itself changes (see the two shapes printed).
print(f'Pre-filter shape: {mq.df.shape}')
## we apply a filter (language 'en'; per the heading above, English books only)
mq.filter_lang('en',how='only')
print(f'Post-filter shape: {mq.df.shape}')

Pre-filter shape: (19236, 9)
Post-filter shape: (15008, 9)


In [9]:
## collect the book ids from the query object
## NOTE(review): presumably these reflect the currently active filter -- confirm in meta_query
list_pg_ids = mq.get_ids()
## show only the first ten ids
list_pg_ids[:10]

['PG10000',
 'PG10001',
 'PG10002',
 'PG10003',
 'PG10004',
 'PG10005',
 'PG10006',
 'PG10007',
 'PG10008',
 'PG10009']

#### Filter all books by a given author


In [10]:
## author to select, written 'Lastname, Firstname' as in the metadata `author` column
author = 'Shakespeare, William'

In [11]:
## first we have to reset the previous filter
mq.reset()
## then restrict the metadata to rows matching `author`
mq.filter_author(author)
## display the filtered dataframe
mq.df

  s = self.df[ self.df['author'].str.contains(re.escape(s_sel),case=False).replace(np.nan,False)]


Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
110,PG100,The Complete Works of William Shakespeare,"Shakespeare, William",1564.0,1616.0,['en'],50807,{'English drama -- Early modern and Elizabetha...,Text
465,PG1041,Shakespeare's Sonnets,"Shakespeare, William",1564.0,1616.0,['en'],1978,"{'Sonnets, English', 'English poetry'}",Text
509,PG1045,Venus and Adonis,"Shakespeare, William",1564.0,1616.0,['en'],424,"{'Venus (Roman deity) -- Poetry', 'Adonis (Gre...",Text
672,PG10606,"The Tragedie of Hamlet, Prince of Denmark: A S...","Shakespeare, William",1564.0,1616.0,['en'],142,"{'Kings and rulers -- Succession -- Drama', ""M...",Text
1118,PG1100,The First Part of Henry the Sixth,"Shakespeare, William",1564.0,1616.0,['en'],155,"{'Great Britain -- History -- Henry VI, 1422-1...",Text
...,...,...,...,...,...,...,...,...,...
40807,PG46768,Julius Cæsar,"Shakespeare, William",1564.0,1616.0,['la'],270,"{'Assassins -- Drama', 'Tragedies', 'Caesar, J...",Text
41641,PG47518,Shakespeare's Comedy of The Tempest,"Shakespeare, William",1564.0,1616.0,['en'],148,"{'Shipwreck victims -- Drama', 'Islands -- Dra...",Text
41860,PG47715,The Works of William Shakespeare [Cambridge Ed...,"Shakespeare, William",1564.0,1616.0,['en'],177,{'English drama'},Text
42080,PG47913,Makbeto,"Shakespeare, William",1564.0,1616.0,['eo'],126,"{'Macbeth, King of Scotland, active 11th centu...",Text


# Bookshelves

The bookshelves data is organized as a dataframe

- rows are books (i)
- columns are bookshelves (j)
- if B[i,j] = True --> book i belongs to bookshelf j

Note that only a fraction of books are assigned to any of the bookshelves

In [None]:
## load the bookshelf-categories pickle from the repo's metadata folder
## NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted checkout
df_bookshelves_cat = pd.read_pickle(
    os.path.join(
        gutenberg_repo_path,
        'metadata',
        'bookshelves_categories_dict.pkl',
    )
)

In [None]:
## load the bookshelf-ebooks pickle from the repo's metadata folder
## NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted checkout
df_bookshelves_ebook = pd.read_pickle(
    os.path.join(
        gutenberg_repo_path,
        'metadata',
        'bookshelves_ebooks_dict.pkl',
    )
)

In [None]:
## display the object loaded from bookshelves_categories_dict.pkl
## NOTE(review): the file name says "dict" while the variable says "df" -- confirm the actual type
df_bookshelves_cat

In [None]:
## display the object loaded from bookshelves_ebooks_dict.pkl
## NOTE(review): the file name says "dict" while the variable says "df" -- confirm the actual type
df_bookshelves_ebook