# Now that we have the metaindex, let's download the actuall books!

In [40]:
from utils.gutenberg_metadata import gutenberg_to_dict
import pandas as pd

In [41]:
gutenberg = gutenberg_to_dict()

In [42]:
df = pd.DataFrame.from_dict(gutenberg, orient='index')

In [43]:
df = df.astype({'author': 'string'})
df = df.set_index('id')

# Drop erroneous values

In [44]:
df.drop(90907, inplace=True)
df.drop(999999, inplace=True)

# We have 61935 values (that number increases every day)

In [45]:
len(df.author)

61935

# Drop null values
Some books (about 2400) have a null value in the author. If that's the case, it's possible that there are multiple authors (the Bible), or authors are difficult to define (folk stories, etc.). We will skip them.

In [46]:
print(df.author.isnull().sum())
df[df.author.isnull()]

2393


Unnamed: 0_level_0,author,title,downloads,formats,type,LCC,subjects,authoryearofbirth,authoryearofdeath,language
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10028,,Spalding's Official Baseball Guide - 1913,27.0,{'application/zip': 'https://www.gutenberg.org...,Text,{GV},{Baseball},,,[en]
10085,,Moorish Literature: Comprising Romantic Ballad...,50.0,{'application/zip': 'https://www.gutenberg.org...,Text,{PQ},{Spanish literature -- Translations into English},,,[en]
10103,,"The Great Events by Famous Historians, Volume ...",45.0,{'text/plain': 'https://www.gutenberg.org/file...,Text,{D},{World history},,,[en]
10114,,"The Great Events by Famous Historians, Volume ...",46.0,{'application/x-mobipocket-ebook': 'https://ww...,Text,{D},{World history},,,[en]
10121,,The Literature of Arabia: With Critical and Bi...,34.0,{'text/plain; charset=us-ascii': 'https://www....,Text,{PJ},{Arabic literature},,,[en]
...,...,...,...,...,...,...,...,...,...,...
9893,,Le Comte Ory: Opéra en deux actes,20.0,{'application/x-mobipocket-ebook': 'https://ww...,Text,{ML},{Operas -- Librettos},,,[fr]
9916,,Spalding's Baseball Guide and Official League ...,16.0,{'application/epub+zip': 'https://www.gutenber...,Text,{GV},{Baseball -- Periodicals},,,[en]
9920,,The Garden of Bright Waters: One Hundred and T...,77.0,{'application/epub+zip': 'https://www.gutenber...,Text,{PJ},"{Love poetry, Oriental poetry -- Translations ...",,,[en]
9929,,"The Great Events by Famous Historians, Volume 12",32.0,{'text/plain; charset=us-ascii': 'https://www....,Text,{D},{World history},,,[en]


In [47]:
# drop the entries that are missing the author
df.dropna(subset=['author'], inplace=True)

# Let's see what Gutenber has for Mark Twain

In [48]:
df[df.author.str.contains('Twain')]

Unnamed: 0_level_0,author,title,downloads,formats,type,LCC,subjects,authoryearofbirth,authoryearofdeath,language
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
102,"Twain, Mark",The Tragedy of Pudd'nhead Wilson,1827.0,{'application/x-mobipocket-ebook': 'https://ww...,Text,{PS},"{Infants switched at birth -- Fiction, Missour...",1835.0,1910.0,[en]
1044,"Twain, Mark",Extract from Captain Stormfield's Visit to Heaven,207.0,{'application/rdf+xml': 'https://www.gutenberg...,Text,{PS},"{Religious fiction, Heaven -- Fiction, Ship ca...",1835.0,1910.0,[en]
1086,"Twain, Mark",A Horse's Tale,279.0,{'application/epub+zip': 'https://www.gutenber...,Text,{PS},{Fiction},1835.0,1910.0,[en]
11622,"Twain, Mark",Plus fort que Sherlock Holmès,56.0,{'application/rdf+xml': 'http://www.gutenberg....,Text,{PS},"{Short stories, American fiction -- Translatio...",1835.0,1910.0,[fr]
119,"Twain, Mark",A Tramp Abroad,1639.0,{'application/x-mobipocket-ebook': 'https://ww...,Text,{PS},"{Europe -- Fiction, Americans -- Europe -- Fic...",1835.0,1910.0,[en]
...,...,...,...,...,...,...,...,...,...,...
9040,"Twain, Mark",A Tramp Abroad,4.0,{'audio/mpeg': 'http://www.gutenberg.org/files...,Sound,{PS},"{Europe -- Fiction, Americans -- Europe -- Fic...",1835.0,1910.0,[en]
9041,"Twain, Mark",What Is Man? and Other Essays,6.0,{'text/html; charset=iso-8859-1': 'http://www....,Sound,{PS},{Essays},1835.0,1910.0,[en]
9042,"Twain, Mark",Extracts from Adam's Diary,11.0,{'audio/mpeg': 'http://www.gutenberg.org/files...,Sound,{PS},{Bible. Genesis -- History of Biblical events ...,1835.0,1910.0,[en]
91,"Twain, Mark",Tom Sawyer Abroad,288.0,{'application/epub+zip': 'https://www.gutenber...,Text,{PS},"{Balloon ascensions -- Fiction, Americans -- F...",1835.0,1910.0,[en]


# Now let's read something

In [49]:
import numpy as np
authors_works = np.array(df[df.author.str.contains('Twain')].index)

In [50]:
authors_works[5]

1213

In [51]:
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

text = strip_headers(load_etext(1213).strip())

In [52]:
df.loc[1213]

author                                                     Twain, Mark
title                                The Man That Corrupted Hadleyburg
downloads                                                          697
formats              {'application/x-mobipocket-ebook': 'https://ww...
type                                                              Text
LCC                                                               {PS}
subjects             {Humorous stories, American, United States -- ...
authoryearofbirth                                                 1835
authoryearofdeath                                                 1910
language                                                          [en]
Name: 1213, dtype: object

In [53]:
text[:1000]

"\n\n\n\n\nTranscribed from the 1907 Chatto & Windus edition by David Price, email\nccx074@coventry.ac.uk\n\n\n\n\n\nTHE MAN THAT CORRUPTED HADLEYBURG\n\n\nI.\n\n\nIt was many years ago.  Hadleyburg was the most honest and upright town\nin all the region round about.  It had kept that reputation unsmirched\nduring three generations, and was prouder of it than of any other of its\npossessions.  It was so proud of it, and so anxious to insure its\nperpetuation, that it began to teach the principles of honest dealing to\nits babies in the cradle, and made the like teachings the staple of their\nculture thenceforward through all the years devoted to their education.\nAlso, throughout the formative years temptations were kept out of the way\nof the young people, so that their honesty could have every chance to\nharden and solidify, and become a part of their very bone.  The\nneighbouring towns were jealous of this honourable supremacy, and\naffected to sneer at Hadleyburg's pride in it and 