In [52]:
import pandas as pd
import re
import glob
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

import plotly.express as px

## Exploring Available Books

To start with, figure out which books were not downloaded, but are present in the metadata csv!

I made a copy of the RAW texts just in case.

In [2]:
books_list = []

for name in glob.glob('../data/raw/*'):
    books_list.append(re.findall(r'PG\d*', name)[0])

In [3]:
library = pd.read_csv('../data/metadata.csv')

In [4]:
len(library) - len(books_list)

3435

There are 3435 "books" listed in the metadata that do not get downloaded. Next up, to explore why.

In [5]:
library.loc[~library['id'].isin(books_list)]['type'].value_counts(dropna=False)

NaN            2215
Sound          1104
Dataset          83
Image            33
MovingImage       7
StillImage        3
Collection        1
Text              1
Name: type, dtype: int64

Starting with those that are marked as 'type' being NaN. It is possible either the flags are incorrect (I checked it with "The King James Version of the Bible"), or there is something else going on that is causing this issue. Might have to look into NaNs a little bit more.

In [6]:
library.loc[(~library['id'].isin(books_list)) & (library['type'].isna())]

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
606,PG10547,Topsy-Turvy,"Verne, Jules",1828.0,1905.0,['en'],126,"{'Science fiction, French -- Translations into...",
703,PG10634,"The Queen of Hearts, and Sing a Song for Sixpence","Caldecott, Randolph",1846.0,1886.0,['en'],44,"{'Picture books for children', 'Nursery rhymes...",
841,PG10762,Impressions of Theophrastus Such,"Eliot, George",1819.0,1880.0,['en'],110,"{'Authors -- Fiction', 'England -- Fiction', '...",
923,PG10836,The Algebra of Logic,"Couturat, Louis",1868.0,1914.0,['en'],97,"{'Logic, Symbolic and mathematical', 'Algebrai...",
1106,PG10,The King James Version of the Bible,,,,['en'],5831,{'Bible'},
...,...,...,...,...,...,...,...,...,...
70441,PG9995,Little Journey to Puerto Rico: For Intermediat...,"George, Marian Minnie",1865.0,,['en'],12,{'Puerto Rico -- Description and travel'},
70442,PG9996,"""'Tis Sixty Years Since"": Address of Charles F...","Adams, Charles Francis",1835.0,1915.0,['en'],12,"{'Philosophy, Modern'}",
70443,PG9997,"France and England in North America, Part III:...","Parkman, Francis",1823.0,1893.0,['en'],34,{'Canada -- History -- To 1763 (New France)'},
70445,PG9999,"Harriet, the Moses of Her People","Bradford, Sarah H. (Sarah Hopkins)",1818.0,1912.0,['en'],103,"{'Slaves -- United States -- Biography', 'Afri...",


For 'Sound' it is pretty straightforward. I'm only looking for boox and not for audio files.

In [7]:
library.loc[(~library['id'].isin(books_list)) & (library['type'] == 'Sound')]

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
151,PG10137,Mary Had a Little Lamb: Recording taken from M...,"Edison, Thomas A. (Thomas Alva)",1847.0,1931.0,['en'],21,"{'Nursery rhymes, American'}",Sound
168,PG10152,Voice Trial - Kinetophone actor audition,"Lett, Bob",,,['en'],4,{'Auditions'},Sound
169,PG10153,Voice Trial - Kinetophone Actor Audition,"Lenord, Frank",,,['en'],4,{'Auditions'},Sound
170,PG10154,Voice Trial - Kinetophone Actor Audition,"Schultz, Siegfried Von",,,['en'],0,{'Auditions'},Sound
171,PG10155,The Right of the People to Rule,"Roosevelt, Theodore",1858.0,1919.0,['en'],9,"{'Progressivism (United States politics)', 'Po...",Sound
...,...,...,...,...,...,...,...,...,...
70159,PG9740,Tom Tiddler's Ground,"Dickens, Charles",1812.0,1870.0,['en'],6,{'English fiction'},Sound
70160,PG9741,The Uncommercial Traveller,"Dickens, Charles",1812.0,1870.0,['en'],6,{'England -- Social life and customs -- 19th c...,Sound
70161,PG9742,The Wreck of the Golden Mary,"Dickens, Charles",1812.0,1870.0,['en'],3,"{'Sea stories', 'Shipwrecks -- Fiction', 'Gold...",Sound
70162,PG9743,Sketches of Young Couples,"Dickens, Charles",1812.0,1870.0,['en'],3,"{'Couples -- England', 'England -- Social life...",Sound


Next up, looking into datasets. It appears the vast majority of them are genomes. There are 10 calculations of square roots and 1/pi to a million digits. And 'Moby Word Lists' is just info on gutenberg, disclaimers, etc...

In [8]:
library.loc[(~library['id'].isin(books_list)) & (library['type'] == 'Dataset')].groupby('author').count()

Unnamed: 0_level_0,id,title,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"Bonnell, Jerry T.",2,2,0,0,2,2,2,2
"De Forest, Norman L.",1,1,0,0,1,1,1,1
Human Genome Project,72,72,0,0,72,72,72,72
"Kanada, Yasumasa",1,1,1,1,1,1,1,1
"Kerr, Stan",1,1,0,0,1,1,1,1
"Nemiroff, Robert J.",5,5,0,0,5,5,5,5
"Ward, Grady",1,1,1,0,1,1,1,1


Onto checking out the images! the Image contains music sheets. MovingImage contains comets video, rotating earth and 5 nuclear test videos. StillImages contain a kids story illustrated and two maps/ map images.

In [9]:
library.loc[(~library['id'].isin(books_list)) & (library['type'] == 'Image')]

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
1108,PG11001,String Quartet No. 05 in A major Opus 18,"Beethoven, Ludwig van",1770.0,1827.0,['en'],5,"{'Music', 'String quartets -- Scores'}",Image
1109,PG11002,"String Quartet No. 11 in F minor Opus 95 ""Seri...","Beethoven, Ludwig van",1770.0,1827.0,['en'],6,"{'String quartets -- Scores', 'Music'}",Image
1944,PG11755,String Quartet No. 10 in E flat major Opus 74 ...,"Beethoven, Ludwig van",1770.0,1827.0,['en'],15,"{'Music', 'String quartets -- Scores'}",Image
2381,PG12149,String Quartet No. 03 in D major Opus 18,"Beethoven, Ludwig van",1770.0,1827.0,['en'],15,"{'String quartets -- Scores', 'Music'}",Image
2479,PG12237,String Quartet No. 16 in F major Opus 135,"Beethoven, Ludwig van",1770.0,1827.0,['en'],21,"{'Music', 'String quartets -- Scores'}",Image
2986,PG12695,String Quartet No. 04 in C minor Opus 18,"Beethoven, Ludwig van",1770.0,1827.0,['en'],11,"{'Music', 'String quartets -- Scores'}",Image
3412,PG13078,String Quartet No. 12 in E flat major Opus 127,"Beethoven, Ludwig van",1770.0,1827.0,['en'],8,"{'String quartets -- Scores', 'Music'}",Image
3413,PG13079,String Quartet No. 14 in C-sharp minor Opus 131,"Beethoven, Ludwig van",1770.0,1827.0,['en'],14,"{'String quartets -- Scores', 'Music'}",Image
3495,PG13153,String Quartet No. 15 in A minor Opus 132,"Beethoven, Ludwig van",1770.0,1827.0,['en'],36,"{'String quartets -- Scores', 'Music'}",Image
3850,PG13473,String Quartet No. 06 in B flat major Opus 18,"Beethoven, Ludwig van",1770.0,1827.0,['en'],7,"{'Music', 'String quartets -- Scores'}",Image


In [10]:
library.loc[(~library['id'].isin(books_list)) & (library['type'] == 'StillImage')]

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
1661,PG114,The Tenniel Illustrations for Carroll's Alice ...,"Tenniel, John",1820.0,1914.0,['en'],391,"{""Children's stories"", 'Fantasy fiction'}",StillImage
15515,PG239,Radar Map of the United States,United States,,,['en'],27,{'United States -- Maps'},StillImage
67797,PG758,"LandSat Picture of Washington, DC",United States. National Aeronautics and Space ...,,,['en'],36,{'Washington (D.C.) -- Remote-sensing images'},StillImage


And finally, Collection contains 'Project Gutenberg DVD: The July 2006 Special' and the only not downloaded text is just empty.

In [11]:
library.loc[(~library['id'].isin(books_list)) & (library['type'] == 'Collection')]

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
10150,PG19159,Project Gutenberg DVD: The July 2006 Special,,,,['en'],73,set(),Collection


In [12]:
library.loc[(~library['id'].isin(books_list)) & (library['type'] == 'Text')]

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
69464,PG90907,,,,,['en'],1,set(),Text


In [13]:
library.loc[library['author'].str.find('Lovecraft') > -1]

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
22880,PG30637,"Writings in the United Amateur, 1915-1922","Lovecraft, H. P. (Howard Phillips)",1890.0,1937.0,['en'],331,"{'Periodicals', 'Literature -- History and cri...",
23804,PG31469,The Shunned House,"Lovecraft, H. P. (Howard Phillips)",1890.0,1937.0,['en'],406,"{'Haunted houses -- Fiction', 'Horror tales, A...",
44538,PG50133,The Dunwich Horror,"Lovecraft, H. P. (Howard Phillips)",1890.0,1937.0,['en'],789,"{'American fiction -- 20th century', 'Fantasy ...",
64643,PG68236,The colour out of space,"Lovecraft, H. P. (Howard Phillips)",1890.0,1937.0,['en'],572,"{'Extraterrestrial beings -- Fiction', 'Horror...",
64695,PG68283,The call of Cthulhu,"Lovecraft, H. P. (Howard Phillips)",1890.0,1937.0,['en'],2045,"{'Cthulhu (Fictitious character) -- Fiction', ...",
64987,PG68547,He,"Lovecraft, H. P. (Howard Phillips)",1890.0,1937.0,['en'],187,"{'New York (N.Y.) -- Fiction', 'Horror tales',...",
64994,PG68553,The festival,"Lovecraft, H. P. (Howard Phillips)",1890.0,1937.0,['en'],247,"{'Horror tales', 'New England -- Fiction', 'Sh...",
67130,PG70478,The silver key,"Lovecraft, H.P.",,,['en'],0,set(),
67139,PG70486,The lurking fear,"Lovecraft, H. P. (Howard Phillips)",1890.0,1937.0,['en'],1169,"{'Horror tales', 'Catskill Mountains Region (N...",


## Sorting out English books

Starting with 70449 "books" in the catalogue, first select all the texts in the library that are marked as being in english ('en').

That reduces the library to 56954 books.

In [14]:
library_en = library.loc[library['language'].str.find('en') > -1]

Further, for right now I'm also going to drop all of the additional files that were not downloaded:
- NaN 1991
- Sound 1039
- Dataset 83
- Image 33
- MovingImage 7
- StillImage 3
- Collection 1
- Text 1

That additionally reduces the library to 53796 books.

In [15]:
library_en = library_en.loc[library_en['id'].isin(books_list)]

Finally, deleting some files accompanying sound, dataset, etc.. that got downloaded. Only 20 total.

In [16]:
library_en = library_en[library_en['type'].isna()]

Final count of books to potentially use is 53776!

## Beginning Exploration of Authors

There are 6345 books with 10 or less downloads. Something to possibly consider removing.

In [17]:
library_en[library_en['downloads'] <= 10]

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
53,PG10049,Old Lady Mary: A Story of the Seen and the Unseen,"Oliphant, Mrs. (Margaret)",1828.0,1897.0,['en'],10,"{'Death -- Fiction', 'Inheritance and successi...",
355,PG10320,Dotty Dimple at Play,"May, Sophie",1833.0,1906.0,['en'],10,{'Children -- Conduct of life -- Juvenile fict...,
364,PG10329,Snubby Nose and Tippy Toes,"Smith, Laura Rountree",1876.0,1924.0,['en'],7,"{'Rabbits -- Juvenile fiction', 'Animals -- Ju...",
438,PG10396,"Andy the Acrobat: Or, Out with the Greatest Sh...","Harkness, Peter T.",,,['en'],10,{'Circus -- Juvenile fiction'},
486,PG10439,From Yauco to Las Marias: A recent campaign in...,"Herrman, Karl Stephen",,,['en'],10,"{'Spanish-American War, 1898 -- Regimental his...",
...,...,...,...,...,...,...,...,...,...
69600,PG9237,A Bell's Biography,"Hawthorne, Nathaniel",1804.0,1864.0,['en'],8,{'Bells'},
69606,PG9242,"Old Ticonderoga, a Picture of the Past: (From:...","Hawthorne, Nathaniel",1804.0,1864.0,['en'],7,{'New England -- Social life and customs -- Fi...,
69608,PG9244,"Little Daffydowndilly: (From: ""The Snow Image ...","Hawthorne, Nathaniel",1804.0,1864.0,['en'],10,{'New England -- Social life and customs -- Fi...,
69706,PG9332,"Georgie's Present, or, Tales of Newfoundland","Brightwell, C. L. (Cecilia Lucy)",1811.0,1875.0,['en'],9,{'Newfoundland and Labrador -- Fiction'},


Grouping by author, I noticed that there's 117 titles by "unknown" author, 601 "anonymous" and 3422 "various". Upon further inspection of Various, they are majority "periodicals", meaning various magazines and so I decided it was safe to remove that.

As for unknown and anonymous, those might be interesting to check once I have a model, but as is, since I'm looking for writing style, I do want to know who the author is. (lol at Happy and Gay Marching Away - children's poetry by Unknown author).

In [18]:
library_en.groupby('author')[['title']].count().sort_values(by='title', ascending=False).head(20)

Unnamed: 0_level_0,title
author,Unnamed: 1_level_1
Various,3422
Anonymous,601
"Shakespeare, William",178
"Ebers, Georg",163
"Parker, Gilbert",132
"Oliphant, Mrs. (Margaret)",132
"Kingston, William Henry Giles",132
"Twain, Mark",128
"Fenn, George Manville",128
Unknown,117


In [19]:
library_en[library_en['author'] == 'Various']['subjects'].value_counts().head(20)

{'English wit and humor -- Periodicals'}                                                                                            550
{'Periodicals'}                                                                                                                     233
{'Questions and answers -- Periodicals'}                                                                                            220
{'Popular literature -- Great Britain -- Periodicals'}                                                                              195
{"Children's periodicals, American"}                                                                                                162
{'Congregational churches -- Missions -- Periodicals', 'Home missions -- Periodicals'}                                              145
{'Encyclopedias and dictionaries'}                                                                                                  136
{'American periodicals'}                        

Below, keeping authors that are not Anonymous, Unknown or Various, which cuts down to 49636 books.

In [20]:
library_en = library_en[~library_en['author'].isin(['Anonymous', 'Unknown', 'Various'])]

For starters, I'll just look at top 6 authors (based on # of books).

In [21]:
top_six_authors_list = library_en.groupby('author')['title'].count().sort_values(ascending=False).head(6).index.to_list()

In [22]:
library_top_six = library_en[library_en['author'].isin(top_six_authors_list)]

## Reading and Tokenizing Books

Function that opens files and extracts the text (leaving the Gutenberg info at top and bottom out).

In [23]:
def import_book(filepath):
    
    with open(filepath, encoding = 'utf-8') as fi:
        book = fi.read()

    if(not re.search('\*\*\* START OF .+? \*\*\*', book)):
        book_start = 0
    else:
        book_start = re.search('\*\*\* START OF .+? \*\*\*', book).end()

    if(not re.search('\*\*\* END OF .+? \*\*\*', book)):
        book_end = -1
    else:
        book_end = re.search('\*\*\* END OF .+? \*\*\*', book).start()

    book = book[book_start : book_end]

    return book

Process the top 6 authors books;

> NOTE: here I found out that some files have strange characters and won't be opened. I wil lhave to decide what to do with those.

In [24]:
for book_id in library_top_six['id']:
    
    filepath = f'../data/raw/{book_id}_raw.txt'

    try:
        book = import_book(filepath)

        library_top_six.loc[library_top_six['id'] == book_id, 'book_content'] = book
        
    except:
        print('could not open', filepath)

        library_top_six.loc[library_top_six['id'] == book_id, 'book_content'] = 'could not open'

    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  library_top_six.loc[library_top_six['id'] == book_id, 'book_content'] = book


could not open ../data/raw/PG3184_raw.txt
could not open ../data/raw/PG3185_raw.txt
could not open ../data/raw/PG6191_raw.txt


In [25]:
# there are some books who had weird characters and were not able to open
library_unopened_books = library_top_six.loc[library_top_six['book_content'] == 'could not open']

library_top_six = library_top_six.loc[library_top_six['book_content'] != 'could not open']

In [26]:
# to see more in a column of df
# pd.set_option('display.max_colwidth', 50)

In [28]:
top_six_authors_dict = {top_six_authors_list[i]: i for i in range(6)}

top_six_authors_dict

library_top_six = library_top_six.replace({'author': top_six_authors_dict})

{'Shakespeare, William': 0,
 'Ebers, Georg': 1,
 'Parker, Gilbert': 2,
 'Kingston, William Henry Giles': 3,
 'Oliphant, Mrs. (Margaret)': 4,
 'Twain, Mark': 5}

In [29]:
X = library_top_six[['book_content']]
y = library_top_six['author']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 321, stratify = y)

In [49]:
pipe_vect_logreg = Pipeline(
    steps = [
        #('vect', CountVectorizer(min_df=2, max_df=0.8)),
        ('vect', TfidfVectorizer(min_df=2, max_df=0.8, ngram_range=(1,2))),
        ('logreg', LogisticRegression(max_iter = 10000))
    ]
)

In [50]:
pipe_vect_logreg.fit(X_train['book_content'], y_train)
y_pred = pipe_vect_logreg.predict(X_test['book_content'])

print(accuracy_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

0.9722222222222222
[[45  0  0  0  0  0]
 [ 0 41  0  0  0  0]
 [ 1  0 31  0  1  0]
 [ 0  0  0 33  0  0]
 [ 0  0  0  0 32  1]
 [ 3  0  0  0  0 28]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        45
           1       1.00      1.00      1.00        41
           2       1.00      0.94      0.97        33
           3       1.00      1.00      1.00        33
           4       0.97      0.97      0.97        33
           5       0.97      0.90      0.93        31

    accuracy                           0.97       216
   macro avg       0.98      0.97      0.97       216
weighted avg       0.97      0.97      0.97       216



In [53]:
fig = px.imshow(confusion_matrix(y_test, y_pred),
                width=1000,
                height=800,
                text_auto=True,
                labels=dict(x="Predicted Label",
                            y="True Label"),
                            x=tuple(top_six_authors_dict.keys()),
                            y=tuple(top_six_authors_dict.keys()),
                            color_continuous_scale='Teal'
                            )

fig.update(layout_coloraxis_showscale=False)

fig.show()