# Metaindex cleaning

This is an intermediate notebook. The full cleaning pipeline lives in utils/dataset.py.

In [1]:
import utils.preprocessing 
import numpy as np

In [2]:
# Load the metadata frame through the project's preprocessing helper.
# NOTE(review): the later cells assume columns such as `author`, `language`,
# `type` and `downloads` exist on the returned frame — confirm against the helper.
df = utils.preprocessing.get_clean_dataframe()

In [3]:
# Cast the listed columns to their intended dtypes, then order rows by index.
# NOTE(review): the index is presumably the Gutenberg id ("gut_id") — confirm.
column_dtypes = {
    "author": "string",
    "language": "object",
    "authoryearofbirth": "float32",
    "downloads": "int32",
}
df = df.astype(column_dtypes).sort_index()

# We need to drop non-English titles

In [4]:
# Flag rows whose `language` value stringifies to exactly "['en']",
# then drop every row (by index label) that is not flagged.
language_as_text = df["language"].apply(str).astype('string')
df['is_english'] = language_as_text == "['en']"
non_english_idx = df.loc[~df['is_english']].index
df = df.drop(non_english_idx)

# We need to drop 'Various', 'Unknown', 'Anonymous' authors

In [5]:
AUTHORS_TO_DROP = ['Various', 'Unknown', 'Anonymous']

# Remove, in a single pass, every row whose author is one of the placeholder
# names above (rows are dropped by index label).
idx_to_drop = df.loc[df["author"].isin(AUTHORS_TO_DROP)].index
df = df.drop(idx_to_drop)

# Drop all non-text entries (e.g. audio)

In [6]:
# Keep only entries whose `type` is exactly 'Text'; everything else is dropped
# by index label.
non_text_idx = df.loc[df["type"] != 'Text'].index
df = df.drop(non_text_idx)

In [7]:
# Ten most frequent authors among the remaining rows.
df["author"].value_counts().head(10)

Lytton, Edward Bulwer Lytton, Baron    216
Shakespeare, William                   180
Ebers, Georg                           164
Twain, Mark                            152
Parker, Gilbert                        134
Kingston, William Henry Giles          133
Fenn, George Manville                  128
Balzac, Honoré de                      122
Meredith, George                       111
Jacobs, W. W. (William Wymark)         110
Name: author, dtype: Int64

In [8]:
# Spot-check a handful of rows by index label.
sample_ids = [1, 1112, 1213, 7849, 10001, 22222, 33331]
df.loc[sample_ids, :]

Unnamed: 0_level_0,author,title,downloads,formats,type,LCC,subjects,authoryearofbirth,authoryearofdeath,language,is_english
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,"Jefferson, Thomas",The Declaration of Independence of the United ...,446,{'application/epub+zip': 'https://www.gutenber...,Text,"{JK, E201}","{United States -- History -- Revolution, 1775-...",1743.0,1826.0,[en],True
1112,"Shakespeare, William",The Tragedy of Romeo and Juliet,3526,{'text/plain; charset=us-ascii': 'https://www....,Text,{PR},"{Juliet (Fictitious character) -- Drama, Veron...",1564.0,1616.0,[en],True
1213,"Twain, Mark",The Man That Corrupted Hadleyburg,697,{'application/x-mobipocket-ebook': 'https://ww...,Text,{PS},{United States -- Social life and customs -- 1...,1835.0,1910.0,[en],True
7849,"Kafka, Franz",The Trial,2850,{'application/epub+zip': 'https://www.gutenber...,Text,{PT},{Social problems -- Fiction},1883.0,1924.0,[en],True
10001,"Seneca, Lucius Annaeus",Apocolocyntosis,379,{'application/epub+zip': 'https://www.gutenber...,Text,{PA},"{Claudius, Emperor of Rome, 10 B.C.-54 A.D. --...",,65.0,[en],True
22222,"Crowther, Mary Owens",How to Write Letters (Formerly The Book of Let...,292,{'text/plain; charset=iso-8859-1': 'http://www...,Text,{PE},"{Etiquette, Letter writing}",,,[en],True
33331,"Harris, Joseph Theodore",An Example of Communal Currency: The facts abo...,7,{'text/html; charset=iso-8859-1': 'http://www....,Text,{HG},"{Finance -- Guernsey, Paper money -- Guernsey}",,,[en],True


# We still have 43k books to choose from!

In [9]:
# Count the remaining rows per `type` value.
df["type"].value_counts()

Text    43174
Name: type, dtype: Int64