In [1]:
import numpy as np
import pandas as pd
import os, sys
import glob

from collections import Counter
import matplotlib.pyplot as plt

In [2]:
# Locate the sibling repositories relative to the notebook's working directory:
# both are looked up two directory levels above os.getcwd().
repos = os.path.join(os.getcwd(), os.pardir, os.pardir)
gutenberg_repo_path, gutenberg_analysis_repo = (
    os.path.join(repos, repo_name)
    for repo_name in ('gutenberg', 'gutenberg-analysis')
)

In [3]:
## Import internal helper functions.
# Each repo's `src` directory is appended to sys.path so the plain
# `from <module> import ...` statements below can resolve.
# Requires gutenberg_analysis_repo / gutenberg_repo_path to be defined already.
# Note: sys.path.append runs on every execution of this cell, so re-running it
# adds duplicate entries to sys.path.
analysis_src_dir = os.path.join(gutenberg_analysis_repo,'src')
sys.path.append(analysis_src_dir)
from data_io import get_book  # presumably provided by gutenberg-analysis/src — confirm

gutenberg_src_dir = os.path.join(gutenberg_repo_path,'src')
sys.path.append(gutenberg_src_dir)

# presumably provided by gutenberg/src — confirm
from metaquery import meta_query
from jsd import jsdalpha

# Paths and settings you should configure

In [4]:
# Metadata source switch:
#   True  -> use the file names of the static database
#   False -> use the ones from the gutenberg repo
USE_STATIC_DB = True

In [5]:
# Directory holding the static Gutenberg database.
# Set the GUTENBERG_STATIC_DATABASE environment variable to point at your own
# copy; the original hard-coded location is kept as the fallback so existing
# setups keep working without any change.
GUTENBERG_STATIC_DATABASE = os.environ.get(
    'GUTENBERG_STATIC_DATABASE',
    '/Users/dean/Documents/GradSchool/TheoryOfMachineLearning/gutenberg_static_database',
)

## You probably don't need to change this

In [6]:
# Pick the metadata CSV and the `filter_exist` flag handed to meta_query.
# The flag is tested by truthiness (not `is True`) so this branch always agrees
# with the later `if not USE_STATIC_DB:` check, even for truthy non-bool values.
# NOTE(review): `filter_exist` presumably restricts the metadata to books whose
# files exist locally — confirm in metaquery.
if USE_STATIC_DB:
    # Static database: fixed metadata file inside the static database directory.
    metadata_filepath = os.path.join(GUTENBERG_STATIC_DATABASE, 'SPGC-metadata-2018-07-18.csv')
    filter_exist = False
else:
    # Repo checkout: metadata file kept under the gutenberg repo.
    metadata_filepath = os.path.join(gutenberg_repo_path, 'metadata', 'metadata.csv')
    filter_exist = True

# Load the Metadata

In [7]:
# Build the metadata query helper from the CSV and flag chosen above,
# then preview the first rows of its DataFrame.
mq = meta_query(
    path=metadata_filepath,
    filter_exist=filter_exist,
)

mq.df.head()



Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type,language_set
0,PG0,,,,,,,set(),Text,
1,PG1,The Declaration of Independence of the United ...,"Jefferson, Thomas",1743.0,1826.0,['en'],604.0,"{'United States -- History -- Revolution, 1775...",Text,{en}
2,PG2,The United States Bill of Rights: The Ten Orig...,United States,,,['en'],158.0,"{'Civil rights -- United States -- Sources', '...",Text,{en}
3,PG3,John F. Kennedy's Inaugural Address,"Kennedy, John F. (John Fitzgerald)",1917.0,1963.0,['en'],28.0,{'Presidents -- United States -- Inaugural add...,Text,{en}
4,PG4,Lincoln's Gettysburg Address: Given November 1...,"Lincoln, Abraham",1809.0,1865.0,['en'],55.0,{'Consecration of cemeteries -- Pennsylvania -...,Text,{en}


## Let's add line counts

In [8]:
# Line counts are only added when working from the gutenberg repo; the
# static-database configuration skips this step.
# NOTE(review): add_line_count() presumably needs the repo's local book files
# and updates `mq` in place — confirm in metaquery.
if not USE_STATIC_DB:
    mq.add_line_count()

# Distribution of Languages

In [9]:
# Note: the built-in get_lang_counts() doesn't properly handle books tagged with
# multiple languages; later cells tally the `language_set` column directly.
mq.get_lang_counts()

Counter({'en': 47120,
         'fr': 2892,
         'fi': 1903,
         'de': 1680,
         'nl': 787,
         'it': 724,
         'es': 601,
         'pt': 550,
         'zh': 441,
         'el': 220,
         'sv': 186,
         'hu': 183,
         'eo': 118,
         'la': 116,
         'da': 68,
         'tl': 60,
         'ca': 33,
         'pl': 31,
         'ja': 22,
         'no': 19,
         'cy': 12,
         'cs': 10,
         'ru': 9,
         'is': 7,
         'fur': 7,
         'bg': 6,
         'he': 6,
         'enm': 6,
         'te': 6,
         'ang': 4,
         'sr': 4,
         'af': 4,
         'nai': 3,
         'nah': 3,
         'ilo': 3,
         'ceb': 3,
         'grc': 3,
         'ro': 2,
         'myn': 2,
         'ga': 2,
         'fy': 2,
         'mi': 2,
         'nav': 2,
         'arp': 2,
         'gla': 2,
         'brx': 2,
         'ko': 1,
         'sa': 1,
         'ale': 1,
         'yi': 1,
         'lt': 1,
         'kha': 1,
        

## Let's see how many books have more than one language

In [10]:
# Let's figure out how many books are tagged with more than one language.
# Every distinct `language_set` value with more than one entry becomes a row:
# the combination itself, its size, and the number of books carrying it.

more_than_one_lang = [
    (lang_combo, len(lang_combo), n_books)
    for lang_combo, n_books in mq.df['language_set'].value_counts().items()
    if len(lang_combo) > 1
]
lang_df = pd.DataFrame(
    more_than_one_lang,
    columns=['Languages', 'Num Languages', 'Num Books'],
)
lang_df

Unnamed: 0,Languages,Num Languages,Num Books
0,"{de, en}",2,29
1,"{en, la}",2,23
2,"{en, eo}",2,19
3,"{en, es}",2,16
4,"{en, fr}",2,16
5,"{en, zh}",2,7
6,"{enm, en}",2,3
7,"{en, cy}",2,3
8,"{en, ang}",2,3
9,"{en, it}",2,3


In [11]:
# lang_df holds one row per distinct language combination, so this tallies how
# many *combinations* exist for each size — not how many books. For per-size
# book totals, sum 'Num Books' grouped by 'Num Languages' instead.
lang_df['Num Languages'].value_counts()

Num Languages
2    49
3     2
Name: count, dtype: int64

In [12]:
# Rank the multi-language combinations by number of books, largest first,
# and show only the combination and its book count.
(
    lang_df
    .sort_values(by='Num Books', ascending=False)
    .loc[:, ['Languages', 'Num Books']]
)

Unnamed: 0,Languages,Num Books
0,"{de, en}",29
1,"{en, la}",23
2,"{en, eo}",19
3,"{en, es}",16
4,"{en, fr}",16
5,"{en, zh}",7
6,"{enm, en}",3
7,"{en, cy}",3
8,"{en, ang}",3
9,"{en, it}",3


Which language combinations include more than two languages?

In [13]:
# Keep only the combinations made up of more than two languages.
lang_df.loc[lang_df['Num Languages'] > 2]

Unnamed: 0,Languages,Num Languages,Num Books
15,"{en, es, fr}",3,2
17,"{en, es, tl}",3,2


## Let's look only at books with one language

In [14]:
# Tally books per language, keeping only `language_set` values that contain
# exactly one language.
one_lang = [
    (single_lang, n_books)
    for single_lang, n_books in mq.df['language_set'].value_counts().items()
    if len(single_lang) == 1
]
one_lang_df = pd.DataFrame(
    one_lang,
    columns=['language', 'numBooks'],
)
one_lang_df

Unnamed: 0,language,numBooks
0,{en},46972
1,{fr},2864
2,{fi},1903
3,{de},1644
4,{nl},782
5,{it},720
6,{es},577
7,{pt},548
8,{zh},434
9,{el},216
