# Possible Writing Date Distribution

In [1]:
import numpy as np
import pandas as pd
import os, sys
import glob

from collections import Counter
import matplotlib.pyplot as plt

In [22]:
# Locate the repositories relative to the notebook's working directory:
# this notebook's repo is the parent of the cwd, and the folder holding all
# repos (including 'gutenberg' and 'gutenberg-analysis') is one level above that.
corpus_analysis_repo = os.path.join(os.getcwd(), os.pardir)
repos = os.path.join(corpus_analysis_repo, os.pardir)
gutenberg_repo_path, gutenberg_analysis_repo = (
    os.path.join(repos, repo_name) for repo_name in ('gutenberg', 'gutenberg-analysis')
)

In [23]:
## Import internal helper functions.
## Each helper directory is appended to sys.path right before the import(s) that follow it,
## so the order of the statements in this cell matters.

# <gutenberg-analysis>/src goes on the path ahead of the data_io import
# (presumably that is where data_io lives -- confirm).
analysis_src_dir = os.path.join(gutenberg_analysis_repo,'src')
sys.path.append(analysis_src_dir)
from data_io import get_book

# <gutenberg>/src goes on the path ahead of the metaquery and jsd imports
# (presumably both modules live there -- confirm).
gutenberg_src_dir = os.path.join(gutenberg_repo_path,'src')
sys.path.append(gutenberg_src_dir)

from metaquery import meta_query
from jsd import jsdalpha

# The corpus-analysis repo root is added last, ahead of the misc_utils import.
sys.path.append(corpus_analysis_repo)
from misc_utils import setup_graph_params

# Paths etc. you should set

In [4]:
# Selects the data source: True -> the static database names,
# False -> the ones from the gutenberg repo.
USE_STATIC_DB = True

In [5]:
# Root directory of the static Gutenberg database.
# Set the GUTENBERG_STATIC_DATABASE environment variable to point at your own copy;
# the literal below is only the fallback default (an absolute path on the original
# author's machine) and is kept so existing setups keep working unchanged.
GUTENBERG_STATIC_DATABASE = os.environ.get(
    'GUTENBERG_STATIC_DATABASE',
    '/Users/dean/Documents/GradSchool/TheoryOfMachineLearning/gutenberg_static_database',
)

## You probably don't need to change this

In [6]:
# gutenberg_info_fold is the directory that plays the role of the gutenberg repo's
# top-level directory. If using the static dataset, it has to be laid out in that
# same structure (in particular it must contain metadata/metadata.csv).

# Plain truthiness test (not `is True`) so any truthy flag value selects the static
# database, consistent with the `if not USE_STATIC_DB` check used further down.
if USE_STATIC_DB:
    gutenberg_info_fold = GUTENBERG_STATIC_DATABASE
    # filter_exist is forwarded to meta_query later on; presumably it restricts the
    # metadata to books whose files exist locally -- confirm in metaquery.
    filter_exist = False
else:
    gutenberg_info_fold = gutenberg_repo_path
    filter_exist = True

metadata_filepath = os.path.join(gutenberg_info_fold, 'metadata', 'metadata.csv')

# Load the Metadata

In [12]:
# Build the metadata query helper from the metadata CSV; filter_exist is passed
# through unchanged from the configuration cell.
mq = meta_query(
    path=metadata_filepath,
    filter_exist=filter_exist,
)



## Let's add line counts

In [13]:
# Line counts are only added when working from the gutenberg repo, not the static database
# (presumably add_line_count needs the downloaded book files -- confirm in metaquery).
if not USE_STATIC_DB:
    mq.add_line_count()

# Possible Creation Year

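Since the metadata does not record the year in which a book was written, we use the author's lifetime as a proxy: a book is assumed to have been written at some point between the author's year of birth plus a minimum writing age and the author's year of death.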

In [14]:
## Get all records that have information on both authoryearofbirth and authoryearofdeath.
# reset() is called first, presumably to clear any filter applied to mq earlier -- confirm in metaquery.
mq.reset()
# The very wide [-10000, 10000] range is meant to keep every record with known years;
# how filter_year treats the bounds and missing values is defined in metaquery -- TODO confirm.
mq.filter_year([-10000,10000])

# Preview the first rows of the filtered metadata.
mq.df.head()


Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type,language_set
1,PG1,The Declaration of Independence of the United ...,"Jefferson, Thomas",1743.0,1826.0,['en'],604.0,"{'United States -- History -- Revolution, 1775...",Text,{en}
3,PG3,John F. Kennedy's Inaugural Address,"Kennedy, John F. (John Fitzgerald)",1917.0,1963.0,['en'],28.0,{'Presidents -- United States -- Inaugural add...,Text,{en}
4,PG4,Lincoln's Gettysburg Address: Given November 1...,"Lincoln, Abraham",1809.0,1865.0,['en'],55.0,{'Consecration of cemeteries -- Pennsylvania -...,Text,{en}
6,PG6,Give Me Liberty or Give Me Death,"Henry, Patrick",1736.0,1799.0,['en'],54.0,{'United States -- Politics and government -- ...,Text,{en}
8,PG8,Abraham Lincoln's Second Inaugural Address,"Lincoln, Abraham",1809.0,1865.0,['en'],25.0,{'United States -- Politics and government -- ...,Text,{en}


## Let's figure out in which years a book could have been written
The code below is taken from the authors of the paper.

In [33]:
# Assumed minimum age at which an author can write a book.
hmin = 16
# Latest possible writing year per record: the author's year of death.
list_tmax = np.array(mq.df['authoryearofdeath'])
# Earliest possible writing year per record: the author's year of birth shifted by hmin.
list_tmin = np.add(np.array(mq.df['authoryearofbirth']), hmin)

In [34]:
# For every calendar year t between the earliest possible start and the latest possible
# end, count the records that could have been written in that year, i.e. the records
# with list_tmin <= t <= list_tmax.

# nanmin/nanmax instead of the builtin min/max: the builtins iterate over the numpy array
# element by element in Python and return order-dependent results if a NaN is present.
t1 = np.nanmin(list_tmin)
t2 = np.nanmax(list_tmax)

arr_t = np.arange(t1,t2+1,1)
print(arr_t)

# Only records with a non-empty window can ever be counted. Comparisons against NaN are
# False, so records with a missing year are excluded here as well.
has_window = list_tmin <= list_tmax
window_starts = np.sort(list_tmin[has_window])
window_ends = np.sort(list_tmax[has_window])

# (# windows that started at or before t) - (# windows that ended strictly before t)
# = # windows containing t. Two binary searches replace the per-year scan over all
# records, and the result is an integer count array rather than a float one.
arr_n = (
    np.searchsorted(window_starts, arr_t, side='right')
    - np.searchsorted(window_ends, arr_t, side='left')
)

[-734. -733. -732. ... 2013. 2014. 2015.]


In [35]:
###########
## Setup ##
###########
# Fetch the plotting parameters from the setup_graph_params helper and merge them
# into matplotlib's global rcParams.
params = setup_graph_params.get_graph_params()
plt.rcParams.update(params)

In [38]:
## Note: there are some (few) books from B.C.; they lie outside the x-range shown
## below --> could be added as an inset.

x = arr_t
y = arr_n

# Single-axes figure: number of candidate records per year.
f, ax = plt.subplots()

ax.plot(x, y, lw=1)
ax.set(
    xlabel=r'$t$, year',
    ylabel=r'$N$',
    title='Possible Year of Writing',
)
ax.set_xlim(1500, 2015)

# Save the figure under <gutenberg_info_fold>/figures, creating the folder if needed.
path_save = os.path.join(gutenberg_info_fold, 'figures')
os.makedirs(path_save, exist_ok=True)
fname_save = 'date_distribution.png'
filename = os.path.join(path_save, fname_save)
f.savefig(filename)