This notebook compares the metadata file created by the SPGC (Standardized Project Gutenberg Corpus) with the `pg_catalog.csv` file from Project Gutenberg.
Then it builds a train/test dataset from the filtered metadata.

In [None]:
import numpy as np
import pandas as pd
import os, sys
import glob

from collections import Counter
import matplotlib.pyplot as plt

import misc_utils.dataset_filtering as dataset_filtering

In [None]:
# Root directory that holds the cloned repositories. Override it per machine by
# setting the GUTENBERG_GIT_REPO_PATH environment variable instead of editing
# this cell; the original hard-coded location is kept as the fallback.
git_repo_path = os.environ.get('GUTENBERG_GIT_REPO_PATH', '/Users/dean/Documents/gitRepos')
# 'gutenberg' checkout: its metadata/ and src/ directories are used below.
gutenberg_repo_path = os.path.join(git_repo_path, 'gutenberg')
# 'gutenberg-analysis' checkout: its src/ directory is added to sys.path below.
gutenberg_analysis_repo = os.path.join(git_repo_path, 'gutenberg-analysis')

In [None]:
## import internal helper functions
analysis_src_dir = os.path.join(gutenberg_analysis_repo,'src')
sys.path.append(analysis_src_dir)
from data_io import get_book

gutenberg_src_dir = os.path.join(gutenberg_repo_path,'src')
sys.path.append(gutenberg_src_dir)

from metaquery import meta_query
from jsd import jsdalpha

# Read in both metadata files

In [None]:
mq_filepath=os.path.join(gutenberg_repo_path,'metadata','metadata.csv')
pg_catalog_filepath=os.path.join(git_repo_path, 'gutenberg_corpus_analysis', 'sample_dataset', 'pg_catalog.csv')

# Load both  metadata files

Load both the metadata file generated by SPGC and the metadata file from PG

In [None]:
df = dataset_filtering.read_metadata_and_catalog(mq_filepath, pg_catalog_filepath)
original_shape=df.shape

Get only English books, according to PG catalog

In [None]:
df = df.query('Language=="en"')
df['Language'].unique()

Let's verify that the language column in both metadata files match

In [None]:
df['language'].unique()

Uh oh, they don't!  Which book is causing the problem?

In [None]:
df.query('Language=="en" and language=="[\'ne\']"')

Let's get rid of it!

In [None]:
index_to_drop = df.query('Language=="en" and language=="[\'ne\']"').index
df.drop(index_to_drop, inplace=True)

In [None]:
# Verifying that everything is good
print(df['Language'].unique())
print(df['language'].unique())

In [None]:
print(f'Original Shape: {original_shape}')
print(f'Current Shape: {df.shape}')

# Lets get rid of anything missing a title or an author

In [None]:
tdf = df[['title', 'title_pgc', 'author', 'Authors']]
tdf[tdf.isnull().any(axis=1)]

Well, it looks like most of these HAVE authors (or at least editors), it's just messed up in the metadata created by SPGC.  Let's just drop them.

In [None]:
tdf.head()

In [None]:
to_drop = tdf[tdf.isnull().any(axis=1)].index
df.loc[to_drop].head()

In [None]:
df.drop(to_drop, inplace=True)

# Lets see if titles match

In [None]:
dont_match, attribute_errors = dataset_filtering.compare_columns(df, 'title', 'title_pgc', verbose=True)#['author']

Let's get rid of PG63765, and let's note that we should get rid of copyright renewals en masse.

We can also ditch the duplicate column

In [None]:
df = df[df['id']!='PG63765']
df.drop('title_pgc', axis=1, inplace=True)

# Verify Author Matches

In [None]:
dont_match, attribute_errors = dataset_filtering.compare_columns(df, 'author', 'Authors')#['author']

Note that there are actually a bunch that don't match properly, but it appears that it is mostly a formatting issue.  We can come back to it, if needed.  Leaving them here

Lets ditch the duplicate authors column though

In [None]:
df.drop('Authors', axis=1, inplace=True)

# Do IDs match?  They better!

Note: This should be totally unnecessary since we joined on ID

In [None]:
# Build a string version of the 'id' column with the "PG" prefix removed so it
# can be compared directly against the catalog's 'PG_ID' column.
df['id_numeric'] = (
    df['id']
    .str.replace('PG', '', regex=False)  # literal match; the `regex` default differs across pandas versions
    .astype(str)                         # keep as string (NOT integer) so it compares with PG_ID below
)

# Cast PG_ID to string as well so both columns are compared as text.
df['PG_ID'] = df['PG_ID'].astype(str)

dont_match, attribute_errors = dataset_filtering.compare_columns(
    df,
    'id_numeric',
    'PG_ID',
    verbose=True
)

In [None]:
print(dont_match)
print(attribute_errors)

No entries show up with mismatched IDs. We can drop the placeholder `id_numeric` column. I keep the redundant `PG_ID` column here, not knowing whether it will be useful later for querying the raw data.

# Where do we stand?

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df['author'].value_counts()

In [None]:

#fig, ax = plt.subplots()
#df['author'].value_counts().plot(ax=ax, kind='bar')

In [None]:
#df['author'].value_counts().plot(kind='bar')

It will be difficult to categorize "Various", "Anonymous", or "Unknown" authors, so let's ditch them.

In [None]:
df = df[~df['author'].isin(['Various', 'Anonymous', 'Unknown'])]
df.shape

Lets see how many authors have more than 10 or 20 books

In [None]:
vc = df['author'].value_counts()
vc

In [None]:
print(f'There are a total of {len(vc)} authors')
for book_count in [10, 20, 30, 40, 50, 75, 100]:
    print(f'There are {len(vc[vc > book_count])} authors with more than {book_count} books')


In [None]:
# What should be the minimum number of books per author?
book_count_cutoff=30

In [None]:
# Keep only authors with strictly more than `book_count_cutoff` books
# (an author with exactly `book_count_cutoff` books is excluded by the `>`).
authors_to_include = vc[vc > book_count_cutoff].index

# Boolean row mask: True where the book's author is in the kept set.
mask = df['author'].isin(authors_to_include)
df = df[mask]

In [None]:
df.shape

# Add information on the length of books

Adds the number of lines, the number of words, and the number of unique words

Note: books you haven't downloaded are not dropped by the cells below; their counts are simply left missing (`None`).

#### Add the total word count of the entry, called 'word_count'.

In [None]:
def get_word_count(book_id, raw_text_dir):
    """
    Return the total number of word occurrences recorded for one book.

    Reads ``<raw_text_dir>/<book_id>_counts.txt`` (e.g. 'PG10007_counts.txt'),
    where each line holds a word followed by its count, and sums the counts.

    Parameters
    ----------
    book_id : str
        Book identifier used as the filename prefix, e.g. 'PG10007'.
    raw_text_dir : str
        Directory containing the ``*_counts.txt`` files.

    Returns
    -------
    int or None
        Sum of all counts in the file, or None if the file does not exist.

    Raises
    ------
    ValueError
        If the last field of a non-blank line is not an integer.
    """
    filename = f"{book_id}_counts.txt"
    file_path = os.path.join(raw_text_dir, filename)

    # If the file doesn't exist, return None
    if not os.path.exists(file_path):
        return None

    total_count = 0
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # The count is the last whitespace-separated field. Splitting once
            # from the right keeps this working when the word itself contains
            # whitespace, and blank lines are skipped instead of raising on
            # tuple unpacking.
            fields = line.rsplit(None, 1)
            if not fields:
                continue
            total_count += int(fields[-1])

    return total_count

# Directory of per-book '<id>_counts.txt' files. It is derived from
# gutenberg_repo_path (set in the configuration cell near the top) rather than
# hard-coding a second, machine-specific absolute path.
# NOTE(review): assumes the counts live in <gutenberg repo>/data/counts — confirm.
raw_text_path = os.path.join(gutenberg_repo_path, 'data', 'counts')
df["word_count"] = df["id"].apply(lambda pid: get_word_count(pid, raw_text_path))

#### Add the total unique word count of the entry, called 'unique_word_count'.

In [None]:
def get_unique_word_count(book_id, raw_text_dir):
    """
    Count the distinct words recorded for a book.

    Looks for '<book_id>_counts.txt' (for example 'PG10007_counts.txt')
    inside `raw_text_dir`. Every line of that file is counted as one unique
    word, so the result is simply the number of lines in the file.
    Returns None when the file is not present.
    """
    counts_path = os.path.join(raw_text_dir, f"{book_id}_counts.txt")

    if os.path.exists(counts_path):
        # Tally the lines one at a time; each line stands for one unique word.
        n_lines = 0
        with open(counts_path, 'r', encoding='utf-8') as handle:
            for _line in handle:
                n_lines += 1
        return n_lines

    return None

# Directory of per-book '<id>_counts.txt' files. It is derived from
# gutenberg_repo_path (set in the configuration cell near the top) rather than
# hard-coding a second, machine-specific absolute path.
# NOTE(review): assumes the counts live in <gutenberg repo>/data/counts — confirm.
raw_text_path = os.path.join(gutenberg_repo_path, 'data', 'counts')
df["unique_word_count"] = df["id"].apply(
    lambda pid: get_unique_word_count(pid, raw_text_path)
)

#### Add total lines of text in the raw text file, called 'line_count'.

Note that this is taking line count of the somewhat-cleaned files in the text folder, not the files in the raw folder.

In [None]:
def get_line_count(book_id, text_dir):
    """
    Report how many lines a book's text file contains.

    The file is expected at '<text_dir>/<book_id>_text.txt'
    (for example 'PG10007_text.txt'). Returns None when no such file exists.
    """
    text_path = os.path.join(text_dir, f"{book_id}_text.txt")

    # Missing file -> no count available
    if not os.path.exists(text_path):
        return None

    # enumerate from 1 so the last index seen equals the number of lines;
    # an empty file never enters the loop and leaves the total at 0.
    total = 0
    with open(text_path, 'r', encoding='utf-8') as text_file:
        for total, _ in enumerate(text_file, start=1):
            pass

    return total

# Directory of per-book '<id>_text.txt' files. It is derived from
# gutenberg_repo_path (set in the configuration cell near the top) rather than
# hard-coding a second, machine-specific absolute path.
# NOTE(review): assumes the text files live in <gutenberg repo>/data/text — confirm.
raw_text_path = os.path.join(gutenberg_repo_path, 'data', 'text')
df["line_count"] = df["id"].apply(
    lambda pid: get_line_count(pid, raw_text_path)
)

#### Add total tokens in the entry, called 'token_count'.

In [None]:
def get_token_count(book_id, text_dir):
    """
    Return the number of lines in a book's tokens file.

    The file is looked up as '<text_dir>/<book_id>_tokens.txt'
    (for example 'PG10007_tokens.txt'). The result is a plain line count, which
    equals the token count only if the file stores one token per line
    (presumed here — confirm against the tokenizer output).
    Returns None when the file does not exist.
    """
    tokens_path = os.path.join(text_dir, f"{book_id}_tokens.txt")

    # No tokens file for this book -> nothing to count
    if not os.path.exists(tokens_path):
        return None

    # readline() yields '' only at end of file (a blank line is '\n'),
    # so this loop visits every line exactly once.
    n_tokens = 0
    with open(tokens_path, 'r', encoding='utf-8') as tokens_file:
        while tokens_file.readline():
            n_tokens += 1

    return n_tokens

# Directory of per-book '<id>_tokens.txt' files. It is derived from
# gutenberg_repo_path (set in the configuration cell near the top) rather than
# hard-coding a second, machine-specific absolute path.
# NOTE(review): assumes the token files live in <gutenberg repo>/data/tokens — confirm.
raw_text_path = os.path.join(gutenberg_repo_path, 'data', 'tokens')
df["token_count"] = df["id"].apply(
    lambda pid: get_token_count(pid, raw_text_path)
)

In [None]:
df.shape

# SETTINGS

In [None]:
df['author'].value_counts().min()

In [None]:
#####################
#####################
def normalize_dataset(df, how='num_books', random_state=None):
    """
    Balance the dataset in place so every author has the same number of books.

    The target size is the smallest per-author book count. For every author
    with more books than that, the surplus rows are chosen at random and
    dropped from `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'author' column. Modified in place. Rows are dropped by
        index label, so the index is assumed to be unique.
    how : str, default 'num_books'
        Balancing strategy. Only 'num_books' is implemented.
    random_state : int, numpy Generator/RandomState or None, default None
        Passed to ``DataFrame.sample`` so the selection can be reproduced.
        None keeps the original unseeded behaviour.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `how` is not a supported strategy.

    Notes
    -----
    Rows whose author is missing (NaN) are not part of any group and are left
    untouched.
    """
    if how != 'num_books':
        # Fail loudly rather than silently ignoring an unsupported strategy.
        raise ValueError(f"Unsupported normalization strategy: {how!r} (only 'num_books' is implemented)")

    if df.empty:
        return

    min_num_books = df['author'].value_counts().min()

    ind_to_drop = []
    # sort=False keeps authors in order of first appearance.
    for _, author_books in df.groupby('author', sort=False):
        num_to_drop = len(author_books) - min_num_books
        if num_to_drop > 0:
            ind_to_drop.extend(
                author_books.sample(n=num_to_drop, random_state=random_state).index
            )

    # Single drop at the end instead of one in-place drop per author.
    df.drop(index=ind_to_drop, inplace=True)
    
def split_test_train(df, train_perc=0.8, random_state=None):
    """
    Split the dataset into train and test sets, stratified by author.

    For each author, a random fraction `train_perc` of that author's rows goes
    to the training set; every remaining row of `df` goes to the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'author' column. Rows are selected by index label, so
        the index is assumed to be unique. Not modified.
    train_perc : float, default 0.8
        Fraction of each author's rows to place in the training set.
    random_state : int, numpy Generator/RandomState or None, default None
        Passed to ``DataFrame.sample`` so the split can be reproduced.
        None keeps the original unseeded behaviour.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``.

    Notes
    -----
    Rows whose author is missing (NaN) belong to no group and therefore end up
    in the test set.
    """
    train_ind = []
    # sort=False keeps authors in order of first appearance.
    for _, author_books in df.groupby('author', sort=False):
        sampled = author_books.sample(frac=train_perc, random_state=random_state)
        # extend() appends in place instead of rebuilding the list per author.
        train_ind.extend(sampled.index)

    train_df = df.loc[train_ind]
    test_df = df.drop(index=train_ind)

    return train_df, test_df
    

In [None]:
normalize_dataset(df)

In [None]:
df.shape

In [None]:
train_df, test_df = split_test_train(df)

In [None]:
train_df

In [None]:
test_df

In [None]:
def write_csv_in_metadata_format(df, outfile):
    """
    Write the metadata columns of `df` to `outfile` as CSV.

    Only the eight columns listed below are written, in that order; any other
    column of `df` is left out. The DataFrame index is written too, as the
    first, unnamed column (pandas' `to_csv` default). `df` itself is not
    modified. Raises KeyError if one of the columns is missing.
    """
    metadata_columns = [
        'id',
        'title',
        'author',
        'authoryearofbirth',
        'authoryearofdeath',
        'language',
        'downloads',
        'subjects',
    ]
    df.loc[:, metadata_columns].to_csv(outfile)

In [None]:
train_outfile = 'train.csv'
test_outfile = 'test.csv'
write_csv_in_metadata_format(train_df, train_outfile)
write_csv_in_metadata_format(test_df, test_outfile)