## Preprocessing

Explore how to get the data into a more suitable format for training.

In [None]:
import projd
import sys
import nltk
import nltk.data
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Make the project's local modules (imported below as `config` and `load`)
# importable by putting the src directory on sys.path.
# NOTE(review): presumably projd.cwd_token_dir('notebooks') resolves to the
# project root -- confirm against projd's documentation.
src_dir = str(Path(projd.cwd_token_dir('notebooks')) / 'src') # $PROJECT_ROOT/src
if src_dir not in sys.path:  # do not add a duplicate entry when the cell is re-run
    sys.path.append(src_dir)

import config
import load

# Fetch the punkt resources; a later cell loads 'tokenizers/punkt/english.pickle' from them.
nltk.download('punkt')
%matplotlib inline
sns.set()


In [None]:
# Re-execute the already imported local config module so that edits made to it
# are picked up without restarting the kernel.
import importlib
importlib.reload(config)

In [None]:
# Load the three raw datasets through the project's `load` module.
# Later cells use pnp_text as one string (splitlines), jokes as a frame with a
# `body` column, and names as a frame with `name` and `sex` columns.
pnp_text = load.load_pride_and_prejudice()
jokes = load.load_jokes()
names = load.load_names()

## Pride and Prejudice

In [None]:
# The raw file carries 31 Project Gutenberg header lines before the book title
# and 366 footer lines after the end of the book; slice both off.
lines = pnp_text.splitlines()[31:-366]

# Sanity check: show the first ten and the last ten remaining lines.
preview = lines[:10] + lines[-10:]
for idx, line in enumerate(preview):
    print(idx, line)

# Join the book into one space-separated string and peek at its beginning.
text = ' '.join(lines)
print(text[:1000])

In [None]:
# Load NLTK's punkt sentence tokenizer model for English.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

# Split the whole book (leading/trailing whitespace stripped) into sentences.
sentences = sent_detector.tokenize(text.strip())


In [None]:
# Size of the book: sentences found by the tokenizer, whitespace-separated
# words, and characters once every whitespace run is collapsed to one space.
print('Approximate number of sentences:', len(sentences))
words = text.split()
print('Number of words:', len(words))
print('Number of characters:', len(' '.join(words)))

In [None]:
# Eyeball the tokenizer output: print the first 100 sentences with their index.
first_sentences = sentences[:100]
for idx, sentence in enumerate(first_sentences):
    print(idx, sentence)

In [None]:
# One row per tokenized sentence, held in a single 'sentence' column.
pnpdf = pd.DataFrame(sentences, columns=['sentence'])

In [None]:
# Store each sentence's character count in a new 'len' column.
# (The same expression used to be evaluated once more on its own line first;
# its result was neither assigned nor the cell's last expression, so it was
# computed and thrown away.)
pnpdf['len'] = pnpdf.sentence.str.len()

In [None]:
def preprocess_pnp_into_sentences():
    '''
    Load Pride and Prejudice and split the body of the book into sentences.

    The first 31 lines (Project Gutenberg header) and the last 366 lines
    (footer) of the raw text are dropped, the remaining lines are joined with
    single spaces, and the result is split with NLTK's punkt English
    sentence tokenizer.

    return: dataframe with a single 'text' column, one sentence per row.
    '''
    raw_lines = load.load_pride_and_prejudice().splitlines()
    # keep only the book itself: skip 31 header lines and 366 footer lines
    body = ' '.join(raw_lines[31:-366]).strip()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    return pd.DataFrame(tokenizer.tokenize(body), columns=['text'])

## Examine distribution of text lengths in jokes, pnp, and names

The typical way to train an RNN is to feed it texts of the same length (padded as needed), so it is worth knowing how long the texts in each dataset are.


In [None]:
# Sentence-level view of the book: one row per sentence in a 'text' column.
pnp = preprocess_pnp_into_sentences()

In [None]:
# Sentence lengths in characters: store them, report the longest one and plot
# their distribution in 100 bins.
sentence_lengths = pnp.text.str.len()
pnp['len'] = sentence_lengths
print('max len', sentence_lengths.max())
plt.hist(sentence_lengths, bins=100)
plt.show()
print(pnp.shape)
pnp.head()

In [None]:
# Same length analysis for the joke bodies.
jokes['len'] = jokes['body'].str.len()
joke_lengths = jokes['len']
print("max len", joke_lengths.max())
plt.hist(joke_lengths, bins=100)
plt.show()
print(jokes.shape)
jokes.head()


In [None]:
# Same length analysis for the names, plus the frame shape overall and per sex.
names['len'] = names['name'].str.len()
name_lengths = names['len']
print("max len", name_lengths.max())
plt.hist(name_lengths, bins=100)
plt.show()
print(names.shape)
for sex in ('male', 'female'):
    print(names[names['sex'] == sex].shape)
names.head()


## How large is each dataset when limited to text of a certain length?

In [None]:
# For each length cap, count the rows of every dataset whose text is strictly
# shorter than the cap.
labelled_frames = (('names', names), ('jokes', jokes), ('pnp', pnp))
for maxlen in [10000, 1000, 400, 200, 100]:
    print('maxlen', maxlen)
    for label, frame in labelled_frames:
        short_enough = frame[frame['len'] < maxlen]
        print(label + ':', short_enough.shape[0])
    print()
    

## Combine and Split Datasets

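For each dataset:

- create one long string
- lowercase it and collapse every run of whitespace into a single space
- divide the string into substrings of length `config.seq_len`, starting a new substring every `config.stride_len` characters (the substrings overlap when the stride is smaller than the sequence length).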
  

In [None]:
def text_to_sequences(text, seq_len=None, stride_len=None):
    '''
    normalize text and cut it into (possibly overlapping) character sequences.

    the text is normalized by splitting it on whitespace, recombining the
    tokens using a single space char, and lowercasing the result.

    the normalized text is then divided into sequences by striding over it in
    strides of length stride_len and taking the subsequence of length seq_len
    at each position.  sequences that start fewer than seq_len characters
    before the end of the text are shorter than seq_len.

    text: the string to split.
    seq_len: sequence length in characters; defaults to config.seq_len.
    stride_len: distance between the starts of consecutive sequences;
        defaults to config.stride_len.  a stride of 0 makes range() raise
        ValueError.

    return: list of sequences
    '''
    # fall back to the project-wide settings only when the caller gave none,
    # so the function can also be used with explicit sizes.
    if seq_len is None:
        seq_len = config.seq_len
    if stride_len is None:
        stride_len = config.stride_len
    normalized_text = ' '.join(text.split()).lower()
    return [normalized_text[start:start + seq_len]
            for start in range(0, len(normalized_text), stride_len)]

def preprocess_pnp2():
    '''
    build the Pride and Prejudice sequence dataset: load the book, strip the
    project gutenberg header and footer, normalize the text and cut it into
    sequences with text_to_sequences.

    return: dataframe with 'text' column and 'category' column.
    '''
    raw_lines = load.load_pride_and_prejudice().splitlines()
    # the file has 31 header lines before the book title and 366 footer lines
    # after the end of the book; keep only what lies in between.
    book = ' '.join(raw_lines[31:-366])
    # lowercase and collapse every run of whitespace to a single space
    text = ' '.join(book.lower().split())
    print('num chars:', len(text))
    frame = pd.DataFrame(text_to_sequences(text), columns=['text'])
    frame['category'] = 'pride'
    return frame


In [None]:
# Rebuild pnp as strided sequences; this replaces the sentence-level frame
# assigned to pnp earlier in the notebook.
pnp = preprocess_pnp2()
# NOTE(review): a notebook cell echoes only its last expression, so the next
# two lines produce no visible output while `pnp.shape` follows them.
pnp.text.iloc[:10].apply(repr)
pnp.head()
pnp.shape

In [None]:
def preprocess_jokes():
    '''
    build the jokes sequence dataset: join all joke bodies into one string,
    normalize it and cut it into sequences with text_to_sequences.

    return: dataframe with 'text' column and 'category' column.
    '''
    bodies = load.load_jokes().body
    all_jokes = ' '.join(bodies)
    # lowercase and collapse every run of whitespace to a single space
    text = ' '.join(all_jokes.lower().split())
    print('num chars:', len(text))
    frame = pd.DataFrame(text_to_sequences(text), columns=['text'])
    frame['category'] = 'jokes'
    return frame


In [None]:
# Rebuild jokes as strided sequences; this replaces the raw jokes frame
# loaded earlier in the notebook.
jokes = preprocess_jokes()
# NOTE(review): a notebook cell echoes only its last expression, so the next
# two lines produce no visible output while `jokes.shape` follows them.
jokes.text.iloc[:10].apply(repr)
jokes.head()
jokes.shape

In [None]:
def preprocess_names():
    '''
    build the names sequence dataset.  the male and the female names are each
    joined into one string, normalized and cut into sequences with
    text_to_sequences; the two groups get the categories 'male_names' and
    'female_names'.

    return: dataframe with 'text' column and 'category' column; the female
    rows come first, followed by the male rows.
    '''
    names = load.load_names()
    frames = {}
    total_chars = 0
    # male is processed before female so the progress output keeps its order
    for sex in ('male', 'female'):
        joined = ' '.join(names[names.sex == sex].name)
        # lowercase and collapse every run of whitespace to a single space
        text = ' '.join(joined.lower().split())
        print(f'num chars {sex}_text:', len(text))
        frame = pd.DataFrame(text_to_sequences(text), columns=['text'])
        frame['category'] = f'{sex}_names'
        frames[sex] = frame
        total_chars += len(text)
    print('num_chars:', total_chars)
    return pd.concat([frames['female'], frames['male']], ignore_index=True)


In [None]:
# Rebuild names as strided sequences; this replaces the raw names frame
# loaded earlier in the notebook.
names = preprocess_names()
# NOTE(review): a notebook cell echoes only its last expression, so the next
# three lines produce no visible output while `names.shape` follows them.
names.text.iloc[:10].apply(repr)
names.head()
names[names.category == 'male_names'].head()
names.shape