In [None]:
import glob
import os
import pickle as pkl
import re

import nltk
import pandas as pd
import spacy
from nltk.stem import LancasterStemmer, SnowballStemmer
from nltk.tag import pos_tag, pos_tag_sents
from nltk.tokenize import sent_tokenize, word_tokenize
from spacy.lemmatizer import Lemmatizer
%matplotlib inline

In [None]:
def preprocess_sent(book, num_sentences=20):
    
    """
    Takes in a book and preprocesses it so that it is ready to be vectorized and fed into a model. 
    Proper nouns (singular and plural) are removed, along with tokens containing digits, 
    punctuation and extra spaces. Everything is lowercased and lemmatized.
    The output is a list of "pages": strings that each hold the specified number of 
    cleaned sentences joined by single spaces (the last page may hold fewer).
    
    Parameters:
    book (str): an entire book in string format
    num_sentences (int): number of sentences you want to group together (must be >= 1)
    
    Returns:
    processed_book (list): strings of `num_sentences` sentences (default is 20) fully pre-processed
    
    Raises:
    ValueError: if num_sentences is smaller than 1
    """
    
    # a non-positive group size would either crash inside range() or silently drop the whole book
    if num_sentences < 1:
        raise ValueError('num_sentences must be a positive integer')
    # nothing to do for an empty / whitespace-only book; skip loading the NLP models
    if not book or not book.strip():
        return []
    
    text = book.replace('\n', ' ')    # substitute all newline characters with spaces
    tagged_words = pos_tag(word_tokenize(text))    # word tokenize + POS tag to identify proper nouns
    # remove singular (NNP) and plural (NNPS) proper nouns and join back up into one string;
    # "not in" is required here: "not tag == 'NNP' or tag == 'NNPS'" would keep every NNPS token
    text = ' '.join(word for word, tag in tagged_words if tag not in ('NNP', 'NNPS'))
    
    cleaned_sentences = []
    for sentence in sent_tokenize(text):    # sentence tokenize
        sentence = sentence.lower()    # make everything lowercase
        # take out every token that contains a digit; this has to happen BEFORE punctuation
        # stripping, otherwise the digits are already gone and fragments like "nd" (from "2nd") survive
        sentence = re.sub(r'\w*\d\w*', ' ', sentence)
        sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)    # take out all punctuation / non-letters
        sentence = re.sub(r'\s+', ' ', sentence)    # remove extra spaces
        cleaned_sentences.append(sentence)
    
    # lemmatize_words (spaCy) is the active normalizer; stem_words is the stemming alternative
    cleaned_sentences = lemmatize_words(cleaned_sentences)
    
    # group consecutive sentences into pages of num_sentences sentences each
    pages = [' '.join(cleaned_sentences[i:i + num_sentences])
             for i in range(0, len(cleaned_sentences), num_sentences)]
    # joining sentences can re-introduce double spaces, so collapse them once more
    return [re.sub(r'\s+', ' ', page) for page in pages]

In [None]:
def stem_words(book, method='snowball'):
    
    """
    Reduces every word of every sentence to its stem with an NLTK stemmer.
    
    Parameters:
    book (list): sentence-tokenized book (one string per sentence)
    method (str): 'lancaster' selects the Lancaster stemmer; any other value 
                  selects the English Snowball stemmer
    
    Returns:
    stemmed_book (list): copy of the input in which each sentence is replaced by 
                         its stemmed words joined with single spaces
    """
    
    # work on a copy so the caller's list is left untouched
    result = book.copy()
    stemmer = LancasterStemmer() if method == 'lancaster' else SnowballStemmer('english')
    
    for position, text in enumerate(result):
        # tokenize, stem each token, and glue the sentence back together
        result[position] = ' '.join(map(stemmer.stem, nltk.word_tokenize(text)))
    
    return result

In [None]:
# spaCy models are expensive to load; keep loaded models here so that calling
# lemmatize_words once per book does not reload the model every time
_SPACY_MODEL_CACHE = {}


def _get_spacy_model(name='en_core_web_sm'):
    """Returns the spaCy model `name`, loading it only on first use."""
    if name not in _SPACY_MODEL_CACHE:
        _SPACY_MODEL_CACHE[name] = spacy.load(name)
    return _SPACY_MODEL_CACHE[name]


def lemmatize_words(book):
        
    """
    Lemmatizes words using spaCy's 'en_core_web_sm' model.
    
    Parameters:
    book (list): sentence-tokenized book (one string per sentence)
    
    Returns:
    lemmatized_book (list): new list in which each sentence is replaced by the 
                            lemmas of its tokens joined with single spaces; 
                            the input list is not modified
    """
    
    # nothing to lemmatize: return early without paying for the model load
    if not book:
        return []
    
    sp = _get_spacy_model()
    return [' '.join(token.lemma_ for token in sp(sentence)) for sentence in book]

## Read Books into Python

In [None]:
# full texts of male-authored books
male_books = []
# book file names (no directory), in the form "book-title_book-author.txt"
male_book_list = []
# specify file path to male-authored books
# NOTE: machine-specific absolute path; point it at your local copy of the corpus
path = "/Users/winstonma4/Metis/project4/books/male/"

# glob returns matches in arbitrary order; sorting keeps male_books / male_book_list
# (and every row order derived from them) identical from run to run
for file in sorted(glob.glob(path + "*.txt")):
    # store only the file name so the list really has the documented "title_author.txt"
    # form and directory components never leak into the parsed book title
    male_book_list.append(os.path.basename(file))
    # NOTE(review): open() uses the platform default encoding; pass encoding=... if the
    # corpus encoding is known
    with open(file) as f:
        male_books.append(f.read())

In [None]:
# full texts of female-authored books
female_books = []
# book file names (no directory), in the form "book-title_book-author.txt"
female_book_list = []
# specify file path to female-authored books
# NOTE: machine-specific absolute path; point it at your local copy of the corpus
path = "/Users/winstonma4/Metis/project4/books/female/"

# glob returns matches in arbitrary order; sorting keeps female_books / female_book_list
# (and every row order derived from them) identical from run to run
for file in sorted(glob.glob(path + "*.txt")):
    # store only the file name so the list really has the documented "title_author.txt"
    # form and directory components never leak into the parsed book title
    female_book_list.append(os.path.basename(file))
    # NOTE(review): open() uses the platform default encoding; pass encoding=... if the
    # corpus encoding is known
    with open(file) as f:
        female_books.append(f.read())

## Preprocess and Split Books into Pages (20 Sentences)

In [None]:
# fully-preprocessed texts of male-authored books
male_pages = []
# one small DataFrame per book; they are concatenated once after the loop because
# calling pd.concat inside the loop re-copies all previous rows on every iteration
male_book_frames = []

for ix, book in enumerate(male_books):
    # file names look like "book-title_book-author.txt": drop any directory and the
    # ".txt" extension first so neither ends up in the title or the author
    title_author = os.path.splitext(os.path.basename(male_book_list[ix]))[0].split('_')
    temp_book = preprocess_sent(book)
    male_pages.extend(temp_book)
    male_book_frames.append(pd.DataFrame({'documents': temp_book, 
                                          'book': title_author[0],    
                                          'author': title_author[1]}))

# DataFrame containing documents with corresponding book title and author 
if male_book_frames:
    # ignore_index gives every page a unique row label instead of restarting at 0 per book
    male_pages_df = pd.concat(male_book_frames, ignore_index=True)
else:
    # no books were loaded: keep the expected (empty) schema
    male_pages_df = pd.DataFrame(columns=['documents', 'book', 'author'])
male_pages_df['author_gender'] = 'male'

In [None]:
# fully-preprocessed texts of female-authored books
female_pages = []
# one small DataFrame per book; they are concatenated once after the loop because
# calling pd.concat inside the loop re-copies all previous rows on every iteration
female_book_frames = []

for ix, book in enumerate(female_books):
    # file names look like "book-title_book-author.txt": drop any directory and the
    # ".txt" extension first so neither ends up in the title or the author
    title_author = os.path.splitext(os.path.basename(female_book_list[ix]))[0].split('_')
    temp_book = preprocess_sent(book)
    female_pages.extend(temp_book)
    female_book_frames.append(pd.DataFrame({'documents': temp_book, 
                                            'book': title_author[0], 
                                            'author': title_author[1]}))

# DataFrame containing documents with corresponding book title and author
if female_book_frames:
    # ignore_index gives every page a unique row label instead of restarting at 0 per book
    female_pages_df = pd.concat(female_book_frames, ignore_index=True)
else:
    # no books were loaded: keep the expected (empty) schema
    female_pages_df = pd.DataFrame(columns=['documents', 'book', 'author'])
female_pages_df['author_gender'] = 'female'

In [None]:
# master DataFrame with all books, titles, authors, and author genders
# ignore_index=True renumbers the rows 0..N-1; without it the male and female frames
# contribute overlapping index labels, which makes label-based lookups (df.loc) ambiguous
df = pd.concat([male_pages_df, female_pages_df], ignore_index=True)

In [None]:
# persist the master DataFrame next to the notebook so later steps can reload it
# without re-running the preprocessing
output_path = 'all_books_df.pkl'
pd.to_pickle(df, output_path)