In [90]:
# reading files
import os

# for punc list
import string

# for regex 
import regex as re

Contrived NLP example:

* ~read in raw text files from a directory with other stuff in it~ 
    * ~jane austen's books~
* tokenize
    * words
    * sentences
* clean
    * remove punctuation
    * remove numbers
    * remove stopwords (from pre-baked list)
    * use regex for something
* create features
    * unigrams
    * bigrams
    * lexicon-based
* pass through classification model
* calculating important metrics
    * most frequent words
    * tf-idf
    * most mentioned entities
    * simple sentiment analysis

In [32]:
## Read in text files
def read_austen_data(directory_name):
    texts = []
    files = [f for f in listdir(directory_name) if isfile(join(directory_name, f))]
    
    for file in files:
        if ("austen" in file):
            with open(directory_name + file, encoding="utf-8") as f:
                #lowercase text and append
                texts.append(f.read().lower())
    
    return texts

In [33]:
austen_books = read_austen_data("data/")

print(len(austen_books))

7


In [178]:
def manual_tokenize(text):
    
    # replace new lines
    text = text.replace("\n", " ")

    # remove punctuation here.
    punc_list = [p for p in string.punctuation]
    
    text = ''.join([char for char in text if char not in punc_list])
            
    # we need to strip all the extraneous spaces (more than 2)
    text = re.sub('\s{2,}', ' ', text)
    
    # remove volume numbers
    text = re.sub("volume i{1,}|volume [0-9]{1,}|volume one|volume two|volume three", "", text)
    
    # listify, splitting on spaces
    text = text.split(' ')

    # the text starts as soon as we find "chapter 1" or "chapter i"
    # so let's move the book to just its relevant parts by finding and deleting content before the first chapter 1|i
    # since the volume-paradigm has multiple ch1, ch2, and so on
    # first, find chapter 1, and replace everything prior
    
    # then, convert back to string, replace all the chapters, then convert back to list
    for i in range(len(text) - 1):
        window_val = ' '.join(text[i:i+2])
        if (window_val == "chapter 1" or window_val == "chapter i"):
            text = text[i+2:]
            break
    
    # back to string
    text = ' '.join(text)
        
    # replace chapters
    text = re.sub("chapter [a-z]+|chapter [0-9]+", "", text)
    
    # back to list
    text = text.split(" ")
    
    return text

In [179]:
# iteratively call the cleaning function
austen_books_tokenized = [manual_tokenize(book) for book in austen_books]

In [191]:
# first ten words of austen's books
print([book[:10] for book in austen_books_tokenized])

[['no', 'one', 'who', 'had', 'ever', 'seen', 'catherine', 'morland', 'in', 'her'], ['sir', 'walter', 'elliot', 'of', 'kellynchhall', 'in', 'somersetshire', 'was', 'a', 'man'], ['about', 'thirty', 'years', 'ago', 'miss', 'maria', 'ward', 'of', 'huntingdon', 'with'], ['emma', 'woodhouse', 'handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home'], ['it', 'is', 'a', 'truth', 'universally', 'acknowledged', 'that', 'a', 'single', 'man'], ['the', 'family', 'of', 'dashwood', 'had', 'been', 'long', 'settled', 'in', 'sussex'], ['emma', 'woodhouse', 'handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home']]


In [118]:
## Tokenizing/Cleaning practice

## manually

## remove new line chars

## separate into words

## remove punctuation

## remove numbers

In [None]:
## w/ sklearn...

## remove new line chars

## separate into sentences

## remove punctuation

## remove numbers

In [1]:
## Cleaning practice

## stopwords (contrived, don't save)

## regex to find and kill volume numbers (save)