In [193]:
# reading files
import os

# for punc list
import string

# for regex 
import regex as re

# for nltk (library) work
import nltk

Contrived NLP example:

* ~read in raw text files from a directory with other stuff in it~ 
    * ~jane austen's books~
* tokenize
    * ~words (manually)~
    * ~words (nltk)~
    * ~sentences (nltk)~
* clean
    * ~remove punctuation (manually)~
    * ~remove numbers (nltk)~
    * ~remove stopwords (from pre-baked list) (manually)~
    * ~use regex for something (manually)~
* calculating important metrics
    * unigrams
    * bigrams
    * lexicon-based
    * most frequent words
    * tf-idf
    * most mentioned entities
    * simple sentiment analysis

Practical Numpy example:

* make a random array of 1,000 integers between 1 and 10,000
* make an array with 17 0's
* calculate five number summary
* find the standard deviation and the mean
* find the mode

In [32]:
## Read in text files
def read_austen_data(directory_name):
    """Load every Austen text file found directly inside a directory.

    An entry is picked up when its name contains the substring "austen"
    and it is a regular file (sub-directories are skipped). Each file is
    read as UTF-8 and lowercased.

    Parameters
    ----------
    directory_name : str
        Directory to scan. A trailing path separator is optional.

    Returns
    -------
    list of str
        The lowercased contents of each matching file, ordered by file name.

    Raises
    ------
    OSError
        If the directory cannot be listed or a matching file cannot be read.
    """
    texts = []

    # os.listdir returns entries in arbitrary order; sorting makes
    # austen_books[i] refer to the same book on every run.
    for file_name in sorted(os.listdir(directory_name)):
        # "austen" separates the books from the other stuff in the directory
        if "austen" not in file_name:
            continue

        # os.path.join (rather than string concatenation) also works when
        # directory_name has no trailing slash
        path = os.path.join(directory_name, file_name)
        if not os.path.isfile(path):
            continue

        with open(path, encoding="utf-8") as f:
            # lowercase text and append
            texts.append(f.read().lower())

    return texts

In [33]:
# load every Austen text found in data/ (read_austen_data lowercases each one)
austen_books = read_austen_data("data/")

# sanity check: how many books were picked up
print(len(austen_books))

7


In [192]:
def manual_tokenize(text):
    """Clean one book and split it into word tokens without NLTK.

    Steps, in order:
      1. newlines become spaces and ASCII punctuation is deleted;
      2. every run of whitespace is collapsed to a single space;
      3. volume headings ("volume ii", "volume 2", "volume one", ...) are removed;
      4. everything up to and including the first "chapter 1" / "chapter i"
         is dropped (front matter); if no such heading exists the text is
         kept whole;
      5. the remaining chapter headings ("chapter iv", "chapter 12") are removed.

    The heading patterns are lowercase-only, so `text` is expected to be
    lowercased already.

    Parameters
    ----------
    text : str
        Raw (lowercased) book text.

    Returns
    -------
    list of str
        Word tokens, never containing empty strings. An empty or
        whitespace-only input yields an empty list.
    """
    # replace new lines
    text = text.replace("\n", " ")

    # remove punctuation here (a set gives O(1) membership tests per character)
    punc_set = set(string.punctuation)
    text = ''.join(char for char in text if char not in punc_set)

    # collapse every run of whitespace (spaces, tabs, ...) to one space so the
    # single-space heading patterns below can match reliably
    text = ' '.join(text.split())

    # remove volume numbers; the word boundaries stop the pattern from eating
    # the start of ordinary words (e.g. "volume is" must stay intact)
    text = re.sub(r"\bvolume (?:[ivxlc]+|[0-9]+|one|two|three)\b", "", text)

    # listify; split() with no argument never yields empty tokens, even where
    # a removed heading left two adjacent spaces behind
    words = text.split()

    # the text starts as soon as we find "chapter 1" or "chapter i"
    # so move the book to just its relevant parts by deleting content before
    # the first chapter 1|i (volume headings are already gone, so later
    # volumes restarting at chapter 1 do not matter: we stop at the first hit)
    for i in range(len(words) - 1):
        if words[i] == "chapter" and words[i + 1] in ("1", "i"):
            words = words[i + 2:]
            break

    # back to string
    text = ' '.join(words)

    # replace chapters: only "chapter" followed by a number or a roman numeral
    # is treated as a heading, so prose such as "a chapter of accidents" survives
    text = re.sub(r"\bchapter (?:[ivxlc]+|[0-9]+)\b", "", text)

    ## method extensions (pseudocode ok)
    ## removing stop words
        # (word for word in book if word not in stopwords)
    ## removing numbers
        # (word for word in book if word not in [num_list])
        # or you could use regex across the whole thing as a string then join it back

    # back to list (again without empty tokens)
    return text.split()

In [179]:
# run the manual cleaning/tokenizing function over every loaded book
austen_books_tokenized = list(map(manual_tokenize, austen_books))

In [191]:
# first ten words of each of austen's books after manual cleaning
# (one inner list per book, in the same order as austen_books)
print([book[:10] for book in austen_books_tokenized])

[['no', 'one', 'who', 'had', 'ever', 'seen', 'catherine', 'morland', 'in', 'her'], ['sir', 'walter', 'elliot', 'of', 'kellynchhall', 'in', 'somersetshire', 'was', 'a', 'man'], ['about', 'thirty', 'years', 'ago', 'miss', 'maria', 'ward', 'of', 'huntingdon', 'with'], ['emma', 'woodhouse', 'handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home'], ['it', 'is', 'a', 'truth', 'universally', 'acknowledged', 'that', 'a', 'single', 'man'], ['the', 'family', 'of', 'dashwood', 'had', 'been', 'long', 'settled', 'in', 'sussex'], ['emma', 'woodhouse', 'handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home']]


In [224]:
## w/ nltk...

# Load the English Punkt sentence tokenizer once, as a notebook-level global;
# tokenize_with_nltk() reads `sent_detector` for its 'sentence' mode.
# NOTE(review): presumably needs the "punkt" resource to be present in the
# NLTK data path (e.g. via nltk.download) -- confirm for the installed version.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

def tokenize_with_nltk(book, unit):
    """Tokenize a book with NLTK at word or sentence granularity.

    Parameters
    ----------
    book : str
        Full text of the book.
    unit : str
        'word'     -- run nltk.word_tokenize over the whole text.
        'sentence' -- split the text on newlines, strip each non-empty line,
                      and run the notebook-level `sent_detector` on it.

    Returns
    -------
    list of str
        The word or sentence tokens, in document order.

    Raises
    ------
    ValueError
        If `unit` is neither 'word' nor 'sentence'.
    """
    if unit == 'word':
        return nltk.word_tokenize(book)

    ## dealing with new line chars can be a little annoying
    if unit == 'sentence':
        tokens = []
        for paragraph in book.split('\n'):
            paragraph = paragraph.strip()
            # skip blank (or whitespace-only) lines
            if paragraph:
                # extend() flattens as we go, avoiding the quadratic
                # sum(list_of_lists, []) idiom
                tokens.extend(sent_detector.tokenize(paragraph))
        return tokens

    # previously an unknown unit fell through to `return tokens` and failed
    # with an UnboundLocalError; fail with a clear message instead
    raise ValueError(f"unit must be 'word' or 'sentence', got {unit!r}")

In [225]:
# NOTE(review): despite the variable name, austen_books[0] is not necessarily
# Mansfield Park -- the recorded output below starts with 'northanger abbey'.
mansfield_park_nltk_sent_tokenized = tokenize_with_nltk(austen_books[0], 'sentence')

## first ten "sentences" of the first loaded book
## note that we have a harder time really "cleaning" up this text
## (the title and front-matter lines come through as their own "sentences");
## for large enough corpora of course, this does not matter.
print(mansfield_park_nltk_sent_tokenized[0:10])

['northanger abbey', 'biographical notice of the author', '1', 'the following pages are the production of a pen which has already contributed in no small degree to the entertainment of the public.', "and when the public, which has not been insensible to the merits of 'sense and sensibility,' 'pride and prejudice,' 'mansfield park,' and 'emma,' shall be informed that the hand which guided that pen is now mouldering in the grave, perhaps a brief account of jane austen will be read with a kindlier sentiment than simple curiosity.", 'short and easy will be the task of the mere biographer.', 'a life of usefulness, literature, and religion, was not by any means a life of event.', 'to those who lament their irreparable loss, it is consolatory to think that, as she never deserved disapprobation, so, in the circle of her family and friends, she never met reproof; that her wishes were not only reasonable, but gratified; and that to the little disappointments incidental to human life was never ad

In [226]:
# word-level NLTK tokens for the same book (austen_books[0]);
# NOTE(review): the variable name says Mansfield Park, but index 0 is whichever
# book was loaded first -- confirm before relying on the name.
mansfield_park_nltk_word_tokenized = tokenize_with_nltk(austen_books[0], 'word')

# peek at tokens 1000-1049; in the recorded output punctuation marks such as
# '.' and ',' appear as separate tokens
print(mansfield_park_nltk_word_tokenized[1000:1050])

['the', 'present', 'age', 'it', 'is', 'hazardous', 'to', 'mention', 'accomplishments', '.', 'our', 'authoress', 'would', ',', 'probably', ',', 'have', 'been', 'inferior', 'to', 'few', 'in', 'such', 'acquirements', ',', 'had', 'she', 'not', 'been', 'so', 'superior', 'to', 'most', 'in', 'higher', 'things', '.', 'she', 'had', 'not', 'only', 'an', 'excellent', 'taste', 'for', 'drawing', ',', 'but', ',', 'in']


remove punctuation - if you really wanted to, you have to do it again manually

Alternatively, NLTK has an API that lets you use your own regexes as the delimiters, but this can lead to other issues.

Also, be careful about the end result here: you might not intend for "Don't" to be split apart into two words.
    
remove numbers - similar to above, but with fewer side effects.