In [108]:
# reading files
from os import listdir
from os.path import isfile, join

# for punc list
import string

# for regex 
import regex as re

# for nltk (library) work
import nltk

# for sorting dict
import operator

# for collapsing experiments
import itertools

Contrived NLP example:

* ~read in raw text files from a directory with other stuff in it~ 
    * ~jane austen's books~
* tokenize
    * ~words (manually)~
    * ~words (nltk)~
    * ~sentences (nltk)~
* clean
    * ~remove punctuation (manually)~
    * ~remove numbers (nltk)~
    * ~remove stopwords (from pre-baked list) (manually)~
    * ~use regex for something (manually)~
* calculating important metrics
    * ~most frequent words~
    * ~unigrams~
    * ~bigrams~
    * ~trigrams~
    * tf-idf

Practical Numpy example:

* make a random array of 1,000 integers between 1 and 10,000
* make an array with 17 0's
* calculate five number summary
* find the standard deviation and the mean
* find the mode

In [9]:
## Read in text files
def read_austen_data(directory_name):
    texts = []
    files = [f for f in listdir(directory_name) if isfile(join(directory_name, f))]
    
    for file in files:
        if ("austen" in file):
            with open(directory_name + file, encoding="utf-8") as f:
                #lowercase text and append
                texts.append(f.read().lower())
    
    return texts

In [10]:
austen_books = read_austen_data("data/")

print(len(austen_books))

7


In [165]:
def manual_tokenize(text):
    
    # replace new lines
    text = text.replace("\n", " ")
    
    # special weird punc
    text = text.replace("---", " ")

    # remove punctuation here.
    punc_list = [p for p in string.punctuation]
    
    text = ''.join([char for char in text if char not in punc_list])
            
    # we need to strip all the extraneous spaces (more than 2)
    text = re.sub('\s{2,}', ' ', text)
    
    # remove volume numbers
    text = re.sub("volume i{1,}|volume [0-9]{1,}|volume one|volume two|volume three", "", text)
    
    # listify, splitting on spaces
    text = text.split(' ')

    # the text starts as soon as we find "chapter 1" or "chapter i"
    # so let's move the book to just its relevant parts by finding and deleting content before the first chapter 1|i
    # since the volume-paradigm has multiple ch1, ch2, and so on
    # first, find chapter 1, and replace everything prior
    
    # then, convert back to string, replace all the chapters, then convert back to list
    for i in range(len(text) - 1):
        window_val = ' '.join(text[i:i+2])
        if (window_val == "chapter 1" or window_val == "chapter i"):
            text = text[i+2:]
            break
    
    # back to string
    text = ' '.join(text)
        
    # replace chapters
    text = re.sub("chapter [a-z]+|chapter [0-9]+", "", text)
    
    # back to list
    text = text.split(" ")
    
    # get rid of empties
    text = [word for word in text if word != ""]
    
    ## method extensions (pseudocode ok)
    ## removing stop words 
        # (word for word in book if word not in stopwords)
    ## removing numbers 
        # (word for word in book if word not in [num_list])
        # or you could use regex across the whole thing as a string then join it back
    
    return text

In [166]:
# iteratively call the cleaning function
austen_books_tokenized = [manual_tokenize(book) for book in austen_books]

In [162]:
# first ten words of austen's books
print([book[:10] for book in austen_books_tokenized])

[['no', 'one', 'who', 'had', 'ever', 'seen', 'catherine', 'morland', 'in', 'her'], ['sir', 'walter', 'elliot', 'of', 'kellynchhall', 'in', 'somersetshire', 'was', 'a', 'man'], ['about', 'thirty', 'years', 'ago', 'miss', 'maria', 'ward', 'of', 'huntingdon', 'with'], ['emma', 'woodhouse', 'handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home'], ['it', 'is', 'a', 'truth', 'universally', 'acknowledged', 'that', 'a', 'single', 'man'], ['the', 'family', 'of', 'dashwood', 'had', 'been', 'long', 'settled', 'in', 'sussex'], ['emma', 'woodhouse', 'handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home']]


In [14]:
## w/ ntlk...

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

def tokenize_with_nltk(book, unit):
        
    if (unit == 'word'):
        tokens=nltk.word_tokenize(book)
    
    ## dealing with new line chars can be a little annoying
    elif (unit == 'sentence'): 
        tokens = []
        paragraphs = [p for p in book.split('\n') if p]

        for paragraph in paragraphs:
            if (paragraph != ''):
                tokens.append(sent_detector.tokenize(paragraph.strip()))
        
        tokens = sum(tokens, [])
    
    return tokens

In [15]:
mansfield_park_nltk_sent_tokenized = tokenize_with_nltk(austen_books[0], 'sentence')

## first ten "sentences" of Mansfield park
## note that we have a harder time really "cleaning" up this text
## for large enough corpora of course, this does not matter.
print(mansfield_park_nltk_sent_tokenized[0:10])

['northanger abbey', 'biographical notice of the author', '1', 'the following pages are the production of a pen which has already contributed in no small degree to the entertainment of the public.', "and when the public, which has not been insensible to the merits of 'sense and sensibility,' 'pride and prejudice,' 'mansfield park,' and 'emma,' shall be informed that the hand which guided that pen is now mouldering in the grave, perhaps a brief account of jane austen will be read with a kindlier sentiment than simple curiosity.", 'short and easy will be the task of the mere biographer.', 'a life of usefulness, literature, and religion, was not by any means a life of event.', 'to those who lament their irreparable loss, it is consolatory to think that, as she never deserved disapprobation, so, in the circle of her family and friends, she never met reproof; that her wishes were not only reasonable, but gratified; and that to the little disappointments incidental to human life was never ad

In [16]:
mansfield_park_nltk_word_tokenized = tokenize_with_nltk(austen_books[0], 'word')

print(mansfield_park_nltk_word_tokenized[1000:1050])

['the', 'present', 'age', 'it', 'is', 'hazardous', 'to', 'mention', 'accomplishments', '.', 'our', 'authoress', 'would', ',', 'probably', ',', 'have', 'been', 'inferior', 'to', 'few', 'in', 'such', 'acquirements', ',', 'had', 'she', 'not', 'been', 'so', 'superior', 'to', 'most', 'in', 'higher', 'things', '.', 'she', 'had', 'not', 'only', 'an', 'excellent', 'taste', 'for', 'drawing', ',', 'but', ',', 'in']


nltk - remarks 

remove punctuation - if you really wanted to, you have to do it again manually

alternatively nltk has an api that lets you use your own Regexes as the delimiters but this can lead to other issues

also you need to be careful about the end result here because you might not intend for Don't to split apart into 2 words.
    
remove numbers - similar to above, but less side effects.

In [42]:
## Text metrics

## unigrams (tokens)
pride = austen_books_tokenized[4]

print(pride[:100])

pride_unigrams = list(set(pride))

## word frequencies, then return top ten
counts = {}
for word in pride:
    if (word not in counts):
        counts[word] = 1
    else:
        counts[word] += 1
        
## unsorted output:
print({k: counts[k] for k in list(counts)[:10]})

## now sort the dictionary by value
sorted_counts = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
## use list comprehension to draw top 10 most frequent words
print([word + ": " + str(freq) for word, freq in sorted_counts[:10]])

['it', 'is', 'a', 'truth', 'universally', 'acknowledged', 'that', 'a', 'single', 'man', 'in', 'possession', 'of', 'a', 'good', 'fortune', 'must', 'be', 'in', 'want', 'of', 'a', 'wife', 'however', 'little', 'known', 'the', 'feelings', 'or', 'views', 'of', 'such', 'a', 'man', 'may', 'be', 'on', 'his', 'first', 'entering', 'a', 'neighbourhood', 'this', 'truth', 'is', 'so', 'well', 'fixed', 'in', 'the', 'minds', 'of', 'the', 'surrounding', 'families', 'that', 'he', 'is', 'considered', 'the', 'rightful', 'property', 'of', 'some', 'one', 'or', 'other', 'of', 'their', 'daughters', 'my', 'dear', 'mr', 'bennet', 'said', 'his', 'lady', 'to', 'him', 'one', 'day', 'have', 'you', 'heard', 'that', 'netherfield', 'park', 'is', 'let', 'at', 'last', 'mr', 'bennet', 'replied', 'that', 'he', 'had', 'not', 'but', 'it']
{'it': 1520, 'is': 858, 'a': 1941, 'truth': 27, 'universally': 3, 'acknowledged': 20, 'that': 1566, 'single': 12, 'man': 142, 'in': 1861}
['the: 4321', 'to: 4127', 'of: 3596', 'and: 3528', 

In [47]:
pride = austen_books_tokenized[4]

## most frequent bigrams
bigrams = {}
## decrement range by 2 to avoid going out of bounds
for i in range(len(pride) - 2):
    bigram = ' '.join(pride[i:i+2])
    if (bigram not in bigrams):
        bigrams[bigram] = 1
    else:
        bigrams[bigram] += 1
        
## sorted call: (dictionary items, key=operator.itemgetter(1), reverse = True)
sorted_bigrams = sorted(bigrams.items(), key=operator.itemgetter(1), reverse = True)

print([word + ": " + str(val) for word, val in sorted_bigrams[:10]])

['of the: 463', 'to be: 442', 'in the: 382', 'i am: 301', 'of her: 260', 'to the: 251', 'it was: 250', 'mr darcy: 241', 'of his: 235', 'she was: 212']


In [61]:
pride = austen_books_tokenized[4]
## trigrams!
trigrams = {}

for i in range(len(pride) - 3):
    trigram = ' '.join(pride[i: i+3])
    if (trigram not in trigrams):
        trigrams[trigram] = 1
    else:
        trigrams[trigram] += 1

## remove any trigram that has a value of 1.
# print(len(trigrams))
trigrams = {word:freq for (word,freq) in trigrams.items() if freq > 20}
# print(len(trigrams))

## now do the sorting.
sorted_trigrams = sorted(trigrams.items(), key=operator.itemgetter(1), reverse = True)

print("\n".join([word + ": " + str(val) for word, val in sorted_trigrams]))

i do not: 61
i am sure: 61
as soon as: 55
she could not: 49
that he had: 37
in the world: 34
it would be: 33
i am not: 32
i dare say: 31
it was not: 30
could not be: 30
that he was: 29
that it was: 28
on the subject: 28
would have been: 27
as well as: 27
by no means: 26
and she was: 25
one of the: 25
he had been: 25
that she had: 24
the rest of: 23
i did not: 23
a great deal: 23
in spite of: 23
it was a: 23
do not know: 22
i have not: 22
uncle and aunt: 22
she did not: 22
mrs bennet was: 21
not to be: 21


In [172]:
## td-idf

## where tf-df = frequency of word W in doc A / number of docs word A appears in

## input / output ...
## we need all the docs
## check on a per doc level> for each doc> for each of its words ... 
def tf_idf(docs):
    
    uniques = list([list(set(doc)) for doc in docs])
    
    vocabulary = list(set(sum(uniques, [])))
        
    # arrange a matrix where each column is a word in the vocab and each row is a book
    
    # todo - do that.
    for doc in docs:
        for word in doc:
            
    
    return "tim"

In [173]:
tf_idf(austen_books_tokenized)

'tim'

In [124]:
test1 = ["absence", "oh", "absence", "add", "to"]
test2 = ["absence", "add", "to", "to", "to"]
l = [test1, test2]

print(set(list(itertools.chain.from_iterable(l))))

{'add', 'oh', 'absence', 'to'}
