In [3]:
import nltk
import numpy as np
import os

from nltk.book import *
from nltk.corpus import stopwords

# English stopword list from NLTK's stopwords corpus; token_normal filters
# tokens against this module-level name.
sw = stopwords.words('english')

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [8]:
# Bare expression so the notebook displays text1, one of the sample texts
# brought into scope by `from nltk.book import *`.
text1

<Text: Moby Dick by Herman Melville 1851>

These first two functions are modified from the pattern assignment.

In [4]:
def token_normal(text):
    """
    Normalize raw text into a list of lowercase, alphabetic, non-stopword tokens.

    The text is lowercased and split on whitespace. A token is kept only if it
    consists entirely of alphabetic characters and is not in the module-level
    stopword collection `sw`. Tokens with punctuation attached (e.g. "sea,")
    fail `str.isalpha()` and are dropped rather than cleaned.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Build a set once per call so every stopword check is a hash lookup
    # instead of a linear scan over the stopword list.
    stop_set = set(sw)

    # str.split() with no arguments already ignores leading/trailing
    # whitespace, so no separate strip() is needed.
    return [w for w in text.lower().split() if w.isalpha() and w not in stop_set]



In [16]:
def get_patterns(text, num_words):
    """
    Compute descriptive statistics for a text after cleaning it with token_normal.

    Parameters
    ----------
    text : str
        Raw text to analyze.
    num_words : int
        How many of the most frequent tokens to report in 'top_words'.

    Returns
    -------
    dict
        'tokens'            : number of tokens left after cleaning
        'unique_tokens'     : number of distinct tokens
        'avg_token_length'  : mean token length in characters
        'lexical_diversity' : unique_tokens / tokens
        'top_words'         : (token, count) pairs from FreqDist.most_common

    Raises
    ------
    ValueError
        If `text` is empty, or if no tokens survive cleaning.
    """
    if len(text) == 0:
        raise ValueError("Can't work with empty text object.")

    tokens = token_normal(text)

    # Guard before dividing by the token count below. This fires whenever
    # cleaning removes every token, whatever the reason token_normal dropped them.
    if len(tokens) == 0:
        raise ValueError( " All of text is stopwords! " )

    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))

    return {
        'tokens': total_tokens,
        'unique_tokens': unique_tokens,
        'avg_token_length': np.mean([len(w) for w in tokens]),
        'lexical_diversity': unique_tokens / total_tokens,
        'top_words': FreqDist(tokens).most_common(num_words),
    }

These functions are useful for the second part of the assignment. You may have seen these at the end of a class when I was coding. 


In [17]:
def get_word_frac(word, fd_corpus, length):
    """
    Return the share of a corpus taken up by `word`.

    `fd_corpus` maps words to counts and `length` is the corpus size used as
    the denominator. A word absent from `fd_corpus` yields 0.
    """
    # Absent words short-circuit to zero; no division is attempted.
    if word not in fd_corpus:
        return 0

    return fd_corpus[word] / length

def get_ratio(word, fd_corpus_1, fd_corpus_2, len_1, len_2):
    """
    Return how much more common `word` is in corpus 1 than in corpus 2.

    Each corpus contributes the word's fraction (via get_word_frac); the result
    is fraction-in-1 divided by fraction-in-2. When the word's fraction in
    corpus 2 is not positive, NaN is returned instead of dividing.
    """
    share_1 = get_word_frac(word, fd_corpus_1, len_1)
    share_2 = get_word_frac(word, fd_corpus_2, len_2)

    # NaN marks "no meaningful ratio" when the denominator is not positive.
    return share_1 / share_2 if share_2 > 0 else float('NaN')

Now we start with the main function. 

In [20]:

def _top_ratios(fd_a, fd_b, len_a, len_b, num_words, ratio_cutoff):
    """Return up to num_words {word: ratio} entries, highest a-vs-b ratio first."""
    ratios = {
        word: get_ratio(word, fd_a, fd_b, len_a, len_b)
        for word, count in fd_a.items()
        # A word qualifies only if its count exceeds ratio_cutoff in BOTH
        # corpora; `and` short-circuits, so fd_b is only indexed for words it holds.
        if count > ratio_cutoff and word in fd_b and fd_b[word] > ratio_cutoff
    }

    # Stable descending sort: ties keep the order in which fd_a yielded them.
    ranked = sorted(ratios.items(), key=lambda item: item[1], reverse=True)

    # Slicing (rather than counting up to num_words and breaking) means a
    # non-positive num_words yields an empty dict instead of every ratio.
    return dict(ranked[:max(num_words, 0)])


def compare_texts(corpus_1, corpus_2, num_words = 10, ratio_cutoff=5):
    """
    Compare two raw texts and return a nested dictionary of results.

    Parameters
    ----------
    corpus_1, corpus_2 : str
        Raw texts to compare.
    num_words : int
        Maximum number of entries in each 'top_words' list and each ratio dict.
    ratio_cutoff : int
        A word is included in the ratio comparison only if its count is
        strictly greater than this value in both corpora.

    Returns
    -------
    dict
        'one', 'two'  : the get_patterns statistics for each corpus
        'one_vs_two'  : {word: ratio} of the word's frequency share in corpus 1
                        to its share in corpus 2, largest ratios first
        'two_vs_one'  : the same comparison in the other direction

    Raises
    ------
    ValueError
        Propagated from get_patterns when a corpus is empty or cleans to nothing.
    """
    results = {
        "one": get_patterns(corpus_1, num_words),
        "two": get_patterns(corpus_2, num_words),
    }

    # Frequency distributions over the cleaned tokens of each corpus.
    fd_1 = FreqDist(token_normal(corpus_1))
    fd_2 = FreqDist(token_normal(corpus_2))

    # Cleaned token counts serve as the denominators for the frequency shares.
    len_1 = results["one"]["tokens"]
    len_2 = results["two"]["tokens"]

    results["one_vs_two"] = _top_ratios(fd_1, fd_2, len_1, len_2, num_words, ratio_cutoff)
    results["two_vs_one"] = _top_ratios(fd_2, fd_1, len_2, len_1, num_words, ratio_cutoff)

    return results

Now let's just test it quickly on text1 (Moby Dick) and text2 (Sense and Sensibility) before we finish the assignment.

In [21]:
# Quick check: each text's tokens are joined back into a single string
# before being handed to compare_texts.
compare_texts(" ".join(text1), " ".join(text2), num_words=5)

{'one': {'tokens': 110459,
  'unique_tokens': 16802,
  'avg_token_length': 5.847671986891064,
  'lexical_diversity': 0.15211073792085752,
  'top_words': [('whale', 1226),
   ('one', 921),
   ('like', 647),
   ('upon', 566),
   ('man', 527)]},
 'two': {'tokens': 53986,
  'unique_tokens': 6148,
  'avg_token_length': 6.225817804616011,
  'lexical_diversity': 0.11388137665320638,
  'top_words': [('elinor', 685),
   ('could', 578),
   ('marianne', 566),
   ('mrs', 530),
   ('would', 515)]},
 'one_vs_two': {'sort': 8.254316584031683,
  'god': 7.428884925628514,
  'wind': 5.7019950087060955,
  'along': 5.37616672249432,
  'boy': 5.294709650941375},
 'two_vs_one': {'sister': 96.16517245211722,
  'mrs': 83.41659803538809,
  'lady': 36.31769810691661,
  'mother': 29.326967485397446,
  'john': 25.811928448686125}}