# Code it: Trisyllables in Walden

## Sample code


In [None]:
###########
## Setup ##
###########

## This new notebook is a new environment, so we have to set it up again.

# Import packages
from nltk import *
import pickle
import string
from nltk.corpus import cmudict

# Custom function to count the syllables for each word passed to it
def syllables(word):
    """Return the syllable count of ``word`` according to CMUdict.

    The word is looked up in the module-level ``cmu`` mapping, where each
    entry holds one or more pronunciations and each pronunciation is a
    sequence of phoneme strings. Phonemes whose last character is a digit
    are counted (in CMUdict these are the vowel sounds, one per syllable).
    When a word has several pronunciations, the largest count wins.

    Returns ``None`` when ``word`` is not a key of ``cmu``.
    """
    try:
        pronunciations = cmu[word]
    except KeyError:
        # Unknown word: there is no transcription to count, so report nothing.
        return None

    def count_digit_phonemes(pronunciation):
        # One hit per phoneme that ends in a digit.
        return sum(1 for phoneme in pronunciation
                   if phoneme[-1] in string.digits)

    return max(map(count_digit_phonemes, pronunciations))

# Import the Walden data we saved from the Text Cleaning chapter.
# Opening the file in a `with` block guarantees the file handle is closed
# as soon as the tokens have been loaded (a bare open() would leave it open).
# NOTE: pickle can run arbitrary code while loading, so only unpickle files
#       that you created yourself.
with open("../working/walden_clean_tokens.pkl", "rb") as token_file:
    walden_tokens = pickle.load(token_file)

# Load the CMUdict pronunciation dictionary once and keep it under the short
# name 'cmu'; the syllables() function looks words up in it.
cmu = cmudict.dict()

In [None]:
# Rebuild the frequency distribution so we know how often each distinct token occurs
token_frequencies = FreqDist(walden_tokens)

# Walk through every distinct token together with its count.
#      Each token is looked up in CMUdict via syllables();
#           tokens with exactly 3 syllables are kept as a [token, count] pair.
#      Tokens that CMUdict does not know come back as None and are skipped.
trisyl_tokens = [
    [word, count]
    for word, count in token_frequencies.items()
    if syllables(word) == 3
]

# Reorder the pairs in place by their count (index 1), most frequent first
trisyl_tokens.sort(key=lambda pair: pair[1], reverse=True)

# We now have a list of lists, each holding a 3-syllable token and its frequency.
# Show the 15 most frequent ones to see how we did!
trisyl_tokens[:15]