In [None]:
# 4   Writing Structured Programs

In [None]:
# 4.1   Back to the Basics

In [None]:
# Conditions

In [None]:
# In the condition part of an if statement, a nonempty string or list is evaluated as true, while an empty string or list evaluates as false.

In [1]:
mixed = ['cat', '', ['dog'], []]

In [2]:
for element in mixed:
    if element:
        print(element)

cat
['dog']


In [3]:
# What's the difference between using if...elif as opposed to using a couple of if statements in a row? Well, consider the following situation:

In [4]:
animals = ['cat', 'dog']

In [5]:
if 'cat' in animals:
    print(1)
elif 'dog' in animals:
    print(2)

1


In [6]:
if 'cat' in animals:
    print(1)
if 'dog' in animals:
    print(2)

1
2


In [None]:
#Since the if clause of the statement is satisfied, Python never tries to evaluate the elif clause, so we never get to print out 2. By contrast, if we replaced the elif by an if, then we would print out both 1 and 2. So an elif clause potentially gives us more information than a bare if clause; when it evaluates to true, it tells us not only that the condition is satisfied, but also that the condition of the main if clause was not satisfied.

In [None]:
# The functions all() and any() can be applied to a list (or other sequence) to check whether all or any items meet some condition:

In [7]:
sent = ['No', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '.']

In [8]:
all(len(w) > 4 for w in sent)

False

In [9]:
any(len(w) > 4 for w in sent)

True

In [None]:
# 4.2   Sequences

In [None]:

# So far, we have seen two kinds of sequence object: strings and lists. Another kind of sequence is called a tuple. Tuples are formed with the comma operator, and typically enclosed using parentheses. We've actually seen them in the previous chapters, and sometimes referred to them as "pairs", since there were always two members. 

In [None]:

# However, tuples can have any number of members. Like lists and strings, tuples can be indexed and sliced, and have a length 

In [10]:
t = 'walk', 'fem', 3

In [11]:
t

('walk', 'fem', 3)

In [12]:
t[0]

'walk'

In [13]:
t[1:]

('fem', 3)

In [14]:

len(t)

3

In [None]:
# Let's compare strings, lists and tuples directly, and do the indexing, slice, and length operation on each type:

In [17]:
raw = 'I turned off the spectroroute'

In [18]:
text = ['I', 'turned', 'off', 'the', 'spectroroute']

In [19]:

pair = (6, 'turned')

In [22]:
pair

(6, 'turned')

In [20]:
raw[2], text[3], pair[1]

('t', 'the', 'turned')

In [21]:
raw[-3:], text[-3:], pair[-2:]

('ute', ['off', 'the', 'spectroroute'], (6, 'turned'))

In [23]:
len(raw), len(text), len(pair)

(29, 5, 2)

In [None]:
# Various ways to iterate over sequences

In [None]:
#Python Expression	Comment
#for item in s	iterate over the items of s
#for item in sorted(s)	iterate over the items of s in order
#for item in set(s)	iterate over unique elements of s
#for item in reversed(s)	iterate over elements of s in reverse
#for item in set(s).difference(t)	iterate over elements of s not in t

In [None]:
# We can convert between these sequence types. For example, tuple(s) converts any kind of sequence into a tuple, and list(s) converts any kind of sequence into a list. We can convert a list of strings to a single string using the join() function, e.g. ':'.join(words).

In [24]:
tuple(raw)

('I',
 ' ',
 't',
 'u',
 'r',
 'n',
 'e',
 'd',
 ' ',
 'o',
 'f',
 'f',
 ' ',
 't',
 'h',
 'e',
 ' ',
 's',
 'p',
 'e',
 'c',
 't',
 'r',
 'o',
 'r',
 'o',
 'u',
 't',
 'e')

In [25]:
tuple(text)

('I', 'turned', 'off', 'the', 'spectroroute')

In [26]:
list(raw)

['I',
 ' ',
 't',
 'u',
 'r',
 'n',
 'e',
 'd',
 ' ',
 'o',
 'f',
 'f',
 ' ',
 't',
 'h',
 'e',
 ' ',
 's',
 'p',
 'e',
 'c',
 't',
 'r',
 'o',
 'r',
 'o',
 'u',
 't',
 'e']

In [27]:
list(pair)

[6, 'turned']

In [28]:
':'.join(text)

'I:turned:off:the:spectroroute'

In [None]:
# Some other objects, such as a FreqDist, can be converted into a sequence (using list() or sorted()) and support iteration, e.g.

In [29]:
import nltk

In [30]:
raw = 'Red lorry, yellow lorry, red lorry, yellow lorry.'

In [31]:
text = nltk.word_tokenize(raw)

In [32]:
text

['Red',
 'lorry',
 ',',
 'yellow',
 'lorry',
 ',',
 'red',
 'lorry',
 ',',
 'yellow',
 'lorry',
 '.']

In [33]:
fdist = nltk.FreqDist(text)

In [34]:
fdist

FreqDist({'lorry': 4, ',': 3, 'yellow': 2, 'Red': 1, 'red': 1, '.': 1})

In [35]:
list(fdist)

['Red', 'lorry', ',', 'yellow', 'red', '.']

In [36]:
for key in fdist:
    print(key + ':', fdist[key], end='; ')

Red: 1; lorry: 4; ,: 3; yellow: 2; red: 1; .: 1; 

In [None]:
# In the next example, we use tuples to re-arrange the contents of our list. (We can omit the parentheses because the comma has higher precedence than assignment.)

In [37]:
words = ['I', 'turned', 'off', 'the', 'spectroroute']

In [38]:
words[2], words[3], words[4] = words[3], words[4], words[2]

In [39]:
words

['I', 'turned', 'the', 'spectroroute', 'off']

In [40]:
(words[2], words[3], words[4]) = (words[3], words[4], words[2])

In [41]:
words

['I', 'turned', 'spectroroute', 'off', 'the']

In [42]:
# Exercise 1: Use tuples to rearrange the contents of a list (from: Ex=["we","take","into","account","this","fact"]) to Ex=["we","take","this","fact","into","account"]

In [51]:
Ex = ["we","take","into","account","this","fact"]

In [52]:
Ex[2], Ex[3], Ex[4], Ex[5] =  Ex[4], Ex[5], Ex[2], Ex[3]

In [53]:
Ex

['we', 'take', 'this', 'fact', 'into', 'account']

In [None]:
# This is an idiomatic and readable way to move items inside a list. It is equivalent to the following traditional way of doing such tasks that does not use tuples (notice that this method needs a temporary variable tmp).



In [55]:
tmp = words[2]

In [56]:
words[2] = words[3]

In [57]:
words[3] = words[4]

In [58]:
words[4] = tmp

In [None]:
#  There are also functions that modify the structure of a sequence and which can be handy for language processing. Thus, zip() takes the items of two or more sequences and "zips" them together into a single list of tuples. 


In [59]:



words = ['I', 'turned', 'off', 'the', 'spectroroute']

In [60]:


tags=['noun', 'verb', 'prep', 'det', 'noun']

In [61]:
zip(words, tags)

<zip at 0x1ea112efb88>

In [62]:

list(zip(words, tags))

[('I', 'noun'),
 ('turned', 'verb'),
 ('off', 'prep'),
 ('the', 'det'),
 ('spectroroute', 'noun')]

In [None]:
# Given a sequence s, enumerate(s) returns pairs consisting of an index and the item at that index.

In [63]:
list(enumerate(words))

[(0, 'I'), (1, 'turned'), (2, 'off'), (3, 'the'), (4, 'spectroroute')]

In [None]:
# For some NLP tasks it is necessary to cut up a sequence into two or more parts. For instance, we might want to "train" a system on 90% of the data and test it on the remaining 10%. To do this we decide the location where we want to cut the data [1], then cut the sequence at that location [2].



In [64]:
 text = nltk.corpus.nps_chat.words()

In [65]:
cut = int(0.9 * len(text)) 

In [66]:
training_data, test_data = text[:cut], text[cut:]

In [67]:
text == training_data + test_data

True

In [68]:
len(training_data) / len(test_data)

9.0

In [None]:
# Exercise 2. Please divide names corpus into two parts. (95% of the data is used to "train" the model and 5% of data is used to test the model).

In [69]:
import nltk

In [70]:
text = nltk.corpus.names.words()

In [71]:
cut = int(0.95 * len(text)) 

In [72]:
training_data, test_data = text[:cut], text[cut:]

In [73]:
len(training_data) / len(test_data)

18.959798994974875

In [None]:
# Combining Different Sequence Types


In [None]:
# Let's combine our knowledge of these three sequence types, together with list comprehensions, to perform the task of sorting the words in a string by their length.

In [74]:
words = 'I turned off the spectroroute'.split() 

In [75]:
words

['I', 'turned', 'off', 'the', 'spectroroute']

In [76]:
wordlens = [(len(word), word) for word in words]

In [77]:
wordlens

[(1, 'I'), (6, 'turned'), (3, 'off'), (3, 'the'), (12, 'spectroroute')]

In [79]:
wordlens.sort()

In [80]:
wordlens

[(1, 'I'), (3, 'off'), (3, 'the'), (6, 'turned'), (12, 'spectroroute')]

In [81]:
' '.join(w for (_, w) in wordlens)

'I off the turned spectroroute'

In [None]:
# Each of the above lines of code contains a significant feature. A simple string is actually an object with methods defined on it such as split() [1]. We use a list comprehension to build a list of tuples [2], where each tuple consists of a number (the word length) and the word, e.g. (3, 'the'). We use the sort() method [3] to sort the list in-place. Finally, we discard the length information and join the words back into a single string [4]. (The underscore [4] is just a regular Python variable, but we can use underscore by convention to indicate that we will not use its value.)

In [None]:
# We began by talking about the commonalities in these sequence types, but the above code illustrates important differences in their roles. 

In [None]:
# A list is typically a sequence of objects all having the same type, of arbitrary length. We often use lists to hold sequences of words. In contrast, a tuple is typically a collection of objects of different types, of fixed length. We often use a tuple to hold a record, a collection of different fields relating to some entity. 

In [None]:
# This distinction between the use of lists and tuples takes some getting used to, so here is another example:

In [82]:
lexicon = [
        ('the', 'det', ['Di:', 'D@']),
        ('off', 'prep', ['Qf', 'O:f'])
]

In [83]:
# Here, a lexicon is represented as a list because it is a collection of objects of a single type — lexical entries — of no predetermined length. An individual entry is represented as a tuple because it is a collection of objects with different interpretations, such as the orthographic form, the part of speech, and the pronunciations 

In [84]:
# A good way to decide when to use tuples vs lists is to ask whether the interpretation of an item depends on its position. For example, a tagged token combines two strings having different interpretation, and we choose to interpret the first item as the token and the second item as the tag. Thus we use tuples like this: ('class', 'noun'); a tuple of the form ('noun', 'class') would be nonsensical since it would be a word noun tagged class. In contrast, the elements of a text are all tokens, and position is not significant. Thus we use lists like this: ['venetian', 'blind']; a list of the form ['blind', 'venetian'] would be equally valid. The linguistic meaning of the words might be different, but the interpretation of list items as tokens is unchanged.



In [85]:
# The distinction between lists and tuples has been described in terms of usage. However, there is a more fundamental difference: in Python, lists are mutable, while tuples are immutable. In other words, lists can be modified, while tuples cannot. Here are some of the operations on lists that do in-place modification of the list.



In [86]:
lexicon.sort()

In [87]:
lexicon

[('off', 'prep', ['Qf', 'O:f']), ('the', 'det', ['Di:', 'D@'])]

In [88]:
lexicon[1] = ('turned', 'VBD', ['t3:nd', 't3`nd'])

In [89]:
 del lexicon[0]

In [90]:
lexicon

[('turned', 'VBD', ['t3:nd', 't3`nd'])]

In [None]:
# Generator Expressions


In [None]:
# We've been making heavy use of list comprehensions, for compact and readable processing of texts. Here's an example where we tokenize and normalize a text

In [91]:
text = '''"When I use a word," Humpty Dumpty said in rather a scornful tone,
  "it means just what I choose it to mean - neither more nor less."'''

In [92]:
[w.lower() for w in nltk.word_tokenize(text)]

['``',
 'when',
 'i',
 'use',
 'a',
 'word',
 ',',
 "''",
 'humpty',
 'dumpty',
 'said',
 'in',
 'rather',
 'a',
 'scornful',
 'tone',
 ',',
 '``',
 'it',
 'means',
 'just',
 'what',
 'i',
 'choose',
 'it',
 'to',
 'mean',
 '-',
 'neither',
 'more',
 'nor',
 'less',
 '.',
 "''"]

In [93]:
# Suppose we now want to process these words further. We can do this by inserting the above expression inside a call to some other function , but Python allows us to omit the brackets

In [94]:
sorted([w.lower() for w in nltk.word_tokenize(text)])

["''",
 "''",
 ',',
 ',',
 '-',
 '.',
 '``',
 '``',
 'a',
 'a',
 'choose',
 'dumpty',
 'humpty',
 'i',
 'i',
 'in',
 'it',
 'it',
 'just',
 'less',
 'mean',
 'means',
 'more',
 'neither',
 'nor',
 'rather',
 'said',
 'scornful',
 'to',
 'tone',
 'use',
 'what',
 'when',
 'word']

In [95]:
max([w.lower() for w in nltk.word_tokenize(text)]) # 1

'word'

In [96]:
max(w.lower() for w in nltk.word_tokenize(text)) #2

'word'

In [None]:
# The second line uses a generator expression. This is more than a notational convenience: in many language processing situations, generator expressions will be more efficient. In [1], storage for the list object must be allocated before the value of max() is computed. If the text is very large, this could be slow. In [2], the data is streamed to the calling function. Since the calling function simply has to find the maximum value — the word which comes latest in lexicographic sort order — it can process the stream of data without having to store anything more than the maximum value seen so far.

In [None]:
# Q and A

In [None]:
# 4.3   Questions of Style (See Slides)

In [None]:
# Python Coding Style

In [None]:
# Procedural vs Declarative Style


In [None]:
# We have just seen how the same task can be performed in different ways, with implications for efficiency. Another factor influencing program development is programming style. Consider the following program to compute the average length of words in the Brown Corpus:

In [97]:
tokens = nltk.corpus.brown.words(categories='news')

In [98]:
count = 0

In [99]:
total=0

In [100]:
for token in tokens:
    count+=1
    total+=len(token)

In [101]:
total/count

4.401545438271973

In [None]:
# In this program we use the variable count to keep track of the number of tokens seen, and total to store the combined length of all words.

In [None]:
# The two variables are just like a CPU's registers, accumulating values at many intermediate stages, values that are meaningless until the end.

In [None]:
#We say that this program is written in a procedural style, dictating the machine operations step by step.

In [None]:
# Now consider the following program that computes the same thing:

In [102]:
total = sum(len(t) for t in tokens)

In [103]:
print(total / len(tokens))

4.401545438271973


In [None]:
#The first line uses a generator expression to sum the token lengths, while the second line computes the average as before. Each line of code performs a complete, meaningful task, which can be understood in terms of high-level properties like: "total is the sum of the lengths of the tokens". Implementation details are left to the Python interpreter. The second program uses a built-in function, and constitutes programming at a more abstract level; the resulting code is more declarative. 

In [None]:
# Another case where a loop variable seems to be necessary is for printing a counter with each line of output. Instead, we can use enumerate(), which processes a sequence s and produces a tuple of the form (i, s[i]) for each item in s, starting with (0, s[0]). Here we enumerate the key-value pairs of the frequency distribution, resulting in nested tuples (rank, (word, count)). We print rank+1 so that the counting appears to start from 1, as required when producing a list of ranked items.



In [104]:
fd = nltk.FreqDist(nltk.corpus.brown.words())

In [105]:
cumulative = 0.0

In [106]:
most_common_words = [word for (word, count) in fd.most_common()]

In [107]:
fd.most_common()[0:10]

[('the', 62713),
 (',', 58334),
 ('.', 49346),
 ('of', 36080),
 ('and', 27915),
 ('to', 25732),
 ('a', 21881),
 ('in', 19536),
 ('that', 10237),
 ('is', 10011)]

In [108]:
most_common_words[0:10]

['the', ',', '.', 'of', 'and', 'to', 'a', 'in', 'that', 'is']

In [109]:
list(enumerate(most_common_words[0:10]))

[(0, 'the'),
 (1, ','),
 (2, '.'),
 (3, 'of'),
 (4, 'and'),
 (5, 'to'),
 (6, 'a'),
 (7, 'in'),
 (8, 'that'),
 (9, 'is')]

In [110]:
for rank, word in enumerate(most_common_words):
        cumulative += fd.freq(word)
        print("%3d %6.2f%% %s" % (rank + 1, cumulative * 100, word))
        if cumulative > 0.25: 
            break

  1   5.40% the
  2  10.42% ,
  3  14.67% .
  4  17.78% of
  5  20.19% and
  6  22.40% to
  7  24.29% a
  8  25.97% in


In [None]:
# Exercise 3: Use enumerate () fuction to rank the cumulative frequency (30%) of the most commmon words in web text corpus.

In [223]:
fd = nltk.FreqDist(nltk.corpus.webtext.words())

In [224]:
cumulative=0.0

In [225]:
most_common_words = [word for (word, count) in fd.most_common()]

In [226]:
for rank, word in enumerate(most_common_words):
        cumulative += fd.freq(word)
        print("%3d %6.2f%% %s" % (rank + 1, cumulative * 100, word))
        if cumulative > 0.30: 
            break

  1   4.46% .
  2   8.02% :
  3  11.12% ,
  4  13.83% '
  5  15.80% I
  6  17.65% the
  7  19.22% to
  8  20.67% a
  9  21.81% you
 10  22.90% ?
 11  23.96% in
 12  24.99% and
 13  26.01% !
 14  26.95% #
 15  27.83% t
 16  28.66% -
 17  29.48% s
 18  30.30% on


In [None]:
# It's sometimes tempting to use loop variables to store a maximum or minimum value seen so far. Let's use this method to find the longest word in a text.

In [111]:
text = nltk.corpus.gutenberg.words('milton-paradise.txt')

In [112]:
longest = ''

In [113]:
for word in text:
    if len(word) > len(longest):
        longest = word

In [114]:
longest

'unextinguishable'

In [None]:
# However, a more transparent solution uses two list comprehensions, both having forms that should be familiar by now:

In [115]:
maxlen = max(len(word) for word in text)

In [116]:
maxlen

16

In [117]:
[word for word in text if len(word) == maxlen]

['unextinguishable',
 'transubstantiate',
 'inextinguishable',
 'incomprehensible']

In [None]:
# Note that our first solution found the first word having the longest length, while the second solution found all of the longest words (which is usually what we would want). 

In [None]:
# Exercise 4: Use a list comprehension to find out the longest words in "shakespeare-macbeth.txt" in gutenberg corpus.

In [122]:
text = nltk.corpus.gutenberg.words('shakespeare-macbeth.txt')

In [123]:
maxlen = max(len(word) for word in text)

In [124]:
maxlen

15

In [125]:
[word for word in text if len(word) == maxlen]

['Voluptuousnesse']

In [None]:
# Some Legitimate Uses for Counters

In [None]:
# There are cases where we still want to use loop variables in a list comprehension. For example, we need to use a loop variable to extract successive overlapping n-grams from a list:

In [126]:
sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']

In [127]:
n = 3

In [128]:
[sent[i:i+n] for i in range(len(sent)-n+1)]

[['The', 'dog', 'gave'],
 ['dog', 'gave', 'John'],
 ['gave', 'John', 'the'],
 ['John', 'the', 'newspaper']]

In [None]:
#It is quite tricky to get the range of the loop variable right. Since this is a common operation in NLP, NLTK supports it with functions bigrams(text) and trigrams(text), and a general purpose ngrams(text, n).

In [129]:
list(nltk.bigrams(sent))

[('The', 'dog'),
 ('dog', 'gave'),
 ('gave', 'John'),
 ('John', 'the'),
 ('the', 'newspaper')]

In [131]:
list(nltk.trigrams(sent))

[('The', 'dog', 'gave'),
 ('dog', 'gave', 'John'),
 ('gave', 'John', 'the'),
 ('John', 'the', 'newspaper')]

In [132]:
list(nltk.ngrams(sent,4))

[('The', 'dog', 'gave', 'John'),
 ('dog', 'gave', 'John', 'the'),
 ('gave', 'John', 'the', 'newspaper')]

In [None]:
# Here's an example of how we can use loop variables in building multidimensional structures. For example, to build an array with m rows and n columns, where each cell is a set, we could use a nested list comprehension:

In [133]:
pip install pprint

Collecting pprint
Note: you may need to restart the kernel to use updated packages.


  ERROR: Could not find a version that satisfies the requirement pprint (from versions: none)
ERROR: No matching distribution found for pprint


In [135]:
import pprint

In [137]:
m, n = 3, 7

In [138]:
array = [[set() for i in range(n)] for j in range(m)]

In [139]:
array

[[set(), set(), set(), set(), set(), set(), set()],
 [set(), set(), set(), set(), set(), set(), set()],
 [set(), set(), set(), set(), set(), set(), set()]]

In [140]:
array[2][5].add('Alice')

In [141]:
 pprint.pprint(array)

[[set(), set(), set(), set(), set(), set(), set()],
 [set(), set(), set(), set(), set(), set(), set()],
 [set(), set(), set(), set(), set(), {'Alice'}, set()]]


In [142]:
print(array)

[[set(), set(), set(), set(), set(), set(), set()], [set(), set(), set(), set(), set(), set(), set()], [set(), set(), set(), set(), set(), {'Alice'}, set()]]


In [None]:
# 4.4   Functions: The Foundation of Structured Programming

In [None]:
# Function Inputs and Outputs

In [None]:
# We pass information to functions using a function's parameters, the parenthesized list of variables and constants following the function's name in the function definition. Here's a complete example:



In [143]:
def repeat(msg, num):
    return ' '.join([msg] * num)

In [144]:
monty = 'Monty Python'

In [145]:

repeat(monty, 3)

'Monty Python Monty Python Monty Python'

In [146]:
# It is not necessary to have any parameters, as we see in the following example:

In [147]:
def monty():
    return "Monty Python"

In [148]:
monty()

'Monty Python'

In [None]:
# A function usually communicates its results back to the calling program via the return statement, as we have just seen. To the calling program, it looks as if the function call had been replaced with the function's result, e.g.:

In [149]:
repeat(monty(), 3)

'Monty Python Monty Python Monty Python'

In [150]:
repeat('Monty Python', 3)

'Monty Python Monty Python Monty Python'

In [None]:
# Consider the following three sort functions. The third one is dangerous because a programmer could use it without realizing that it had modified its input. In general, functions should modify the contents of a parameter (my_sort1()), or return a value (my_sort2()), not both (my_sort3()).

In [151]:
# good: modifies its argument, no return value

In [152]:
def my_sort1(mylist):
    mylist.sort()

In [None]:
# good: doesn't touch its argument, returns value

In [153]:
def my_sort2(mylist):
    return sorted(mylist)

In [None]:
# bad: modifies its argument and also returns it

In [154]:
def my_sort3(mylist):
    mylist.sort()
    return mylist

In [None]:
# Checking Parameter Types

In [None]:
# Python does not allow us to declare the type of a variable when we write a program, and this permits us to define functions that are flexible about the type of their arguments. For example, a tagger might expect a sequence of words, but it wouldn't care whether this sequence is expressed as a list or a tuple 

In [None]:
#However, often we want to write programs for later use by others, and want to program in a defensive style, providing useful warnings when functions have not been invoked correctly. The author of the following tag() function assumed that its argument would always be a string.

In [155]:
def tag(word):
    if word in ['a', 'the', 'all']:
        return 'det'
    else:
        return 'noun'

In [156]:
tag('the')

'det'

In [157]:
tag('knight')

'noun'

In [158]:
tag(["'Tis", 'but', 'a', 'scratch'])

'noun'

In [None]:
#  The author of this function could take some extra steps to ensure that the word parameter of the tag() function is a string. A naive approach would be to check the type of the argument using if not type(word) is str, and if word is not a string, to simply return Python's special empty value, None. 

In [None]:
# This is a slight improvement, because the function is checking the type of the argument, and trying to return a "special", diagnostic value for the wrong input. However, it is also dangerous because the calling program may not detect that None is intended as a "special" value, and this diagnostic return value may then be propagated to other parts of the program with unpredictable consequences. 

In [None]:
# This approach also fails if the word is a Unicode string, which has type unicode, not str. 

In [None]:
# Here's a better solution, using an assert statement together with Python's basestring type that generalizes over both unicode and str.

In [159]:
def tag(word):
    assert isinstance(word, str),"argument to tag() must be a string"
    if word in ['a', 'the', 'all']:
        return 'det'
    else:
        return 'noun'

In [160]:
tag(["'Tis", 'but', 'a', 'scratch'])

AssertionError: argument to tag() must be a string

In [None]:
# If the assert statement fails, it will produce an error that cannot be ignored, since it halts program execution. 

In [None]:
# Functional Decomposition

In [None]:
# Well-structured programs usually make extensive use of functions. When a block of program code grows longer than 10-20 lines, it is a great help to readability if the code is broken up into one or more functions, each one having a clear purpose. This is analogous to the way a good essay is divided into paragraphs, each expressing one main idea.

In [None]:
#Functions provide an important kind of abstraction. They allow us to group multiple actions into a single, complex action, and associate a name with it. (Compare this with the way we combine the actions of go and bring back into a single more complex action fetch.) When we use functions, the main program can be written at a higher level of abstraction, making its structure transparent, e.g

In [None]:
# Appropriate use of functions makes programs more readable and maintainable. Additionally, it becomes possible to reimplement a function — replacing the function's body with more efficient code — without having to be concerned with the rest of the program.

In [None]:
# Consider the freq_words function in 4.3. It updates the contents of a frequency distribution that is passed in as a parameter, and it also prints a list of the n most frequent words.

In [161]:
import nltk

In [162]:
from urllib import request
from bs4 import BeautifulSoup

def freq_words(url, freqdist, n):
    html = request.urlopen(url).read().decode('utf8')
    raw = BeautifulSoup(html, 'html.parser').get_text()
    for word in nltk.word_tokenize(raw):
        freqdist[word.lower()] += 1
    result = []
    for word, count in freqdist.most_common(n):
        result = result + [word]
    print(result)

In [163]:
constitution = "http://www.archives.gov/exhibits/charters/constitution_transcript.html"

In [164]:
fd = nltk.FreqDist()

In [165]:

freq_words(constitution, fd, 30)

["''", ',', ':', ':1', ';', 'the', '(', ')', '``', '{', '}', 'of', '?', 'url', 'https', '@', 'import', 'qh4t45', "'", 'archives', '#', 'and', '.', '[', ']', 'national', 'a', 'documents', 'constitution', 'founding']


In [166]:
# This function has a number of problems. The function has two side-effects: it modifies the contents of its second parameter, and it prints a selection of the results it has computed. The function would be easier to understand and to reuse elsewhere if we initialize the FreqDist() object inside the function (in the same place it is populated), and if we moved the selection and display of results to the calling program. Given that its task is to identify frequent words, it should probably just return a list, not the whole frequency distribution. In 4.4 we refactor this function, and simplify its interface by dropping the freqdist parameter.

In [167]:
from urllib import request
from bs4 import BeautifulSoup

def freq_words(url, n):
    html = request.urlopen(url).read().decode('utf8')
    text = BeautifulSoup(html, 'html.parser').get_text()
    fd = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(text))
    return [word for (word, _) in fd.most_common(n)]

In [168]:
freq_words(constitution, 30)

["''",
 ',',
 ':',
 ':1',
 ';',
 'the',
 '(',
 ')',
 '``',
 '{',
 '}',
 'of',
 '?',
 'url',
 'https',
 '@',
 'import',
 'qh4t45',
 "'",
 'archives',
 '#',
 'and',
 '.',
 '[',
 ']',
 'national',
 'a',
 'documents',
 'constitution',
 'founding']

In [None]:
# # The readability and usability of the freq_words function is improved.


In [None]:
# Exercise 5: Choose your own webpage( in html format) and use the above code to output the 20 most common words in this web page. Please get rid of stop words, numbers and punctuations

In [246]:
from urllib import request
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

def freq_words(url, n):
    html = request.urlopen(url).read().decode('utf8')
    text = BeautifulSoup(html, 'html.parser').get_text()
    fd = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(text) if word.isalpha() and word.lower() not in stopwords.words('english'))
    return [word for (word, _) in fd.most_common(n)]

In [242]:
news="https://www.foxnews.com/politics/debate-trump-biden-presidential-karl-rove"

In [247]:
freq_words(news, 20)

['https',
 'false',
 'debate',
 'biden',
 'presidential',
 'trump',
 'rove',
 'first',
 'news',
 'props',
 'fox',
 'url',
 'true',
 'id',
 'quot',
 'title',
 'text',
 'president',
 'apos',
 'paragraph']

In [None]:
# Q and A and Take a Break


In [None]:
# 4.5 Doing More with Functions

In [None]:
# This section discusses more advanced features 

In [None]:
# Functions as Arguments

In [None]:
#So far the arguments we have passed into functions have been simple objects like strings, or structured objects like lists. Python also lets us pass a function as an argument to another function. Now we can abstract out the operation, and apply a different operation on the same data. As the following examples show, we can pass the built-in function len() or a user-defined function last_letter() as arguments to another function:



In [None]:
#  Now we can abstract out the operation, and apply a different operation on the same data. As the following examples show, we can pass the built-in function len() or a user-defined function last_letter() as arguments to another function:


In [169]:
sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the',
        'sounds', 'will', 'take', 'care', 'of', 'themselves', '.']

In [170]:
def extract_property(prop):
    return [prop(word) for word in sent]

In [171]:
extract_property(len)

[4, 4, 2, 3, 5, 1, 3, 3, 6, 4, 4, 4, 2, 10, 1]

In [172]:
def last_letter(word):
    return word[-1]

In [173]:
extract_property(last_letter)

['e', 'e', 'f', 'e', 'e', ',', 'd', 'e', 's', 'l', 'e', 'e', 'f', 's', '.']

In [None]:
# Python provides us with one more way to define functions as arguments to other functions, so-called lambda expressions. Supposing there was no need to use the above last_letter() function in multiple places, and thus no need to give it a name. We can equivalently write the following:



In [174]:
 extract_property(lambda w: len(w))

[4, 4, 2, 3, 5, 1, 3, 3, 6, 4, 4, 4, 2, 10, 1]

In [175]:
 extract_property(lambda w: w[-1])

['e', 'e', 'f', 'e', 'e', ',', 'd', 'e', 's', 'l', 'e', 'e', 'f', 's', '.']

In [None]:
# Accumulative Functions

In [None]:
#These functions start by initializing some storage, and iterate over input to build it up, before returning some final object (a large structure or aggregated result). A standard way to do this is to initialize an empty list, accumulate the material, then return the list, as shown in function search1() in 4.6.

In [176]:
import nltk

In [177]:
def search1(substring, words):
    result = []
    for word in words:
        if substring in word:
            result.append(word)
    return result

def search2(substring, words):
    for word in words:
        if substring in word:
            yield word

In [178]:
for item in search1('zz', nltk.corpus.brown.words()):
    print(item, end=" ")

Grizzlies' fizzled Rizzuto huzzahs dazzler jazz Pezza Pezza Pezza embezzling embezzlement pizza jazz Ozzie nozzle drizzly puzzle puzzle dazzling Sizzling guzzle puzzles dazzling jazz jazz Jazz jazz Jazz jazz jazz Jazz jazz jazz jazz Jazz jazz dizzy jazz Jazz puzzler jazz jazzmen jazz jazz Jazz Jazz Jazz jazz Jazz jazz jazz jazz Jazz jazz jazz jazz jazz jazz jazz jazz jazz jazz Jazz Jazz jazz jazz nozzles nozzle puzzle buzz puzzle blizzard blizzard sizzling puzzled puzzle puzzle muzzle muzzle muezzin blizzard Neo-Jazz jazz muzzle piazzas puzzles puzzles embezzle buzzed snazzy buzzes puzzled puzzled muzzle whizzing jazz Belshazzar Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie's Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie blizzard blizzards blizzard blizzard fuzzy Lazzeri Piazza piazza palazzi Piazza Piazza Palazzo Palazzo Palazzo Piazza Piazza Palazzo palazzo palazzo Palazzo Palazzo Piazza piazza piazza piazza Piazza Piazza Palazzo palazzo Piazza piazz

In [179]:
for item in search2('zz', nltk.corpus.brown.words()):
    print(item, end=" ")

Grizzlies' fizzled Rizzuto huzzahs dazzler jazz Pezza Pezza Pezza embezzling embezzlement pizza jazz Ozzie nozzle drizzly puzzle puzzle dazzling Sizzling guzzle puzzles dazzling jazz jazz Jazz jazz Jazz jazz jazz Jazz jazz jazz jazz Jazz jazz dizzy jazz Jazz puzzler jazz jazzmen jazz jazz Jazz Jazz Jazz jazz Jazz jazz jazz jazz Jazz jazz jazz jazz jazz jazz jazz jazz jazz jazz Jazz Jazz jazz jazz nozzles nozzle puzzle buzz puzzle blizzard blizzard sizzling puzzled puzzle puzzle muzzle muzzle muezzin blizzard Neo-Jazz jazz muzzle piazzas puzzles puzzles embezzle buzzed snazzy buzzes puzzled puzzled muzzle whizzing jazz Belshazzar Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie's Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie blizzard blizzards blizzard blizzard fuzzy Lazzeri Piazza piazza palazzi Piazza Piazza Palazzo Palazzo Palazzo Piazza Piazza Palazzo palazzo palazzo Palazzo Palazzo Piazza piazza piazza piazza Piazza Piazza Palazzo palazzo Piazza piazz

In [180]:
# The function search2() is a generator. The first time this function is called, it gets as far as the yield statement and pauses. The calling program gets the first word and does any necessary processing. Once the calling program is ready for another word, execution of the function is continued from where it stopped, until the next time it encounters a yield statement. This approach is typically more efficient, as the function only generates the data as it is required by the calling program, and does not need to allocate additional memory to store the output (cf. our discussion of generator expressions above).

In [181]:
# Higher-Order Functions

In [None]:
# Let's start by defining a function is_content_word() which checks whether a word is from the open class of content words. We use this function as the first parameter of filter(), which applies the function to each item in the sequence contained in its second parameter, and only retains the items for which the function returns True.

In [182]:
def is_content_word(word):
    return word.lower() not in ['a', 'of', 'the', 'and', 'will', ',', '.']

In [183]:
sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the',
      'sounds', 'will', 'take', 'care', 'of', 'themselves', '.']

In [185]:
list(filter(is_content_word, sent))

['Take', 'care', 'sense', 'sounds', 'take', 'care', 'themselves']

In [184]:
[w for w in sent if is_content_word(w)]

['Take', 'care', 'sense', 'sounds', 'take', 'care', 'themselves']

In [None]:
# map()

In [None]:
# Another higher-order function is map(), which applies a function to every item in a sequence. It is a general version of the extract_property() function we saw in 4.5. Here is a simple way to find the average length of a sentence in the news section of the Brown Corpus, followed by an equivalent version with list comprehension calculation:

In [186]:
import nltk

In [187]:
lengths = list(map(len, nltk.corpus.brown.sents(categories='news')))

In [188]:
lengths[0:10]

[25, 43, 35, 37, 24, 24, 43, 2, 26, 25]

In [189]:
sum(lengths) / len(lengths)

21.75081116158339

In [190]:
lengths = [len(sent) for sent in nltk.corpus.brown.sents(categories='news')]

In [191]:
sum(lengths) / len(lengths)

21.75081116158339

In [None]:
# Name Arguments

In [None]:
# When there are a lot of parameters it is easy to get confused about the correct order. Instead we can refer to parameters by name, and even assign them a default value just in case one was not provided by the calling program. Now the parameters can be specified in any order, and can be omitted.

In [192]:
def repeat(msg='<empty>', num=1):
    return msg * num

In [193]:
repeat(num=3)

'<empty><empty><empty>'

In [194]:
repeat(msg='Alice')

'Alice'

In [195]:
repeat(num=5, msg='Alice')

'AliceAliceAliceAliceAlice'

In [None]:
# These are called keyword arguments. If we mix these two kinds of parameters, then we must ensure that the unnamed parameters precede the named ones. It has to be this way, since unnamed parameters are defined by position. We can define a function that takes an arbitrary number of unnamed and named parameters, and access them via an in-place list of arguments *args and an "in-place dictionary" of keyword arguments **kwargs

In [196]:
def generic(*args, **kwargs):
    print(args)
    print(kwargs)

In [197]:
generic(1, "African swallow","test", monty="python")

(1, 'African swallow', 'test')
{'monty': 'python'}


In [198]:
def any_sum(*args):
    return sum(args)

In [199]:
any_sum(1,4,5,6,9,99)

124

In [200]:
def any_sum(*num):
    return sum(num)

In [None]:
# when to use the arbitrary number of keyword arguments, just give you a simple example

In [201]:
def third_party_order_function(name, number, location):
    return f"{name} ordered {number} items for the store in {location}."

In [202]:
def third_party_order_function(name, number, location):
    return "{} ordered {} items for the store in {}.".format(name, number, location)

In [203]:
third_party_order_function('John', 3, 'NYC')


'John ordered 3 items for the store in NYC.'

In [204]:
def my_order_function(date, **kwargs):
    return f"Placed order on {date}: " + third_party_order_function(**kwargs)

In [205]:
my_order_function('2020-09', name='Alice', number=5, location='Chicago')

'Placed order on 2020-09: Alice ordered 5 items for the store in Chicago.'

In [206]:
#  Here's another illustration of this aspect of Python syntax, for the zip() function which operates on a variable number of arguments. We'll use the variable name *song to demonstrate that there's nothing special about the name *args.



In [207]:
song = [['four', 'calling', 'birds'],
       ['three', 'French', 'hens'],
       ['two', 'turtle', 'doves']]

In [208]:
list(zip(song[0], song[1], song[2]))

[('four', 'three', 'two'),
 ('calling', 'French', 'turtle'),
 ('birds', 'hens', 'doves')]

In [209]:
list(zip(*song))

[('four', 'three', 'two'),
 ('calling', 'French', 'turtle'),
 ('birds', 'hens', 'doves')]

In [None]:
# It should be clear from the above example that typing *song is just a convenient shorthand, and equivalent to typing out song[0], song[1], song[2].


In [None]:
# Here's another example of the use of keyword arguments in a function definition, along with three equivalent ways to call the function:


In [248]:
def freq_words(file, min=1, num=10):
    text = open(file).read()
    tokens = nltk.word_tokenize(text)
    freqdist = nltk.FreqDist(t for t in tokens if len(t) >= min)
    return freqdist.most_common(num)

In [249]:
fw = freq_words('document.txt', 4, 10)

In [250]:
fw

[('flies', 2),
 ('like', 2),
 ('Time', 1),
 ('arrow', 1),
 ('Fruit', 1),
 ('banana', 1),
 ('What', 1),
 ('doing', 1),
 ('late', 1),
 ('Which', 1)]

In [251]:
fw = freq_words('document.txt', min=4, num=10)

In [252]:
fw

[('flies', 2),
 ('like', 2),
 ('Time', 1),
 ('arrow', 1),
 ('Fruit', 1),
 ('banana', 1),
 ('What', 1),
 ('doing', 1),
 ('late', 1),
 ('Which', 1)]

In [253]:
fw=freq_words('document.txt', num=10, min=4)

In [254]:
fw

[('flies', 2),
 ('like', 2),
 ('Time', 1),
 ('arrow', 1),
 ('Fruit', 1),
 ('banana', 1),
 ('What', 1),
 ('doing', 1),
 ('late', 1),
 ('Which', 1)]

In [None]:
# 4.6  A Sample of Python Libraries

In [None]:
#Python has hundreds of third-party libraries, specialized software packages that extend the functionality of Python. NLTK is one such library. To realize the full power of Python programming, you should become familiar with several other libraries. Most of these will need to be manually installed on your computer.

In [None]:
# csv

In [None]:
#We can use Python's CSV library to read and write files stored in this format. For example, we can open a CSV file called lexicon.csv and iterate over its rows

In [214]:
import csv

In [215]:
input_file = open("lexicon.csv", "r")

In [216]:
for row in csv.reader(input_file): 
    print(row)

['sleep\tsli:p\tv.i\ta condition of body and mind…']
['walk\two:k\tv.intr\tprogress by lifting and setting down each foot…']
['wake\tweik\tintrans\tcease to sleep']


In [None]:
# Each row is just a list of strings. If any fields contain numerical data, they will appear as strings, and will have to be converted using int() or float().

In [None]:
# NumPy (See Slides)

In [None]:
# The NumPy package provides substantial support for numerical processing in Python. NumPy has a multi-dimensional array object, which is easy to initialize and access:

In [217]:
from numpy import array

In [218]:
# The NumPy package provides substantial support for numerical processing in Python. NumPy has a multi-dimensional array object, which is easy to initialize and access:

In [219]:
cube = array([ [[0,0,0], [1,1,1], [2,2,2]],
                [[3,3,3], [4,4,4], [5,5,5]],
                  [[6,6,6], [7,7,7], [8,8,8]] ])

In [220]:
cube[1,1,1]

4

In [221]:
cube[2].transpose()

array([[6, 7, 8],
       [6, 7, 8],
       [6, 7, 8]])

In [255]:
cube[2].T

array([[6, 7, 8],
       [6, 7, 8],
       [6, 7, 8]])

In [222]:
cube[2,1:]

array([[7, 7, 7],
       [8, 8, 8]])