# Functions in NLTK

In [33]:
import nltk
from nltk import word_tokenize

## writing structured programs

#### assignment

In [2]:
# Assigning a list to a second name copies the reference, not the list:
# `foo` and `bar` are two names for one and the same list object.
foo = ['Monty', 'Python']
bar = foo
# Mutating the list through `foo` ...
foo[1] = 'Bodkin'
# ... is therefore visible through `bar` as well.
bar

['Monty', 'Bodkin']

![screenshot_391.jpg](attachment:screenshot_391.jpg)

In [3]:
# `==` compares values, whereas `is` tests object identity, i.e. whether two
# expressions refer to one and the same object.
size = 5
python = ['Python']
# List multiplication repeats the reference: every slot holds the same `python` list.
snake_nest = [python] * size
# Chained value comparison across all five slots.
snake_nest[0] == snake_nest[1] == snake_nest[2] == snake_nest[3] == snake_nest[4]

True

In [4]:
# Chained identity comparison: True only if adjacent slots all refer to the same object.
snake_nest[0] is snake_nest[1] is snake_nest[2] is snake_nest[3] is snake_nest[4]

True

In [5]:
import random

# Overwrite one randomly chosen slot with a brand-new list of equal value.
position = random.randrange(size)
snake_nest[position] = ['Python']
snake_nest

[['Python'], ['Python'], ['Python'], ['Python'], ['Python']]

In [6]:
# Value comparison again, now that one slot has been overwritten with an equal-valued list.
snake_nest[0] == snake_nest[1] == snake_nest[2] == snake_nest[3] == snake_nest[4]

True

In [7]:
# Identity comparison again: a slot holding a distinct (though equal) list object breaks the chain.
snake_nest[0] is snake_nest[1] is snake_nest[2] is snake_nest[3] is snake_nest[4]

False

In [8]:
# List the id() of every slot; slots that share an object show the same number.
list(map(id, snake_nest))

[4373149640, 4373149640, 4373149640, 4371804360, 4373149640]

In [11]:
# all() and any() take an iterable and report whether every item, or at least
# one item, satisfies a condition.
sent = [
    'No', 'good', 'fish', 'goes', 'anywhere',
    'without', 'a', 'porpoise', '.',
]
all(len(word) > 4 for word in sent)

False

In [12]:
# any() is True as soon as at least one word is longer than four characters.
any(len(w) > 4 for w in sent)

True

#### sequence

In [25]:
# A comma-separated series of values creates a tuple; no parentheses are required.
t = 'walk', 'fem', 3
t

('walk', 'fem', 3)

In [24]:
# It is the comma, not the parentheses, that defines a tuple:
# a single value followed by a comma is a one-element tuple.
tSingle = 'run',
tSingle

('run',)

In [29]:
# reversed() returns an iterator; materialise it to see the items back to front.
list(reversed(t))

[3, 'fem', 'walk']

In [34]:
# Tokenize the raw string, then build a frequency distribution over the tokens.
raw = 'Red lorry, yellow lorry, red lorry, yellow lorry.'
text = word_tokenize(raw)
fdist = nltk.FreqDist(text)
fdist


FreqDist({'lorry': 4, ',': 3, 'yellow': 2, 'Red': 1, 'red': 1, '.': 1})

In [35]:
# Sorting the distribution yields its distinct tokens in sorted order, not the counts.
sorted(fdist)

[',', '.', 'Red', 'lorry', 'red', 'yellow']

In [36]:
# Two parallel lists: words[i] is labelled with tags[i].
words = ['I', 'turned', 'off', 'the', 'spectroroute']
tags = ['noun', 'verb', 'prep', 'det', 'noun']
# zip() is lazy: on its own it displays only a zip object, not the pairs.
zip(words, tags)

<zip at 0x1a16ac1a88>

In [37]:
# Wrapping the zip object in list() materialises the (word, tag) pairs.
list(zip(words, tags))

[('I', 'noun'),
 ('turned', 'verb'),
 ('off', 'prep'),
 ('the', 'det'),
 ('spectroroute', 'noun')]

In [38]:
# enumerate() pairs each item with its index, counting from 0.
list(enumerate(words))

[(0, 'I'), (1, 'turned'), (2, 'off'), (3, 'the'), (4, 'spectroroute')]

In [39]:
# Some NLP tasks need a sequence cut into two or more parts, e.g. to "train"
# a system on the first 90% of the data and test it on the remaining 10%.
text = nltk.corpus.nps_chat.words()
cut = int(0.9 * len(text))
training_data = text[:cut]
test_data = text[cut:]
# Sanity check: the two slices together reproduce the original sequence.
text == training_data + test_data

True

In [40]:
# Size ratio of the training slice to the test slice.
len(training_data) / len(test_data)

9.0

In [45]:
words = 'I turned off the spectroroute'.split()
# Pair each word with its length so that sorting orders by length first and
# alphabetically among words of equal length.
wordlens = sorted((len(word), word) for word in words)
# The underscore is an ordinary variable name; by convention it marks a value
# that is deliberately ignored (here, the length).
' '.join(word for _, word in wordlens)

'I off the turned spectroroute'

In [43]:
# Display the (length, word) pairs.
wordlens

[(1, 'I'), (3, 'off'), (3, 'the'), (6, 'turned'), (12, 'spectroroute')]

In [47]:
text = 'this is a test'
# The list comprehension builds the complete list of lower-cased tokens
# before max() picks the largest string.
max([w.lower() for w in word_tokenize(text)])

'this'

In [48]:
# Python allows the square brackets to be omitted, which turns the list
# comprehension into a generator expression. This is more efficient: max()
# processes the items as a stream and never has to store more than the
# maximum value seen so far.
max(w.lower() for w in word_tokenize(text))

'this'

In [52]:
tokens = nltk.corpus.brown.words(categories='news')
# Sum the token lengths with a generator expression, so no intermediate list
# of lengths is built, then print the average length per token.
total = sum(len(token) for token in tokens)
print(total / len(tokens))

4.401545438271973


In [53]:
# Start from an empty list and insert the token at index 1 at the front.
word_list = []
word_list.insert(0, tokens[1])

In [54]:
# Display the current contents of the list.
word_list

['Fulton']