In [2]:
# A tuple literal; the parentheses are optional but make the intent explicit.
t = ('walk', 'fem', 3)
t

('walk', 'fem', 3)

In [3]:
# Tuples are indexed from zero.
t[0]

'walk'

In [4]:
# Slicing a tuple returns a new tuple.
t[1:]

('fem', 3)

In [5]:
# len() counts the items in the tuple.
len(t)

3

In [6]:
# The same sentence as a string and as a list of words, plus a two-item tuple.
raw = 'I turned off the spectroroute'
text = raw.split()
pair = 6, 'turned'
# Indexing works the same way on all three sequence types.
(raw[2], text[3], pair[1])

('t', 'the', 'turned')

In [7]:
# Negative slices work on all three types; a slice wider than the sequence
# (pair has only two items) simply returns the whole sequence.
raw[-3:], text[-3:], pair[-3:]

('ute', ['off', 'the', 'spectroroute'], (6, 'turned'))

In [8]:
# len() counts characters in a string and items in a list or tuple.
len(raw), len(text), len(pair)

(29, 5, 2)

## Operating on Sequence Types

In [12]:
import nltk
from nltk import word_tokenize

# Tokenize a tongue-twister and count how often each token occurs.
raw = 'Red lorry, yellow lorry, red lorry, yellow lorry.'
text = word_tokenize(raw)
fdist = nltk.FreqDist(text)
# sorted() turns the distribution's keys into an ordered list.
sorted(fdist.keys())

[',', '.', 'Red', 'lorry', 'red', 'yellow']

In [13]:
# Iterate over the distribution's keys and print each token with its count,
# all on one line separated by '; '.
for key in fdist:
    print(key + ':', fdist[key], end='; ')

Red: 1; lorry: 4; ,: 3; yellow: 2; red: 1; .: 1; 

In [14]:
# Rotate the last three words in a single step using tuple assignment:
# the right-hand side is fully evaluated before anything is stored.
words = 'I turned off the spectroroute'.split()
(words[2], words[3], words[4]) = (words[3], words[4], words[2])
words

['I', 'turned', 'the', 'spectroroute', 'off']

In [15]:
# The same rotation written the traditional way, with a temporary variable.
# NOTE: this acts on the list as left by the previous cell, so the items are
# rotated a second time; re-running this cell rotates them again.
tmp = words[2]
words[2] = words[3]
words[3] = words[4]
words[4] = tmp

In [16]:
# Show the list after the temporary-variable rotation.
words

['I', 'turned', 'spectroroute', 'off', 'the']

In [17]:
# Two parallel lists: the words of the sentence and one tag per word.
words = 'I turned off the spectroroute'.split()
tags = 'noun verb prep det noun'.split()
# zip() is lazy: displaying it shows only the iterator object.
zip(words, tags)

<zip at 0x106a8f348>

In [18]:
# Wrap the zip iterator in list() to materialise the (word, tag) pairs.
list(zip(words, tags))

[('I', 'noun'),
 ('turned', 'verb'),
 ('off', 'prep'),
 ('the', 'det'),
 ('spectroroute', 'noun')]

In [19]:
# enumerate() pairs each item with its index.
list(enumerate(words))

[(0, 'I'), (1, 'turned'), (2, 'off'), (3, 'the'), (4, 'spectroroute')]

In [20]:
# Split the NPS Chat words 90/10: everything before `cut` is training data,
# the remainder is held out for testing.
text = nltk.corpus.nps_chat.words()
cut = int(len(text) * 0.9)
training_data = text[:cut]
test_data = text[cut:]
# Sanity check: the two slices together reproduce the full sequence.
text == training_data + test_data

True

In [21]:
# Ratio of training size to test size for the 90/10 split above.
len(training_data) / len(test_data)

9.0

## Combining Different Sequence Types

In [22]:
words = 'I turned off the spectroroute'.split()
# Build (length, word) tuples and sort them; tuples compare item by item,
# so the order is by length first and alphabetically among equal lengths.
wordlens = sorted((len(word), word) for word in words)
# Drop the lengths again and glue the words back together.
' '.join(word for (_, word) in wordlens)

'I off the turned spectroroute'

In [29]:
# The lexicon is a list because it is a collection of entries that all have
# the same type. Each entry is a tuple of (word, tag, list of strings); the
# strings are presumably pronunciations -- TODO confirm.
lexicon = [
    ('the', 'det', ['Di:', 'D@']),
    ('off', 'prep', ['Qf', 'O:f'])
]

In [26]:
# When the position of an item carries no special meaning, use a list.

In [30]:
# Lists are mutable: they can be sorted, modified and shrunk in place.
lexicon.sort()
lexicon[1] = ('turned', 'VBD', ['t3:nd', 't3`nd'])
del lexicon[0]
# Tuples are immutable: none of these operations work on a tuple.

In [31]:
# Convert the lexicon to a tuple and show that it can no longer be modified.
# The expected TypeError is caught and printed so the demonstration does not
# abort a "Restart & Run All".
lexicon = tuple(lexicon)
try:
    lexicon[1] = ('turned', 'VBD', ['t3:nd', 't3`nd'])
    # Never reached: the assignment above already raises. Deleting an item
    # from a tuple would raise TypeError as well.
    del lexicon[0]
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: 'tuple' object does not support item assignment

## Generator Expressions

In [33]:
text = '''"When I use a word," Humpty Dumpty said in rather a scornful tone,
"it means just what I choose it to mean - neither more nor less."'''
# Lowercase every token; the list comprehension builds the complete list.
[token.lower() for token in word_tokenize(text)]

['``',
 'when',
 'i',
 'use',
 'a',
 'word',
 ',',
 "''",
 'humpty',
 'dumpty',
 'said',
 'in',
 'rather',
 'a',
 'scornful',
 'tone',
 ',',
 "''",
 'it',
 'means',
 'just',
 'what',
 'i',
 'choose',
 'it',
 'to',
 'mean',
 '-',
 'neither',
 'more',
 'nor',
 'less',
 '.',
 "''"]

In [34]:
# List comprehension: the whole list must be built and stored in memory
# before max() is called on it.
max([w.lower() for w in word_tokenize(text)])

'word'

In [35]:
# Generator expression: the items are streamed to max() one at a time,
# so no intermediate list is stored.
max(w.lower() for w in word_tokenize(text))

'word'

In [36]:
# Strings compare by code point, so the punctuation token "''" sorts before
# every alphabetic token and is returned as the minimum.
min(w.lower() for w in word_tokenize(text))

"''"