In [2]:
import nltk

In [2]:
# dictionary data type

# 3.1 Indexing Lists vs Dictionaries

# 3.2 Dictionaries in Python

In [3]:
# Start with an empty dictionary; the bare name on the last line echoes its value.
pos = {}
pos

{}

In [4]:
# Assigning to a key that does not exist yet creates the entry.
pos['colorless'] = 'ADJ'
pos

{'colorless': 'ADJ'}

In [5]:
# Add three more word -> part-of-speech entries to the same dictionary.
pos['ideas'] = 'N'
pos['sleep'] = 'V'
pos['furiously'] = 'ADV'

In [6]:
# Echo the dictionary now that it holds four entries.
pos

{'colorless': 'ADJ', 'furiously': 'ADV', 'ideas': 'N', 'sleep': 'V'}

In [7]:
# Look up the value stored under an existing key.
pos['ideas']

'N'

In [8]:
# Another lookup of an existing key.
pos['colorless']

'ADJ'

In [9]:
# Intentional failure: 'green' was never added, so a plain dict raises KeyError here.
pos['green']

KeyError: 'green'

In [10]:
# Converting a dict to a list yields its keys.
list(pos)

['colorless', 'ideas', 'sleep', 'furiously']

In [11]:
# sorted() over a dict returns a sorted list of its keys.
sorted(pos)

['colorless', 'furiously', 'ideas', 'sleep']

In [12]:
# Iterating over a dict visits its keys; keep only the words ending in 's'.
[w for w in pos if w.endswith('s')]

['colorless', 'ideas']

In [13]:
# Print one "word: tag" line per entry, visiting the keys in sorted order.
for word in sorted(pos):
    print(f"{word}:", pos[word])

colorless: ADJ
furiously: ADV
ideas: N
sleep: V


In [14]:
# Values are not limited to strings: replace the single tag for 'sleep' with a list of tags.
pos['sleep'] = ['N', 'V']

In [15]:
# Print the entries again in sorted key order; 'sleep' now shows its list value.
for word in sorted(pos):
    print(f"{word}:", pos[word])

colorless: ADJ
furiously: ADV
ideas: N
sleep: ['N', 'V']


# 3.3 Defining Dictionaries

In [1]:
# Define the whole dictionary in one step with a key: value literal.
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}

In [2]:
# Echo the dictionary built from the literal.
pos

{'colorless': 'ADJ', 'furiously': 'ADV', 'ideas': 'N', 'sleep': 'V'}

# 3.4 Default Dictionaries

In [3]:
from collections import defaultdict

# defaultdict(int) calls int() -> 0 for any key that has not been set,
# so reading a missing key yields 0 instead of raising KeyError.
frequency = defaultdict(int)
frequency['colorless'] = 4  # key was misspelled 'coloreless'; the notebook's word is 'colorless'
frequency['ideas']  # never assigned: evaluates to 0 (and inserts the key as a side effect)

0

In [5]:
# With list as the factory, a key that has never been set starts out as an empty list.
pos = defaultdict(list, sleep=['NOUN', 'VERB'])
pos['ideas']

[]

In [6]:
# Any zero-argument callable can be the factory; here every unknown word defaults to 'NOUN'.
pos = defaultdict(lambda: 'NOUN', colorless='ADJ')
pos['blog']

'NOUN'

In [7]:
# The earlier lookup of 'blog' inserted that key with the default value, so it is listed too.
list(pos.items())

[('colorless', 'ADJ'), ('blog', 'NOUN')]

In [10]:
import nltk

# Keep the 1000 most frequent tokens unchanged; any other token is looked up
# through the defaultdict and therefore comes back as 'UNK'.
alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
vocab = nltk.FreqDist(alice)
v1000 = [word for word, _count in vocab.most_common(1000)]
mapping = defaultdict(lambda: 'UNK')
mapping.update(zip(v1000, v1000))  # each frequent token maps to itself

alice2 = [mapping[token] for token in alice]
print(alice2[:100])

['[', 'Alice', "'", 's', 'Adventures', 'in', 'Wonderland', 'by', 'UNK', 'UNK', 'UNK', 'UNK', 'CHAPTER', 'I', '.', 'Down', 'the', 'Rabbit', '-', 'UNK', 'Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', ',', 'and', 'of', 'having', 'nothing', 'to', 'do', ':', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', 'book', 'her', 'sister', 'was', 'reading', ',', 'but', 'it', 'had', 'no', 'pictures', 'or', 'UNK', 'in', 'it', ',', "'", 'and', 'what', 'is', 'the', 'use', 'of', 'a', 'book', ",'", 'thought', 'Alice', "'", 'without', 'pictures', 'or', 'conversation', "?'", 'So', 'she', 'was', 'considering', 'in', 'her', 'own', 'mind', '(', 'as', 'well', 'as', 'she', 'could', ',']


In [11]:
# Number of distinct tokens after the mapping: the kept vocabulary plus 'UNK'.
len(set(alice2))

1001

In [12]:
# Size of the kept vocabulary.
len(v1000)

1000

In [13]:
# Number of distinct tokens in the original text, for comparison with the mapped version.
len(set(alice))

3016

# 3.5 Incrementally Updating Dictionary

In [14]:
from collections import defaultdict
from nltk.corpus import brown

# Tally how often each universal part-of-speech tag occurs in the Brown news category.
counts = defaultdict(int)
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
for word, tag in brown_news_tagged:
    counts[tag] += 1  # missing tags start at 0 thanks to defaultdict(int)
counts['NOUN']

30654

In [16]:
# sorted() over the dict gives the tag names (its keys) in sorted order.
print(sorted(counts))

['.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']


In [17]:
from operator import itemgetter
# Sort the (tag, count) pairs by the count (element 1 of each pair), largest first.
print( sorted(counts.items(), key=itemgetter(1), reverse=True) )

[('NOUN', 30654), ('VERB', 14399), ('ADP', 12355), ('.', 11928), ('DET', 11389), ('ADJ', 6706), ('ADV', 3349), ('CONJ', 2717), ('PRON', 2535), ('PRT', 2264), ('NUM', 2166), ('X', 92)]


In [21]:
# Same ordering by descending count, but keep only the tag from each (tag, count) pair.
print( [t for t,c in sorted(counts.items(), key=itemgetter(1), reverse=True)] )

['NOUN', 'VERB', 'ADP', '.', 'DET', 'ADJ', 'ADV', 'CONJ', 'PRON', 'PRT', 'NUM', 'X']


In [22]:
# Group every word of the 'en' word list under its last two characters.
words = nltk.corpus.words.words('en')
last_letters = defaultdict(list)
for word in words:
    last_letters[word[-2:]].append(word)
print(last_letters['ly'])



In [23]:
# Show the words whose last two characters are 'sy'.
print(last_letters['sy'])

['acatalepsy', 'acatharsy', 'achromatopsy', 'adipsy', 'aerognosy', 'aeroscepsy', 'agnosy', 'ambassy', 'analepsy', 'androlepsy', 'angiotripsy', 'antonomasy', 'apepsy', 'apostasy', 'archheresy', 'archhypocrisy', 'argosy', 'astrognosy', 'athanasy', 'atrepsy', 'atresy', 'autocatalepsy', 'autonomasy', 'autopsy', 'backwoodsy', 'basiotripsy', 'bebusy', 'Bessy', 'Betsy', 'biopsy', 'birsy', 'boosy', 'bossy', 'bousy', 'brassy', 'bronchopleurisy', 'brosy', 'busy', 'canvassy', 'catalepsy', 'cephalotripsy', 'cheesy', 'cholecystolithotripsy', 'choledocholithotripsy', 'cholelithotripsy', 'choosy', 'Christmasy', 'chuprassy', 'circusy', 'classy', 'cleidotripsy', 'clerisy', 'clumsy', 'controversy', 'copsy', 'cosy', 'courtesy', 'craniognosy', 'creasy', 'creepmousy', 'cressy', 'cryptoheresy', 'curtesy', 'curtsy', 'cystectasy', 'daisy', 'dassy', 'dermatopsy', 'diabolepsy', 'dichromasy', 'dimpsy', 'discourtesy', 'docimasy', 'dreamsy', 'dressy', 'dropsy', 'drossy', 'drowsy', 'drusy', 'dyspepsy', 'easy', 'ecs

In [24]:
# Words that are anagrams of each other share the same sorted-letter string,
# so that string serves as the grouping key.
anagrams = defaultdict(list)
for word in words:
    anagrams[''.join(sorted(word))].append(word)

anagrams['aeilnrt']

['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']

In [27]:
# Build the anagram index from (sorted-letters, word) pairs with nltk.Index;
# the lookup below returns the same list as the defaultdict(list) loop did.
anagrams = nltk.Index( (''.join(sorted(w)), w) for w in words)
anagrams['aeilnrt']

['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']

# 3.6 Complex Keys and Values

In [28]:
# Two-level table: (tag of the preceding word, current word) -> {tag of current word: count}.
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
pos = defaultdict(lambda: defaultdict(int))
for (prev_word, prev_tag), (word, tag) in nltk.bigrams(brown_news_tagged):
    context = (prev_tag, word)  # a tuple works as a dictionary key
    pos[context][tag] += 1

pos[('DET', 'right')]

defaultdict(int, {'ADJ': 11, 'NOUN': 5})

# 3.7 Inverting a Dictionary

In [30]:
# Count every token of Paradise Lost, then do a reverse lookup:
# find the tokens (keys) whose count (value) is exactly 32.
counts = defaultdict(int)
for token in nltk.corpus.gutenberg.words('milton-paradise.txt'):
    counts[token] += 1

[token for token, freq in counts.items() if freq == 32]

['mortal',
 'Against',
 'Him',
 'There',
 'brought',
 'King',
 'virtue',
 'every',
 'been',
 'thine']

In [31]:
# Swap keys and values. This only round-trips cleanly while every value is unique.
pos = dict(colorless='ADJ', ideas='N', sleep='V', furiously='ADV')
pos2 = {tag: word for word, tag in pos.items()}
pos2['N']

'ideas'

In [32]:
# Several words now share a tag, so the inverted dictionary collects a list of words per tag.
pos.update(cats='N', scratch='V', peacefully='ADV', old='ADJ')
pos2 = defaultdict(list)
for word, tag in pos.items():
    pos2[tag].append(word)

pos2['ADV']

['furiously', 'peacefully']

In [33]:
# Same inversion built with nltk.Index from (tag, word) pairs;
# the lookup returns the same list as the defaultdict(list) loop.
pos2 = nltk.Index((value, key) for (key, value) in pos.items())
pos2['ADV']

['furiously', 'peacefully']