## 2.1 Conditions and Events

In [None]:
# Abbreviated token list: the trailing `...` stands in for the rest of the
# text (as literally written it is Python's Ellipsis object, not a word).
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [2]:
# Each token paired with the condition under which it was observed (here the
# genre 'news'), so every pair has the form (condition, event).
# The trailing `...` abbreviates the remaining pairs.
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ...]

## 2.2 Counting Words by Genre

In [3]:
import nltk
from nltk.corpus import brown
# Count word frequencies separately for every Brown category: each
# (genre, word) pair is one observation of the event `word` under the
# condition `genre`.
# NOTE: `cfd` is rebound to a two-genre distribution in a later cell (In [7]).
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)

In [4]:
# Materialise the (genre, word) pairs for just two genres so the pairs can be
# inspected directly before they are counted.
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
# Number of pairs, i.e. one per word token across both genres.
len(genre_word)

170576

In [5]:
genre_word[:4]

[('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ('news', 'Grand')]

In [6]:
genre_word[-4:]

[('romance', 'afraid'),
 ('romance', 'not'),
 ('romance', "''"),
 ('romance', '.')]

In [7]:
# Rebinds `cfd` (previously built from every Brown category) to a
# distribution counted from the two-genre pair list only.
cfd = nltk.ConditionalFreqDist(genre_word)
cfd

<ConditionalFreqDist with 2 conditions>

In [8]:
cfd.conditions()

['news', 'romance']

In [9]:
print(cfd['news'])

<FreqDist with 14394 samples and 100554 outcomes>


In [10]:
print(cfd['romance'])

<FreqDist with 8452 samples and 70022 outcomes>


In [11]:
cfd['romance'].most_common(20)

[(',', 3899),
 ('.', 3736),
 ('the', 2758),
 ('and', 1776),
 ('to', 1502),
 ('a', 1335),
 ('of', 1186),
 ('``', 1045),
 ("''", 1044),
 ('was', 993),
 ('I', 951),
 ('in', 875),
 ('he', 702),
 ('had', 692),
 ('?', 690),
 ('her', 651),
 ('that', 583),
 ('it', 573),
 ('his', 559),
 ('she', 496)]

In [12]:
cfd['romance']['could']

193

## 2.3 Plotting and Tabulating Distributions

In [13]:
from nltk.corpus import inaugural
# Conditions are the two target stems; events are the first four characters
# of the inaugural file id (the year of the address). A token is counted
# under a stem when its lower-cased form starts with that stem.
icfd = nltk.ConditionalFreqDist(
    (stem, address[:4])
    for address in inaugural.fileids()
    for token in inaugural.words(address)
    for stem in ['america', 'citizen']
    if token.lower().startswith(stem))

In [14]:
icfd.conditions()

['america', 'citizen']

In [15]:
# The first four characters of each inaugural file id are the year of the
# address; collect them in corpus order.
years = [fileid[:4] for fileid in inaugural.fileids()]
# Keep only the last ten years so the table below stays compact.
last10 = years[-10:]
print(last10)

['1973', '1977', '1981', '1985', '1989', '1993', '1997', '2001', '2005', '2009']


In [16]:
# Restrict the table's columns to the last ten years via `samples`
# (NOTE(review): without `samples`, tabulate presumably prints every year).
icfd.tabulate(samples=last10)

        1973 1977 1981 1985 1989 1993 1997 2001 2005 2009 
america   23    5   16   21   11   33   31   20   30   15 
citizen    1    0    3    6    3    2   10   11    7    2 


In [17]:
from nltk.corpus import udhr
# Conditions are languages; events are word lengths, so each per-language
# distribution counts how many words of each length occur in that text.
languages = ['Chickasaw', 'English', 'German_Deutsch',
    'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
# The file id passed to udhr.words() is the language name plus '-Latin1'.
ucfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))

In [18]:
# Show two of the languages only, for word lengths 0-9; cumulative=True
# makes each column a running total of words up to that length.
ucfd.tabulate(conditions=['English', 'German_Deutsch'],
             samples=range(10), cumulative=True)

                  0    1    2    3    4    5    6    7    8    9 
       English    0  185  525  883  997 1166 1283 1440 1558 1638 
German_Deutsch    0  171  263  614  717  894 1013 1110 1213 1275 


In [19]:
cfd.conditions()

['news', 'romance']

## 2.4 Generating Random Text with Bigrams

In [20]:
# A tokenised example sentence (punctuation is a token of its own).
sent = ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven',
        'and', 'the', 'earth', '.']
# Pair each token with its successor; list() materialises the result so the
# cell displays the pairs themselves.
list(nltk.bigrams(sent))

[('In', 'the'),
 ('the', 'beginning'),
 ('beginning', 'God'),
 ('God', 'created'),
 ('created', 'the'),
 ('the', 'heaven'),
 ('heaven', 'and'),
 ('and', 'the'),
 ('the', 'earth'),
 ('earth', '.')]

In [21]:
def generate_model(cfdist, word, num=15):
    """Print up to ``num`` words by repeatedly following the likeliest successor.

    Starting from ``word``, each step prints the current word followed by a
    space and then moves on to the most frequent sample recorded for that
    word in ``cfdist``.  The walk is deterministic, so it can settle into a
    repeating cycle of words.

    Args:
        cfdist: Mapping from a word to a frequency distribution of the words
            that follow it; each distribution must provide ``max()``.
        word: The word to start from.
        num: Maximum number of words to print.

    Returns:
        None; the words are written to standard output on a single line.
    """
    for _ in range(num):
        print(word, end=' ')
        successors = cfdist[word]
        # A word that never occurs as the first element of a bigram has an
        # empty distribution, and max() on an empty distribution raises
        # ValueError; end the walk instead of crashing mid-output.
        if not successors:
            break
        word = successors.max()

# Build the bigram model from the 'english-kjv.txt' Genesis file: each
# bigram (w1, w2) is counted as event w2 under condition w1.
# NOTE: this rebinds `text` and `cfd`, which earlier cells bound to other data.
text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)

In [22]:
cfd['living']

FreqDist({'creature': 7, 'thing': 4, 'substance': 2, '.': 1, ',': 1, 'soul': 1})

In [23]:
cfd['creature']

FreqDist({'that': 4, 'of': 2, ',': 1, 'after': 1})

In [24]:
cfd['thing']

FreqDist({'that': 14, ',': 7, 'which': 3, 'was': 2, 'of': 2, 'living': 1, 'from': 1, '?': 1, 'is': 1, 'ought': 1, ...})

In [25]:
generate_model(cfd, 'living')

living creature that he said , and the land of the land of the land 