In [1]:
from nltk.corpus import gutenberg
gutenberg.fileids()



[u'austen-emma.txt',
 u'austen-persuasion.txt',
 u'austen-sense.txt',
 u'bible-kjv.txt',
 u'blake-poems.txt',
 u'bryant-stories.txt',
 u'burgess-busterbrown.txt',
 u'carroll-alice.txt',
 u'chesterton-ball.txt',
 u'chesterton-brown.txt',
 u'chesterton-thursday.txt',
 u'edgeworth-parents.txt',
 u'melville-moby_dick.txt',
 u'milton-paradise.txt',
 u'shakespeare-caesar.txt',
 u'shakespeare-hamlet.txt',
 u'shakespeare-macbeth.txt',
 u'whitman-leaves.txt']

In [2]:
# Token sequence for the 'austen-emma.txt' file; its length is the total token count.
emma = gutenberg.words('austen-emma.txt')
len(emma)

192427

In [7]:
# For every Gutenberg text print three truncated integer ratios followed by
# the file id: characters per token, tokens per sentence, and tokens per
# distinct lower-cased token.
for fileid in gutenberg.fileids():
    tokens = gutenberg.words(fileid)                 # list of tokens, read once and reused below
    num_chars = len(gutenberg.raw(fileid))           # raw contents
    num_words = len(tokens)
    num_sents = len(gutenberg.sents(fileid))         # list of sentences (each a list of words)
    num_vocab = len(set(w.lower() for w in tokens))
    # `//` keeps the truncated integer results on Python 2 and Python 3 alike,
    # and printing one pre-formatted string gives the same line on both.
    print("%d %d %d %s" % (num_chars // num_words,
                           num_words // num_sents,
                           num_words // num_vocab,
                           fileid))

4 24 26 austen-emma.txt
4 26 16 austen-persuasion.txt
4 28 22 austen-sense.txt
4 33 79 bible-kjv.txt
4 19 5 blake-poems.txt
4 19 14 bryant-stories.txt
4 17 12 burgess-busterbrown.txt
4 20 12 carroll-alice.txt
4 20 11 chesterton-ball.txt
4 22 11 chesterton-brown.txt
4 18 10 chesterton-thursday.txt
4 20 24 edgeworth-parents.txt
4 25 15 melville-moby_dick.txt
4 52 10 milton-paradise.txt
4 11 8 shakespeare-caesar.txt
4 12 7 shakespeare-hamlet.txt
4 12 6 shakespeare-macbeth.txt
4 36 12 whitman-leaves.txt

## Conditional frequency distribution of words

In [8]:
import nltk
from nltk.corpus import inaugural

# Count, per inaugural address, the tokens that begin with each target
# prefix. The condition is the prefix; the sample is the first four
# characters of the file id.
cfd = nltk.ConditionalFreqDist()
for fileid in inaugural.fileids():
    for token in inaugural.words(fileid):
        lowered = token.lower()
        for prefix in ['america', 'citizen']:
            if not lowered.startswith(prefix):
                continue
            cfd[prefix][fileid[:4]] += 1

cfd.plot()

In [11]:
# Using the initializer with a generator expression: the
# (condition, sample) pairs are passed directly to ConditionalFreqDist.
cfd = nltk.ConditionalFreqDist(
    (prefix, fileid[:4])
    for fileid in inaugural.fileids()
    for token in inaugural.words(fileid)
    for prefix in ['america', 'citizen']
    if token.lower().startswith(prefix)
)

cfd.plot()

In [21]:
from nltk.corpus import brown

# One frequency distribution per genre: each Brown token is paired with
# the genre it was read from.
cfd = nltk.ConditionalFreqDist(
    (category, token)
    for category in ['news', 'romance']
    for token in brown.words(categories=category)
)
cfd.conditions()

['romance', 'news']

In [22]:
print(cfd['news'])

<FreqDist with 14394 samples and 100554 outcomes>

In [24]:
cfd['romance'].most_common(20)

[(u',', 3899),
 (u'.', 3736),
 (u'the', 2758),
 (u'and', 1776),
 (u'to', 1502),
 (u'a', 1335),
 (u'of', 1186),
 (u'``', 1045),
 (u"''", 1044),
 (u'was', 993),
 (u'I', 951),
 (u'in', 875),
 (u'he', 702),
 (u'had', 692),
 (u'?', 690),
 (u'her', 651),
 (u'that', 583),
 (u'it', 573),
 (u'his', 559),
 (u'she', 496)]

In [31]:
cfd.tabulate(samples=['could', 'can', 'would'])

        could  can would 
   news   86   93  244 
romance  193   74  244 

# Generating Random Text with Bigrams

In [34]:
# Tokens of the 'english-kjv.txt' file from the genesis corpus.
text = nltk.corpus.genesis.words('english-kjv.txt')
# Pairs of adjacent tokens.
bigrams = nltk.bigrams(text)
# The first item of each pair is the condition, the second is the counted sample.
cfd = nltk.ConditionalFreqDist(bigrams)
# Frequency distribution recorded under the condition 'living'.
cfd['living']

FreqDist({u'creature': 7, u'thing': 4, u'substance': 2, u',': 1, u'.': 1, u'soul': 1})

In [39]:
def generate_model(cfdist, word, num=15):
    """Print a chain of up to ``num`` words, always following the likeliest successor.

    Starting from ``word``, each step appends the current word and then moves
    to ``cfdist[word].max()``, the most frequent sample recorded for it.

    Args:
        cfdist: mapping from a word to a distribution object that is falsy
            when empty and exposes ``max()`` (e.g. ``nltk.ConditionalFreqDist``).
        word: the seed word; it is always the first word printed.
        num: maximum number of words to print.

    Returns:
        None. The words are written to stdout on one line, separated by spaces.
    """
    generated = []
    for step in range(num):
        generated.append(word)
        if step == num - 1:
            # Nothing is printed after the last word, so skip the lookup.
            break
        followers = cfdist[word]
        if not followers:
            # Dead end: no successor was ever recorded for this word, and
            # calling max() on an empty distribution would raise.
            break
        word = followers.max()
    if generated:
        # Call-form print with a single string works on Python 2 and 3.
        print(' '.join(generated))
generate_model(cfd,'living')

living creature that he said , and the land of the land of the land

# Stopwords and Lexicons

In [42]:
from nltk.corpus import stopwords
stopwords.words('english')[0:10]

[u'i',
 u'me',
 u'my',
 u'myself',
 u'we',
 u'our',
 u'ours',
 u'ourselves',
 u'you',
 u'your']

# WordNet

In [45]:
from nltk.corpus import wordnet as wn 
wn.synsets('motorcar')

[Synset('car.n.01')]

In [50]:
wn.synset('car.n.01').lemma_names()

[u'car', u'auto', u'automobile', u'machine', u'motorcar']

In [52]:
wn.synset('car.n.01').definition()

u'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

In [56]:
wn.synset('car.n.01').examples()

[u'he needs a car to get to work']

In [58]:
wn.synset('car.n.01').lemmas()

[Lemma('car.n.01.car'),
 Lemma('car.n.01.auto'),
 Lemma('car.n.01.automobile'),
 Lemma('car.n.01.machine'),
 Lemma('car.n.01.motorcar')]

In [60]:
wn.synsets('car')

[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

In [64]:
# One output line per synset of 'car', showing that synset's lemma names.
for synset in wn.synsets('car'):
    # Call-form print runs on both Python 2 and Python 3.
    print(synset.lemma_names())

[u'car', u'auto', u'automobile', u'machine', u'motorcar']
[u'car', u'railcar', u'railway_car', u'railroad_car']
[u'car', u'gondola']
[u'car', u'elevator_car']
[u'cable_car', u'car']

## Lexical relations

In [69]:
wn.synset('car.n.01').hyponyms()

[Synset('ambulance.n.01'),
 Synset('beach_wagon.n.01'),
 Synset('bus.n.04'),
 Synset('cab.n.03'),
 Synset('compact.n.03'),
 Synset('convertible.n.01'),
 Synset('coupe.n.01'),
 Synset('cruiser.n.01'),
 Synset('electric.n.01'),
 Synset('gas_guzzler.n.01'),
 Synset('hardtop.n.01'),
 Synset('hatchback.n.01'),
 Synset('horseless_carriage.n.01'),
 Synset('hot_rod.n.01'),
 Synset('jeep.n.01'),
 Synset('limousine.n.01'),
 Synset('loaner.n.02'),
 Synset('minicar.n.01'),
 Synset('minivan.n.01'),
 Synset('model_t.n.01'),
 Synset('pace_car.n.01'),
 Synset('racer.n.02'),
 Synset('roadster.n.01'),
 Synset('sedan.n.01'),
 Synset('sport_utility.n.01'),
 Synset('sports_car.n.01'),
 Synset('stanley_steamer.n.01'),
 Synset('stock_car.n.01'),
 Synset('subcompact.n.01'),
 Synset('touring_car.n.01'),
 Synset('used-car.n.01')]

In [72]:
wn.synset('car.n.01').hypernyms()

[Synset('motor_vehicle.n.01')]

In [75]:
wn.synset('car.n.01').hypernym_paths()

[[Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('artifact.n.01'),
  Synset('instrumentality.n.03'),
  Synset('container.n.01'),
  Synset('wheeled_vehicle.n.01'),
  Synset('self-propelled_vehicle.n.01'),
  Synset('motor_vehicle.n.01'),
  Synset('car.n.01')],
 [Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('artifact.n.01'),
  Synset('instrumentality.n.03'),
  Synset('conveyance.n.03'),
  Synset('vehicle.n.01'),
  Synset('wheeled_vehicle.n.01'),
  Synset('self-propelled_vehicle.n.01'),
  Synset('motor_vehicle.n.01'),
  Synset('car.n.01')]]

In [80]:
wn.synset('car.n.01').root_hypernyms()

[Synset('entity.n.01')]

In [82]:
wn.synset('tree.n.01').part_meronyms()

[Synset('burl.n.02'),
 Synset('crown.n.07'),
 Synset('limb.n.02'),
 Synset('stump.n.01'),
 Synset('trunk.n.01')]

In [84]:
wn.synset('tree.n.01').substance_meronyms()

[Synset('heartwood.n.01'), Synset('sapwood.n.01')]

In [86]:
wn.synset('tree.n.01').member_holonyms()

[Synset('forest.n.01')]

In [87]:
wn.synset('walk.v.01').entailments()

[Synset('step.v.01')]

In [90]:
wn.synset('tease.v.03').entailments()

[Synset('arouse.v.07'), Synset('disappoint.v.01')]

In [92]:
wn.lemma('horizontal.a.01.horizontal').antonyms()

[Lemma('inclined.a.02.inclined'), Lemma('vertical.a.01.vertical')]

# Semantic similarity

In [94]:
# Synsets bound to short names for the similarity examples that follow.
right = wn.synset('right_whale.n.01')
orca = wn.synset('orca.n.01')
minke = wn.synset('minke_whale.n.01')
tortoise = wn.synset('tortoise.n.01')
novel = wn.synset('novel.n.01')

In [97]:
right.lowest_common_hypernyms(minke)

[Synset('baleen_whale.n.01')]

In [100]:
right.lowest_common_hypernyms(tortoise)

[Synset('vertebrate.n.01')]

In [103]:
right.lowest_common_hypernyms(novel)

[Synset('entity.n.01')]

In [106]:
wn.synset('whale.n.01').min_depth()

5

In [109]:
right.path_similarity(minke)

0.25

In [110]:
right.path_similarity(tortoise)

0.07692307692307693