# Using NLTK Toolbox for analyzing P&P

In [1]:
import matplotlib as mpl
import numpy as np
import nltk
import requests

Read txt for Pride and Prejudice

In [2]:
url = "http://www.gutenberg.org/files/1342/1342.txt"

request = requests.get(url)

pnp = request.content

Tokenize words 

In [3]:
from nltk import word_tokenize

word_tokens = word_tokenize(pnp)

num_words = len(word_tokens)

print("Number of words:", num_words)

('Number of words:', 146827)


In [7]:
from nltk import FreqDist
fdist = FreqDist(word_tokens[0])
print word_tokens[]
print fdist.Nr

<bound method FreqDist.Nr of FreqDist({'h': 1, 'e': 1, 'T': 1})>


Tokenize sentences

In [22]:
from nltk import sent_tokenize

sent_tokens = sent_tokenize(pnp)

num_sents = len(sent_tokens)

print("Number of sentences:", num_sents)

('Number of sentences:', 6094)


Create a concordance

In [36]:
from nltk import Text

# what is this for? not related to input text
from nltk.corpus import brown

# what is this for?
tokens = brown.words('cg13')

text = Text(tokens)

# lot of words show no matches
concordance = text.concordance("Zen", lines=30)


Displaying 2 of 2 matches:
ddition , they have been converted to Zen Buddhism , with its glorification of 
re after is the beatific vision . And Zen Buddhism , though it is extremely dif


(2311, 2311)

Contextual similarity

In [1]:
from nltk import Text

from nltk.corpus import genesis

tokens = genesis.words('english-web.txt')

text = Text(tokens)

print("Words that occur in contexts similar to the contexts 'fought' occurs in:")

text.similar("fought")

Words that occur in contexts similar to the contexts 'fought' occurs in:
lain fared been sex


Trial with P&P

In [14]:
from nltk import Text

from nltk import word_tokenize

word_tokens = word_tokenize(pnp)

text = Text(word_tokens)

print("Words that occur in contexts similar to the contexts 'prejudice' occurs in:")

text.similar("prejudice")


Words that occur in contexts similar to the contexts 'prejudice' occurs in:
when though unshackled and absurdities livings is distribute sisters
at in happiness gratitude despised from make since had anyone till


Semantic Similarity

In [17]:
from nltk.corpus import wordnet

bible = wordnet.synset('bible.n.01')

book = wordnet.synset('book.n.01')

scroll = wordnet.synset('scroll.n.02')

scroll_bible = scroll.lowest_common_hypernyms(bible)

book_bible = book.lowest_common_hypernyms(bible)

print("Lowest common hypernym for scroll and bible:", scroll_bible)

print("Lowest common hypernym for book and bible:", book_bible)

('Lowest common hypernym for scroll and bible:', [Synset('writing.n.02')])
('Lowest common hypernym for book and bible:', [Synset('entity.n.01')])


Part of Speech Tagger

In [32]:
from nltk import pos_tag, word_tokenize, sent_tokenize

sent_tokens = sent_tokenize(pnp)

sentence = sent_tokens[1]

tokens = word_tokenize(sentence)

tagged_tokens = pos_tag(tokens)

print('Tokens tagged with part of speech:', tagged_tokens)

('Tokens tagged with part of speech:', [('You', 'PRP'), ('may', 'MD'), ('copy', 'VB'), ('it', 'PRP'), (',', ','), ('give', 'VB'), ('it', 'PRP'), ('away', 'RB'), ('or', 'CC'), ('re-use', 'VB'), ('it', 'PRP'), ('under', 'IN'), ('the', 'DT'), ('terms', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Project', 'NNP'), ('Gutenberg', 'NNP'), ('License', 'NNP'), ('included', 'VBD'), ('with', 'IN'), ('this', 'DT'), ('eBook', 'NN'), ('or', 'CC'), ('online', 'NN'), ('at', 'IN'), ('www.gutenberg.org', 'JJ'), ('Title', 'NN'), (':', ':'), ('Pride', 'NN'), ('and', 'CC'), ('Prejudice', 'NNP'), ('Author', 'NNP'), (':', ':'), ('Jane', 'NNP'), ('Austen', 'NNP'), ('Posting', 'VBG'), ('Date', 'NNP'), (':', ':'), ('August', 'NNP'), ('26', 'CD'), (',', ','), ('2008', 'CD'), ('[', 'NNP'), ('EBook', 'NNP'), ('#', '#'), ('1342', 'CD'), (']', 'NN'), ('Release', 'NNP'), ('Date', 'NNP'), (':', ':'), ('June', 'NNP'), (',', ','), ('1998', 'CD'), ('Last', 'JJ'), ('updated', 'JJ'), (':', ':'), ('February', 'NNP'), ('15', 'CD'

Propositional Logic

In [33]:
from nltk import ResolutionProver

from nltk.sem import Expression

read_expr = Expression.fromstring

assumpt1 = read_expr('man(socrates)')                # socrates is a man

assumpt2 = read_expr('all x.(man(x) -> mortal(x))')  # for all x, if x is man, x is mortal

goal = read_expr('mortal(socrates)')                 # socrates is mortal

resolution = ResolutionProver().prove(goal, [assumpt1, assumpt2], verbose=True)

print("Socrates is mortal:", resolution)

[1] {-mortal(socrates)}     A 
[2] {man(socrates)}         A 
[3] {-man(z2), mortal(z2)}  A 
[4] {-man(socrates)}        (1, 3) 
[5] {mortal(socrates)}      (2, 3) 
[6] {}                      (1, 5) 

('Socrates is mortal:', True)


Chunking noun phrases

In [34]:
from nltk import pos_tag, RegexpParser, word_tokenize

# optional determiner (DT) followed by 0 or more adjectives (JJ) and then a noun (NN)

grammar = "NP: {<DT>?<JJ>*<NN>}"

# tag tokens

sentence = "Better is a poor but wise youth than an old but foolish king."

tokens = word_tokenize(sentence)

tagged_tokens = pos_tag(tokens)

# now chunk

chunk_parser = RegexpParser(grammar)

parsed = chunk_parser.parse(tagged_tokens)

# graph the results

parsed.draw()

Dispersion Plot

In [None]:
import nltk

from nltk import Text

from nltk.corpus import genesis

tokens = genesis.words('english-kjv.txt')

text = Text(tokens)

text.dispersion_plot(["God", "man", "woman"])

# From NLTK Book


In [1]:
import nltk
import requests

# text for finding frequency
url = "http://www.gutenberg.org/files/1342/1342.txt"
request = requests.get(url)
pnp = unicode(request.content)

emma = nltk.corpus.gutenberg.words('austen-emma.txt')
sns = nltk.corpus.gutenberg.words('austen-sense.txt')
pers = nltk.corpus.gutenberg.words('austen-persuasion.txt')

austen_text = emma+sns+pers
austen_text

from nltk import Text

austen = Text(austen_text)
words = austen.similar('plain')

idx = nltk.text.ContextIndex([word.lower( ) for word in nltk.corpus.gutenberg.words('austen-sense.txt')])
save = [ ]
for word in nltk.word_tokenize("plain"):
    save.append(idx.similar_words(word))


nltk.pos_tag(nltk.word_tokenize(str(save)))

# POS tagging
austenPOS = nltk.pos_tag(austen_text)

austenPOS[2][1]
content = [w for w in austenPOS if w[1] == 'JJ']


str(content[0][0])
austen = Text(austen_text)

# all kinds of troubleshooting, primarily stop words

from nltk import Text

austen = Text(austen_text)

from __future__ import division
from nltk.corpus import stopwords

stopwords = nltk.corpus.stopwords.words('english')
content = [w for w in austen if w.lower() not in stopwords]

raw = response.read().decode('utf8')

sentence = word_tokenize('try something different')
nltk.pos_tag(sentence)

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


<Text: Sense and Sensibility by Jane Austen 1811>