In [None]:
import nltk

## Tokenize

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Build the sample text from adjacent string literals. A backslash line
# continuation placed inside the quotes would embed the next line's leading
# indentation in the string itself.
EXAMPLE_TEXT = (
    "Hello Mr. Smith, how are you doing today? "
    "The weather is great, and Python is awesome. "
    "The sky is pinkish-blue. You shouldn't eat cardboard."
)
print(sent_tokenize(EXAMPLE_TEXT))

In [26]:
# Split a short string into word tokens, calling the tokenizer through the nltk namespace.
nltk.word_tokenize("Tokenize me")

['Tokenize', 'me']

In [27]:
# Split the sample text into sentences, then print each sentence's word tokens.
for sent in sent_tokenize(EXAMPLE_TEXT):
    tokens = word_tokenize(sent)
    print(tokens)

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?']
['The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.']
['The', 'sky', 'is', 'pinkish-blue', '.']
['You', 'should', "n't", 'eat', 'cardboard', '.']


## Stopwords

In [28]:
from nltk.corpus import stopwords

In [29]:
# Show the full English stop-word list from the stopwords corpus.
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [30]:
a = "this is This Is my MY python test. Is it good"
print(sent_tokenize(a))

# Build the stop-word set once; constructing it inside the loop would rebuild
# the whole set for every single token.
english_stopwords = set(stopwords.words("english"))

# The membership test is case-sensitive: tokens are compared exactly as they
# appear in the text, so "This", "Is" and "MY" are kept while "this", "is"
# and "my" are dropped.
filtered_sent = [word for word in word_tokenize(a) if word not in english_stopwords]
print(filtered_sent)

['this is This Is my MY python test.', 'Is it good']
['This', 'Is', 'MY', 'python', 'test', '.', 'Is', 'good']


## Stemming

In [31]:
from nltk.stem import PorterStemmer

In [32]:
# Single Porter stemmer instance, used by the stemming cells that follow.
ps = PorterStemmer()

In [38]:
# Run several made-up variants of "python" through the Porter stemmer,
# printing one stem per line.
a = ["pyt","pythono", "pythonly","pyth","pythoner","pythoning"]
for stem in map(ps.stem, a):
    print(stem)

pyt
pythono
pythonli
pyth
python
python


In [39]:
# Sample sentence fed to the stemmers below.
# NOTE(review): "to by very" reads like a typo for "to be very"; the text is left as-is.
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

In [46]:
# Porter-stem every token of the sample sentence and print the stems on a
# single line, each followed by a space.
words = word_tokenize(new_text)
for token in words:
    stemmed = ps.stem(token)
    print(stemmed, end=" ")

It is import to by veri pythonli while you are python with python . all python have python poorli at least onc . 

In [41]:
from nltk.stem import SnowballStemmer

In [48]:
# Same sentence as the Porter cell, this time through the English Snowball stemmer.
ss = SnowballStemmer("english")
words = word_tokenize(new_text)
for stem in map(ss.stem, words):
    print(stem, end=" ")

it is import to by veri python while you are python with python . all python have python poor at least onc . 

## Part-of-Speech Tagging

In [49]:
import nltk
from nltk.tokenize import PunktSentenceTokenizer

In [61]:
sample_text = "hello, i am Eric. I like to do something. It is raining now."

# Split into sentences with a PunktSentenceTokenizer constructed without any
# training text.
tokenized = PunktSentenceTokenizer().tokenize(sample_text)
print(tokenized)

# Word-tokenize and part-of-speech tag at most the first five sentences,
# printing one list of (token, tag) pairs per sentence.
for sent in tokenized[:5]:
    words = nltk.word_tokenize(sent)
    tagged = nltk.pos_tag(words)
    print(tagged)

['hello, i am Eric.', 'I like to do something.', 'It is raining now.']
[('hello', 'NN'), (',', ','), ('i', 'NN'), ('am', 'VBP'), ('Eric', 'NNP'), ('.', '.')]
[('I', 'PRP'), ('like', 'VBP'), ('to', 'TO'), ('do', 'VB'), ('something', 'NN'), ('.', '.')]
[('It', 'PRP'), ('is', 'VBZ'), ('raining', 'VBG'), ('now', 'RB'), ('.', '.')]


In [68]:
# Tag a hand-written list of words directly, without running a tokenizer first.
print(nltk.pos_tag(["hello", "this", "is", "Bob", "runing", "green", "larger","the","grass","is","green"]))

[('hello', 'NN'), ('this', 'DT'), ('is', 'VBZ'), ('Bob', 'NNP'), ('runing', 'VBG'), ('green', 'JJ'), ('larger', 'JJR'), ('the', 'DT'), ('grass', 'NN'), ('is', 'VBZ'), ('green', 'JJ')]


## Chunking with NLTK(分块)

http://cpmarkchang.logdown.com/posts/197448-python-nltk-rule-based-chunking

## Chinking

http://cpmarkchang.logdown.com/posts/197448-python-nltk-rule-based-chunking

## Named Entity Recognition 

In [71]:
# Sentence-split the sample text; `sentence` ends up holding the list of sentences.
sentence = sent_tokenize(
    "today is Sunday. I am at Syracuse, NY, USA. Time is 2:00pm. Income is 175 million Dollars, GBP 10.40."
)
print(sentence)

# For each sentence: tokenize, POS-tag, run the named-entity chunker with
# binary=True, and draw the resulting tree.
for sent in sentence:
    words = word_tokenize(sent)
    tagged = nltk.pos_tag(words)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    namedEnt.draw()

['today is Sunday.', 'I am at Syracuse, NY, USA.', 'Time is 2:00pm.', 'Income is 175 million Dollars, GBP 10.40.']


## Lemmatizing

In [79]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Each tuple is the argument list for one lemmatize() call:
# (word,) uses the default part of speech, (word, pos) sets it explicitly.
lemmatize_args = [
    ("cats",),
    ("cacti",),
    ("geese",),
    ("rocks",),
    ("python",),
    ("better", "a"),
    ("best", "a"),
    ("run",),
    ("run", "v"),
    ("swimming", "v"),
    ("dragon eggs",),
]

# Print one lemma per line, in the order listed above.
for args in lemmatize_args:
    print(lemmatizer.lemmatize(*args))

cat
cactus
goose
rock
python
good
best
run
run
swim
dragon eggs


## NLTK Corpora

In [80]:
print(nltk.__file__) # path of the installed nltk package's __init__.py (the package location, not a corpora data directory)

E:\Anaconda\Anaconda4.0\envs\py36\lib\site-packages\nltk\__init__.py


In [81]:
from nltk.corpus import gutenberg  # a collection of out-of-copyright books

# Sample text: the raw contents of "bible-kjv.txt" from the Gutenberg corpus.
sample = gutenberg.raw("bible-kjv.txt")

tok = sent_tokenize(sample)

# Print up to the first five sentences. Slicing (rather than indexing over
# range(5)) cannot raise IndexError when the text has fewer than five sentences.
for sent in tok[:5]:
    print(sent)

[The King James Bible]

The Old Testament of the King James Bible

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.
1:2 And the earth was without form, and void; and darkness was upon
the face of the deep.
And the Spirit of God moved upon the face of the
waters.
1:3 And God said, Let there be light: and there was light.
1:4 And God saw the light, that it was good: and God divided the light
from the darkness.


## WordNet

In [82]:
from nltk.corpus import wordnet

In [83]:
syns = wordnet.synsets("program") # synsets (synonym sets) for "program"
print(syns)

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]


In [84]:
# First synset returned for "program".
print(syns[0])

Synset('plan.n.01')


In [86]:
# Identifier string of the first synset.
print(syns[0].name())

plan.n.01


In [87]:
# Name of the first lemma belonging to the first synset.
print(syns[0].lemmas()[0].name())

plan


In [88]:
# Definition text of the first synset.
print(syns[0].definition())

a series of steps to be carried out or goals to be accomplished


In [89]:
# Example sentences attached to the first synset.
print(syns[0].examples())

['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [90]:
synonyms = []
antonyms = []

# Walk every lemma of every synset for "good": record each lemma name as a
# synonym, and the first antonym's name whenever the lemma has any antonyms.
for synset in wordnet.synsets("good"):
    for lemma in synset.lemmas():
        synonyms.append(lemma.name())
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())

# Print as sets so repeated names appear only once.
print(set(synonyms))
print(set(antonyms))

{'dear', 'effective', 'beneficial', 'upright', 'secure', 'soundly', 'thoroughly', 'right', 'skillful', 'full', 'dependable', 'sound', 'well', 'expert', 'just', 'honorable', 'salutary', 'unspoilt', 'trade_good', 'practiced', 'ripe', 'undecomposed', 'proficient', 'in_force', 'serious', 'adept', 'goodness', 'skilful', 'good', 'commodity', 'respectable', 'near', 'in_effect', 'safe', 'honest', 'unspoiled', 'estimable'}
{'bad', 'badness', 'ill', 'evilness', 'evil'}


In [91]:
# Compare two specific noun synsets by their wup_similarity score.
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
print(w1.wup_similarity(w2))

0.9090909090909091


In [92]:
# Same comparison with a less related pair: 'ship.n.01' against 'cat.n.01'.
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('cat.n.01')
print(w1.wup_similarity(w2))

0.32


In [94]:
# List every synset for "ship"; the output includes both noun and verb entries.
print(wordnet.synsets("ship"))

[Synset('ship.n.01'), Synset('transport.v.04'), Synset('ship.v.02'), Synset('embark.v.01'), Synset('ship.v.04'), Synset('ship.v.05')]


## Text Classification 

In [97]:
# Count how often each word occurs in a tiny word list; the bare name on the
# last line makes the notebook display the resulting FreqDist.
all_words = nltk.FreqDist(["hello", "world", "hello"])
all_words

FreqDist({'hello': 2, 'world': 1})

In [98]:
from nltk.corpus import movie_reviews

In [102]:
# Peek at the first ten file ids of the movie_reviews corpus.
movie_reviews.fileids()[:10]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt']

In [103]:
# Keep the first file id for the inspection cell that follows.
id1 = movie_reviews.fileids()[0]

In [108]:
# Number of word tokens in the first review, then the token sequence itself
# (shown as the cell's displayed value).
print(len(movie_reviews.words(id1)))
movie_reviews.words(id1)

879


['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [109]:
# Category labels available in the movie_reviews corpus.
movie_reviews.categories()

['neg', 'pos']

In [115]:
# Build (word_list, category) pairs from the first ten files of each category,
# keeping categories in the order the corpus reports them.
all_words = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)[:10]
]


In [116]:
# Show the first (word_list, category) pair that was collected.
print(all_words[0])

(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'b