In [1]:
import nltk

In [2]:
from nltk.corpus import brown

In [7]:
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features


In [8]:
sentence0 = brown.sents()[0]

In [9]:
sentence0

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

In [10]:
sentence0[8]

'investigation'

In [11]:
pos_features(sentence0, 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [12]:
tagged_sents = brown.tagged_sents(categories='news')

In [13]:
tag_sent0 = tagged_sents[0]

In [14]:
nltk.tag.untag(tag_sent0)

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

In [16]:
newssent = brown.sents(categories = 'news')

In [18]:
for i,(word,tag) in enumerate(tag_sent0):
    print (i, word, tag)

0 The AT
1 Fulton NP-TL
2 County NN-TL
3 Grand JJ-TL
4 Jury NN-TL
5 said VBD
6 Friday NR
7 an AT
8 investigation NN
9 of IN
10 Atlanta's NP$
11 recent JJ
12 primary NN
13 election NN
14 produced VBD
15 `` ``
16 no AT
17 evidence NN
18 '' ''
19 that CS
20 any DTI
21 irregularities NNS
22 took VBD
23 place NN
24 . .


In [19]:
featuresets = []

In [20]:
for tagged_sent in tagged_sents:
	untagged_sent = nltk.tag.untag(tagged_sent)
	for i, (word, tag) in enumerate(tagged_sent):
		featuresets.append( (pos_features(untagged_sent, i), tag) )


In [21]:
for f in featuresets[:10]:
	print (f)


({'suffix(1)': 'e', 'suffix(2)': 'he', 'suffix(3)': 'The', 'prev-word': '<START>'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ton', 'prev-word': 'The'}, 'NP-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ty', 'suffix(3)': 'nty', 'prev-word': 'Fulton'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'nd', 'suffix(3)': 'and', 'prev-word': 'County'}, 'JJ-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ry', 'suffix(3)': 'ury', 'prev-word': 'Grand'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'id', 'suffix(3)': 'aid', 'prev-word': 'Jury'}, 'VBD')
({'suffix(1)': 'y', 'suffix(2)': 'ay', 'suffix(3)': 'day', 'prev-word': 'said'}, 'NR')
({'suffix(1)': 'n', 'suffix(2)': 'an', 'suffix(3)': 'an', 'prev-word': 'Friday'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}, 'NN')
({'suffix(1)': 'f', 'suffix(2)': 'of', 'suffix(3)': 'of', 'prev-word': 'investigation'}, 'IN')


In [22]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]


In [23]:
len(train_set)

90499

In [24]:
len(test_set)

10055

In [25]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [26]:
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

In [27]:
sents = nltk.corpus.treebank_raw.sents()

In [28]:
sents[:10]

[['.', 'START'],
 ['Pierre',
  'Vinken',
  ',',
  '61',
  'years',
  'old',
  ',',
  'will',
  'join',
  'the',
  'board',
  'as',
  'a',
  'nonexecutive',
  'director',
  'Nov',
  '.',
  '29',
  '.'],
 ['Mr',
  '.',
  'Vinken',
  'is',
  'chairman',
  'of',
  'Elsevier',
  'N',
  '.',
  'V',
  '.,',
  'the',
  'Dutch',
  'publishing',
  'group',
  '.'],
 ['.', 'START'],
 ['Rudolph',
  'Agnew',
  ',',
  '55',
  'years',
  'old',
  'and',
  'former',
  'chairman',
  'of',
  'Consolidated',
  'Gold',
  'Fields',
  'PLC',
  ',',
  'was',
  'named',
  'a',
  'nonexecutive',
  'director',
  'of',
  'this',
  'British',
  'industrial',
  'conglomerate',
  '.'],
 ['.', 'START'],
 ['A',
  'form',
  'of',
  'asbestos',
  'once',
  'used',
  'to',
  'make',
  'Kent',
  'cigarette',
  'filters',
  'has',
  'caused',
  'a',
  'high',
  'percentage',
  'of',
  'cancer',
  'deaths',
  'among',
  'a',
  'group',
  'of',
  'workers',
  'exposed',
  'to',
  'it',
  'more',
  'than',
  '30',
  'years',


In [29]:
len(sents)

4193

In [30]:
for sent in sents[:10]:
    print (sent)

['.', 'START']
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.']
['Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.,', 'the', 'Dutch', 'publishing', 'group', '.']
['.', 'START']
['Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', ',', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'British', 'industrial', 'conglomerate', '.']
['.', 'START']
['A', 'form', 'of', 'asbestos', 'once', 'used', 'to', 'make', 'Kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of', 'workers', 'exposed', 'to', 'it', 'more', 'than', '30', 'years', 'ago', ',', 'researchers', 'reported', '.']
['The', 'asbestos', 'fiber', ',', 'crocidolite', ',', 'is', 'unusually', 'resilient', 'once', 'it', 'enters', 'the', 'lungs', ',', 'with', 'even'

In [31]:
tokens = [ ]

In [32]:
boundaries = set()

In [33]:
offset = 0

In [34]:
for sent in nltk.corpus.treebank_raw.sents():
     tokens.extend(sent)
     offset += len(sent)
     boundaries.add(offset - 1)
    

In [35]:
tokens[:40]

['.',
 'START',
 'Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov',
 '.',
 '29',
 '.',
 'Mr',
 '.',
 'Vinken',
 'is',
 'chairman',
 'of',
 'Elsevier',
 'N',
 '.',
 'V',
 '.,',
 'the',
 'Dutch',
 'publishing',
 'group',
 '.',
 '.',
 'START',
 'Rudolph']

In [36]:
19 in boundaries

False

In [37]:
20 in boundaries

True

In [38]:
for num, tok in enumerate(tokens[:40]):
     print (num, tok, '\t', num in boundaries)


0 . 	 False
1 START 	 True
2 Pierre 	 False
3 Vinken 	 False
4 , 	 False
5 61 	 False
6 years 	 False
7 old 	 False
8 , 	 False
9 will 	 False
10 join 	 False
11 the 	 False
12 board 	 False
13 as 	 False
14 a 	 False
15 nonexecutive 	 False
16 director 	 False
17 Nov 	 False
18 . 	 False
19 29 	 False
20 . 	 True
21 Mr 	 False
22 . 	 False
23 Vinken 	 False
24 is 	 False
25 chairman 	 False
26 of 	 False
27 Elsevier 	 False
28 N 	 False
29 . 	 False
30 V 	 False
31 ., 	 False
32 the 	 False
33 Dutch 	 False
34 publishing 	 False
35 group 	 False
36 . 	 True
37 . 	 False
38 START 	 True
39 Rudolph 	 False


In [39]:
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
              'prevword': tokens[i-1].lower(),
             'punct': tokens[i],
             'prev-word-is-one-char': len(tokens[i-1]) == 1}


In [40]:
tokens[20]

'.'

In [41]:
punct_features(tokens,20)

{'next-word-capitalized': True,
 'prev-word-is-one-char': False,
 'prevword': '29',
 'punct': '.'}

In [42]:
Sfeaturesets = [(punct_features(tokens, i), (i in boundaries))
      for i in range(1, len(tokens) - 1)
      if tokens[i] in '.?!']


In [43]:
size = int(len(Sfeaturesets) * 0.1)

In [44]:
Strain_set, Stest_set = Sfeaturesets[size:], Sfeaturesets[:size]

In [45]:
Sclassifier = nltk.NaiveBayesClassifier.train(Strain_set)

In [46]:
nltk.classify.accuracy(Sclassifier, Stest_set)

0.936026936026936

In [51]:
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        if word in '.?!' and Sclassifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i+1
        if start < len(words):
            sents.append(words[start:])
    return sents

In [52]:
len(tokens)

101797

In [60]:
smalltokens = tokens[:500]

In [61]:
for s in segment_sentences(smalltokens):
    print (s)

['.']
['START', 'Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.', 'Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.,', 'the', 'Dutch', 'publishing', 'group', '.', '.', 'START', 'Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', ',', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'British', 'industrial', 'conglomerate', '.', '.', 'START', 'A', 'form', 'of', 'asbestos', 'once', 'used', 'to', 'make', 'Kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of', 'workers', 'exposed', 'to', 'it', 'more', 'than', '30', 'years', 'ago', ',', 'researchers', 'reported', '.', 'The', 'asbestos', 'fiber', ',', 'crocidolite', ',', 'is', 'unusually', 'resilient', 'once', 'it', 'enters', 'the', 'lungs', ',', 'with', 'even', 'bri

In [56]:
from nltk.tokenize import sent_tokenize

In [58]:
rawtext = 'Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.  Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group.'

In [59]:
sents = nltk.sent_tokenize(rawtext)
sents


['Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.',
 'Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group.']