In [9]:
#perform standard imports
import spacy
nlp = spacy.load('en_core_web_md')

## Sentence Segmentation

In [10]:
# Build a Doc containing three short sentences to demonstrate sentence segmentation.
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [11]:
# Iterate over the sentences detected in the Doc and print each one on its own line.
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [12]:
# Intentional failure: doc.sents is a generator, so indexing it directly raises TypeError.
doc.sents[0]

TypeError: 'generator' object is not subscriptable

In [13]:
# Materialise the generator into a list first; the list can then be indexed.
list(doc.sents)[0]

This is the first sentence.

In [14]:
doc_list = list(doc.sents)

In [15]:
type(doc_list)

list

In [17]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [18]:
# A quotation containing a semicolon; used below to show how the pipeline segments it
# before any custom rule is added.
doc2 = nlp(u'"Management is doing the right thing; leadership is doing the right thing." -Yogesh')

In [20]:
doc2.text

'"Management is doing the right thing; leadership is doing the right thing." -Yogesh'

In [22]:
# Print each detected sentence; print('\n') adds extra blank lines after every sentence
# so the sentence boundaries are easy to see in the output.
for sent in doc2.sents:
    print(sent)
    print('\n')

"Management is doing the right thing; leadership is doing the right thing." -Yogesh




In [23]:
# Add a segmentation rule. First step: inspect each token and its index.

def set_custom_boundaries(doc):
    """Print every token of ``doc`` followed by its index, one item per line.

    Exploratory helper used to look up token positions. It returns nothing
    and does not modify ``doc``. Note that a later cell in this notebook
    defines another function with this same name.

    Args:
        doc: An iterable of tokens, each exposing an ``i`` index attribute.
    """
    for tok in doc:
        # A single print call emits two lines: the token, then its index.
        print(tok, tok.i, sep='\n')

In [24]:
set_custom_boundaries(doc2)

"
0
Management
1
is
2
doing
3
the
4
right
5
thing
6
;
7
leadership
8
is
9
doing
10
the
11
right
12
thing
13
.
14
"
15
-Yogesh
16


In [27]:
def set_custom_boundaries(doc):
    """Mark the token following each semicolon as the start of a sentence.

    Args:
        doc: The document to annotate. Its tokens must expose ``text``, ``i``
            and a writable ``is_sent_start`` attribute.

    Returns:
        The same ``doc`` object, modified in place.
    """
    # The final token is excluded so that ``position + 1`` is always a valid index.
    semicolon_positions = [tok.i for tok in doc[:-1] if tok.text == ';']
    for position in semicolon_positions:
        doc[position + 1].is_sent_start = True
    return doc

In [28]:
# Slicing with [:-1] keeps everything except the document's final token.
doc2[:-1]

"Management is doing the right thing; leadership is doing the right thing."

In [29]:
# Register the custom rule ahead of the parser so the boundaries it sets are
# already in place when the parser runs.
# The guard keeps this cell re-runnable: the component is registered under the
# function's name, and adding the same name a second time would fail.
if 'set_custom_boundaries' not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundaries, before='parser')

nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [31]:
doc4 = nlp(u'"Management is doing the right thing; leadership is doing the right thing." -Yogesh Tak')

In [32]:
for sent in doc4.sents:
    print(sent)

"Management is doing the right thing;
leadership is doing the right thing."
-Yogesh Tak


In [33]:
# Change segmentation rules: replace the default sentence splitting with a custom strategy.

In [34]:
# Reload the model so this section starts from a newly loaded pipeline object
# (presumably without the set_custom_boundaries component added earlier — confirm).
nlp = spacy.load('en_core_web_md')

In [35]:
mystring = u"This is a sentence. This is another sentence. \n\nThis is a \nthird senetence."

In [36]:
print(mystring)

This is a sentence. This is another sentence. 

This is a 
third senetence.


In [37]:
doc = nlp(mystring)

In [38]:
for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another sentence. 


This is a 
third senetence.


In [39]:
from spacy.pipeline import SentenceSegmenter

In [40]:
def split_on_newlines(doc):
    """Yield consecutive slices of ``doc``, breaking after newline tokens.

    A token whose text starts with ``'\\n'`` closes the current sentence; the
    newline token itself (and any whitespace tokens directly after it) stays
    attached to the end of that sentence. The next sentence begins at the
    first non-whitespace token that follows.

    Args:
        doc: The document to segment. Tokens must expose ``text``, ``i`` and
            ``is_space``; ``doc`` must support slicing and ``len()``.

    Yields:
        Slices ``doc[start:end]`` that together cover every token of ``doc``
        exactly once. Nothing is yielded for an empty ``doc``.
    """
    start = 0
    seen_newline = False

    for word in doc:
        if seen_newline and not word.is_space:
            # First real token after a newline: close the previous sentence here.
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'):
            # Also reached for a newline token that follows another newline
            # token, so the pending boundary is kept rather than lost.
            seen_newline = True

    # Emit the trailing sentence, but never a zero-length slice for an empty doc.
    if start < len(doc):
        yield doc[start:]

In [43]:
# Build a sentence-segmenter component that uses split_on_newlines as its strategy.
sbd = SentenceSegmenter(nlp.vocab, strategy= split_on_newlines)

In [44]:
# Add the segmenter to the pipeline; no position argument is given here.
# NOTE(review): re-running this cell against the same nlp object presumably fails
# because the component is already registered — confirm, and reload nlp first if so.
nlp.add_pipe(sbd)

In [46]:
doc = nlp(mystring)

In [47]:
# Print the sentences produced with the newline-based segmenter in the pipeline.
for sent in doc.sents:
    print(sent)

This is a sentence. This is another sentence. 


This is a 

third senetence.
