In [2]:
import spacy
# Load the en_core_web_sm pipeline once; the cells below call `nlp` on raw text.
nlp=spacy.load("en_core_web_sm")

In [3]:
# Three short sentences; the last one has no closing period.
doc = nlp(u"This is the first sentence. This is another sentence. This is the last sentence")

In [4]:
#ITERATE THROUGH THE SENTENCES
# doc.sents is iterated directly; each item is printed on its own line.
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence


In [5]:
#GET A SENTENCE: span objects; not normal strings
# doc.sents is materialised with list() so that one sentence can be picked by index.
list(doc.sents)[0]

This is the first sentence.

In [6]:
 #OWN PARTICULAR SEGMENTATION RULES
# Sample text containing a semicolon, used below to demonstrate a custom sentence boundary.
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [7]:
#DEFAULT SENTENCE SEGMENTATION
# The extra print('\n') after each sentence adds blank lines between them in the output.
for sent in doc.sents:
    print(sent)
    print('\n')

"Management is doing the right things; leadership is doing the right things."


-Peter Drucker




In [28]:
##TWO WAYS
#ADD A SEGMENTATION RULE
def set_custom_boundaries(doc):
    """Flag the token following every ';' as the start of a new sentence.

    Only ``doc[:-1]`` is scanned, i.e. the final token is never inspected,
    so ``index + 1`` always refers to an existing token of ``doc``.

    Args:
        doc: the document whose tokens are examined and updated in place.

    Returns:
        The same ``doc`` object, so the function can be used as a pipeline
        component.
    """
    # Collect the positions right after each semicolon first, then set the flags.
    after_semicolon = [tok.i + 1 for tok in doc[:-1] if tok.text == ';']
    for index in after_semicolon:
        doc[index].is_sent_start = True
    return doc

#CHANGE THE SEGMENTATION RULE

In [29]:
# Insert the semicolon rule ahead of the parser so the flags it sets are
# already present when the parser runs.
# The membership check keeps this cell re-runnable: a second run no longer
# tries to register a component under a name the pipeline already has.
if 'set_custom_boundaries' not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundaries,before='parser')
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

['tagger', 'parser', 'ner']

In [14]:
#ALL TOKENS EXCEPT LAST ONE
# This is the slice set_custom_boundaries loops over, which keeps token.i+1 in range.
doc[:-1]

"Management is doing the right things; leadership is doing the right things." -Peter

In [30]:
#NEW DOCUMENT
# Same text as before, processed after set_custom_boundaries was added to the pipeline.
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [31]:
#AFTER CHANGING THE PIPELINE: SEPARATES ON SEMI-COLON TOO
for sent in doc4.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things."
-Peter Drucker


In [32]:
#CHANGE THE RULES COMPLETELY

#RELOAD: DEFAULTS
# Rebinding `nlp` to a freshly loaded model discards the pipeline modified above.
nlp = spacy.load('en_core_web_sm')

In [33]:
# Test text: two sentences, a double newline, then a sentence broken by a single newline.
my_string = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

In [34]:
#DEFINE \N AS END OF SENTENCE; NOT .

#DEFAULT
# Baseline run with the reloaded pipeline, before any custom segmenter is added.
doc = nlp(my_string)
for sent in doc.sents:
    
    #SEPARATES ON BOTH \N\N AND .
    print(sent)

This is a sentence.
This is another.


This is a 
third sentence.


In [35]:
#CHANGE THE RULES
from spacy.pipeline import SentenceSegmenter

def split_on_newlines(doc):
    """Yield sentence spans of ``doc``, ending a sentence at newline tokens.

    A token counts as a separator when its text starts with ``'\\n'``.
    Separator tokens stay at the END of the sentence they close; the next
    sentence begins at the first following token that is not a separator.

    Compared with the naive version this handles two edge cases:
      * a separator token directly following another separator is treated
        as part of the same break instead of becoming the first token of
        the next sentence;
      * an empty ``doc`` yields nothing rather than a single empty span.

    Args:
        doc: the document to segment; it is iterated, sliced and passed to
            ``len()``, and each token must expose ``.text`` and ``.i``.

    Yields:
        Slices ``doc[start:end]`` that together cover every token exactly
        once, in order.
    """
    start = 0
    seen_newline = False

    for word in doc:
        is_separator = word.text.startswith('\n')  #WHATEVER SEPARATOR YOU WANT

        if seen_newline and not is_separator:
            # First real token after a break: close the previous sentence here.
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif is_separator:
            seen_newline = True

    # Emit whatever is left; skipped only when the doc has no tokens at all.
    if start < len(doc):
        yield doc[start:]

In [36]:
# Build a segmenter component that uses split_on_newlines as its splitting strategy.
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)


In [37]:
# Register the custom segmenter on the pipeline.
# NOTE(review): re-running this cell adds `sbd` again — confirm whether that raises in the installed spaCy version.
nlp.add_pipe(sbd)

In [39]:
# Re-process the same text now that the custom segmenter has been added.
doc = nlp(my_string)

In [40]:
#NO LONGER SPLITTING ON .
# Each printed span ends with its newline token, hence the extra blank lines in the output.
for sent in doc.sents:
    print(sent)

This is a sentence. This is another.


This is a 

third sentence.
