In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [2]:
doc = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [3]:
# doc.sents is a generator

for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [4]:
# cannot individually grab sentences using .sents
doc.sents[0]

TypeError: 'generator' object is not subscriptable

In [5]:
doc[0]

This

In [6]:
list(doc.sents)[0]

This is the first sentence.

In [7]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [8]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [9]:
doc.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

sentence segmentation

In [10]:
for sent in doc.sents:
    print(sent)
    print("\n")

"Management is doing the right things; leadership is doing the right things."


-Peter Drucker




new rule to the pipeline

In [11]:
# ADD A SEGMENTATION RULE

def set_custom_boundaries(doc):
    for token in doc:
        print(token)
        print(token.i) # token.i -> get the index of the token

set_custom_boundaries(doc)

"
0
Management
1
is
2
doing
3
the
4
right
5
things
6
;
7
leadership
8
is
9
doing
10
the
11
right
12
things
13
.
14
"
15
-Peter
16
Drucker
17


In [12]:
from spacy.language import Language

In [13]:
# ADD A SEGMENTATION RULE
@Language.component("set_custom_boundaries")

def set_custom_boundaries(doc):
    for token in doc[:-1]: # go up to but not including the last token
        if token.text == ';':
            doc[token.i+1].is_sent_start = True

    return doc

In [14]:
nlp.add_pipe("set_custom_boundaries", before="parser")

nlp.pipe_names

['tok2vec',
 'tagger',
 'set_custom_boundaries',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [15]:
doc[:-1]

"Management is doing the right things; leadership is doing the right things." -Peter

In [16]:
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

for sent in doc4.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things."
-Peter Drucker


In [None]:
# CHANGE SEGMENTATION RULES

In [17]:
nlp = spacy.load("en_core_web_sm")

In [18]:
mystring = u"This is a sentence. This is another. \n\nThis is a \nthird sentence"

In [19]:
print(mystring)

This is a sentence. This is another. 

This is a 
third sentence


In [20]:
# line breaks as sents

doc = nlp(mystring)

In [21]:
for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another.


This is a 
third sentence


SentenceSegmenter is removed...

In [29]:
from spacy.pipeline import SentenceSegmenter

ImportError: cannot import name 'SentenceSegmenter' from 'spacy.pipeline' (C:\Users\ozret\AppData\Local\Programs\Python\Python310\lib\site-packages\spacy\pipeline\__init__.py)

In [27]:
def split_on_newlines(doc):
    start = 0
    seen_newline = False
    
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith("\n"):
            seen_newline = True
    
    yield doc[start:]

In [26]:
sbd = Sentencizer(nlp.vocab, strategy = split_on_newlines)

nlp.add_pipe(sbd)

doc = nlp(mystring)

for sentence in doc.sents:
    print(sentence)

TypeError: __init__() got an unexpected keyword argument 'strategy'