In [1]:
# Sentence detection with spaCy

import spacy

nlp = spacy.load('en_core_web_sm')

In [21]:
# Three short sentences used as the sample input for sentence detection.
text = (
    "Imagine this is a long sentence. "
    "Now imagine this is a huge sentence as well. "
    "Now, this will be even bigger."
)
doc = nlp(text)

def get_sentences(doc):
    """Return the sentence iterator exposed by ``doc.sents``.

    The value is handed back untouched (for a spaCy ``Doc`` it is a
    generator), so it can be walked only once and cannot be indexed.
    """
    sentence_iter = doc.sents
    return sentence_iter

def show_sentences(doc):
    """Print every sentence of ``doc`` on its own line as ``<index>: <sentence>``.

    Sentences are obtained through ``get_sentences`` and numbered from 0.
    """
    position = 0
    for sentence in get_sentences(doc):
        print(f"{position}: {sentence}")
        position += 1

In [7]:
# get_sentences returns doc.sents, which is a generator rather than a list.
sentences = get_sentences(doc)
print(sentences, end="\n\n")  # generator repr followed by a blank line

# A generator can be consumed in several ways, for example:
# 1. Iterating over it with a for loop
# 2. Pulling values one at a time with the built-in next() inside a while loop

show_sentences(doc)

<generator object at 0x7fbd3054c440>

Imagine this is a long sentence.
Now imagine this is a huge sentence as well.
Now, this will be even bigger.


In [8]:
# A generator does not support indexing, so sentences[0] raises TypeError.
# The error is the point of this cell: catch it and print it so that
# "Restart & Run All" is not interrupted by the demonstration.
# Materialize the generator with list(...) first if positional access is needed.
try:
    sentences[0]
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: 'generator' object is not subscriptable

In [None]:
# Detecting sentences with a custom sentence segmenter

In [26]:
# Sample text that contains hard line breaks and uses '---' as a delimiter
# between sentences. It is run through the loaded pipeline as-is to show how
# the sentences get split before any custom boundary rule is applied.
text = """ Let's say this a really
long text with a lot of words---As you have noticed,
the text have a
weird scenario at 
the end of
a sentence---What can we do in order to split the sentence succesfully?---let's gonna do it with spacy---but wait..., """
doc = nlp(text)
show_sentences(doc)

0:  Let's say this a really
long text with a lot of words---As you have noticed,
the text have a
weird scenario at 
the end of
a sentence
1: ---What can we do in order to split the sentence succesfully?---
2: let's gonna do it with spacy---but wait...,


In [None]:
# Custom sentence segmenter
def set_custom_boundaries(doc):
    """Flag the token that follows every ``'---'`` token as a sentence start.

    Args:
        doc: Indexable, sliceable sequence of tokens. Each token must expose
            ``text`` and ``i`` (its index in ``doc``) and accept assignment
            to ``is_sent_start``.

    Returns:
        The same ``doc`` object, modified in place.
    """
    # The final token has no successor, so it is left out of the scan.
    for tok in doc[:-1]:
        if tok.text != '---':
            continue
        following = doc[tok.i + 1]
        following.is_sent_start = True
    return doc