<a href="https://colab.research.google.com/github/yeesem/Natural-Laguage-Processing/blob/main/Sentence_Segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [95]:
import spacy
# Load the 'en_core_web_sm' pipeline; `nlp` is called on text in the cells below.
nlp = spacy.load('en_core_web_sm')

In [96]:
# Process a short three-sentence string; the result is used below to inspect sentence boundaries.
doc = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [97]:
# Iterate over the detected sentences and print each one.
for sent in doc.sents:
  print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [98]:
# doc.sents is a generator, not a list, so it cannot be indexed:
# doc.sents[0] raises an error.
doc.sents

<generator at 0x7a67e212e0c0>

In [99]:
# Materialise the generator into a list so that individual sentences can be indexed.
list(doc.sents)

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [100]:
# Index into the materialised list to get the first sentence.
list(doc.sents)[0]

This is the first sentence.

In [101]:
# A quote containing a semicolon, followed by an attribution, used as a harder segmentation example.
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [102]:
# Show the raw text of the processed document.
doc.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

In [103]:
# The default segmentation may or may not be what we want for this text.
# Each sentence is printed followed by extra blank lines to separate them visually.
for sents in doc.sents:
  print(sents)
  print('\n')

"Management is doing the right things; leadership is doing the right things."


-Peter Drucker




In [104]:
# Sentence segmentation can be customised by adding a segmentation rule.
# As a first step, inspect the tokens and their positions.
# NOTE: a later cell reuses the name `set_custom_boundaries` for the actual
# pipeline component, which shadows this exploratory helper.
def set_custom_boundaries(doc):
  """Print every token in `doc`, each followed by its index on the next line."""
  for tok in doc:
    # Token first, then its position within the document.
    print(tok, tok.i, sep='\n')

In [105]:
# Print every token of the quote together with its index.
set_custom_boundaries(doc)

"
0
Management
1
is
2
doing
3
the
4
right
5
things
6
;
7
leadership
8
is
9
doing
10
the
11
right
12
things
13
.
14
"
15
-Peter
16
Drucker
17


In [106]:
# Visit every token except the last one in the document.
def set_custom_boundaries2(doc):
  """Print the text of each token in `doc`, skipping the final token."""
  all_but_last = doc[:-1]
  for tok in all_but_last:
    print(tok.text)

In [107]:
# Print the text of every token except the last one.
set_custom_boundaries2(doc)

"
Management
is
doing
the
right
things
;
leadership
is
doing
the
right
things
.
"
-Peter


In [108]:
# Slice off the final token; the remaining span is displayed.
doc[:-1]

"Management is doing the right things; leadership is doing the right things." -Peter

In [109]:
from spacy.language import Language
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
  """Mark the token after every semicolon as the start of a new sentence.

  Args:
    doc: The document being processed; its tokens are modified in place.

  Returns:
    The same `doc` object that was passed in.
  """
  # The final token is excluded: if the text ends with ';' there is no
  # following token to mark, and indexing past the end would fail.
  semicolon_indices = [tok.i for tok in doc[:-1] if tok.text == ';']
  for idx in semicolon_indices:
    doc[idx + 1].is_sent_start = True
  return doc

In [110]:
# Insert the custom boundary component ahead of the parser.
# The membership check keeps this cell re-runnable: calling add_pipe again
# with a name that is already in the pipeline raises an error.
if "set_custom_boundaries" not in nlp.pipe_names:
  nlp.add_pipe("set_custom_boundaries", before='parser')
# Last expression: display the registered component, as the bare add_pipe call did.
nlp.get_pipe("set_custom_boundaries")

<function __main__.set_custom_boundaries(doc)>

In [111]:
# Confirm the pipeline order: 'set_custom_boundaries' should be listed before 'parser'.
nlp.pipe_names

['tok2vec',
 'tagger',
 'set_custom_boundaries',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [112]:
# Re-process the same quote, now with the custom boundary component in the pipeline.
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [113]:
# Print the sentences again to check the effect of the semicolon rule;
# the extra print adds blank lines between sentences.
for sent in doc4.sents:
  print(sent)
  print("\n")

"Management is doing the right things;


leadership is doing the right things."


-Peter Drucker




In [114]:
# Change the segmentation rule: start again from a freshly loaded pipeline,
# replacing the `nlp` object that had the custom component added.
nlp = spacy.load('en_core_web_sm')

In [115]:
# Sample text that mixes sentence-ending periods with a double newline and a single newline.
mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

In [116]:
# Print the string so the embedded newlines are rendered as line breaks.
print(mystring)

This is a sentence. This is another.

This is a 
third sentence.


In [117]:
# Process the sample text with the freshly loaded pipeline.
doc = nlp(mystring)

In [118]:
# Show the default segmentation of the newline-containing text, one sentence per print call.
for sentence in doc.sents:
  print(sentence)

This is a sentence.
This is another.


This is a 
third sentence.


In [123]:
def split_on_newlines(doc):
  """Yield consecutive slices of `doc`, breaking after newline tokens.

  A new slice begins at the token that follows a token whose text starts
  with a newline character. The newline token itself stays at the end of
  the slice it closes. A final slice running from the last break to the
  end of `doc` is always yielded.

  Args:
    doc: A sliceable sequence of tokens exposing `.text` and `.i`.

  Yields:
    Slices of `doc` (`doc[start:end]`) covering the input in order.
  """
  segment_start = 0
  pending_break = False

  for token in doc:
    if not pending_break:
      # Remember whether this token is a newline; the split itself happens
      # on the next token.
      pending_break = token.text.startswith('\n')
      continue
    # The previous token was a newline: close the current slice here.
    yield doc[segment_start:token.i]
    segment_start, pending_break = token.i, False

  yield doc[segment_start:]

In [127]:
# Reload the pipeline without the parser and let a rule-based sentencizer
# decide sentence boundaries instead.
nlp = spacy.load('en_core_web_sm', exclude=['parser'])

# Treat the newline character as the sentence-ending mark.
config = {'punct_chars': ['\n']}

# Place the sentencizer ahead of 'attribute_ruler'; the call is the last
# expression so the created component is displayed.
nlp.add_pipe('sentencizer', config=config, before='attribute_ruler')

<spacy.pipeline.sentencizer.Sentencizer at 0x7a67e5341f00>

In [128]:
# Check the pipeline: 'sentencizer' should be listed and 'parser' should be absent.
nlp.pipe_names

['tok2vec', 'tagger', 'sentencizer', 'attribute_ruler', 'lemmatizer', 'ner']

In [129]:
# Segment the sample text with the newline-based sentencizer and print each sentence.
doc2 = nlp(mystring)
for sent in doc2.sents:
  print(sent)

This is a sentence. This is another.

This is a 

third sentence.
