# POS Basics

In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [2]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [3]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [4]:
def show_details(doc):
    """Print one aligned row per token: text, coarse POS, fine-grained tag, tag description.

    Works on any iterable of tokens (a Doc, or a plain list of tokens).
    """
    for tok in doc:
        tag_description = spacy.explain(tok.tag_)
        print(f'{tok.text:{10}} {tok.pos_:{8}} {tok.tag_:{6}} {tag_description}')

In [5]:
show_details(doc)

The        DET      DT     determiner
quick      ADJ      JJ     adjective
brown      ADJ      JJ     adjective
fox        NOUN     NN     noun, singular or mass
jumped     VERB     VBD    verb, past tense
over       ADP      IN     conjunction, subordinating or preposition
the        DET      DT     determiner
lazy       ADJ      JJ     adjective
dog        NOUN     NN     noun, singular or mass
's         PART     POS    possessive ending
back       NOUN     NN     noun, singular or mass
.          PUNCT    .      punctuation mark, sentence closer


In [6]:
doc = nlp(u'I read books on NLP.')
r = doc[1]

show_details([r])

read       VERB     VBD    verb, past tense


In [7]:
doc = nlp(u'I read a book on NLP.')
r = doc[1]

show_details([r])

read       VERB     VBD    verb, past tense


In [8]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# Tally the tokens by coarse part of speech. The resulting dict is keyed by
# integer attribute IDs rather than by readable names.
POS_counts = doc.count_by(spacy.attrs.POS)
POS_counts

{84: 3, 85: 1, 90: 2, 92: 3, 94: 1, 97: 1, 100: 1}

In [9]:
doc.vocab[84].text

'ADJ'

In [10]:
# Resolve each integer key back to its string name through the vocab.
for k,v in sorted(POS_counts.items()):

    print(f"{k}. {doc.vocab[k].text}: {v}")

84. ADJ: 3
85. ADP: 1
90. DET: 2
92. NOUN: 3
94. PART: 1
97. PUNCT: 1
100. VERB: 1


In [11]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# Same tally as for POS, but over the fine-grained tags.
TAG_counts = doc.count_by(spacy.attrs.TAG)

# Print "<id>. <tag name>: <count>" in ascending ID order.
for k,v in sorted(TAG_counts.items()):

    print(f"{k}. {doc.vocab[k].text}: {v}")

74. POS: 1
1292078113972184607. IN: 1
10554686591937588953. JJ: 3
12646065887601541794. .: 1
15267657372422890137. DT: 2
15308085513773655218. NN: 3
17109001835818727656. VBD: 1


In [12]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# Same tally again, this time over syntactic dependency labels.
DEP_counts = doc.count_by(spacy.attrs.DEP)

# Print "<id>. <dependency label>: <count>" in ascending ID order.
for k,v in sorted(DEP_counts.items()):

    print(f"{k}. {doc.vocab[k].text}: {v}")

402. amod: 3
415. det: 2
429. nsubj: 1
439. pobj: 1
440. poss: 1
443. prep: 1
445. punct: 1
8110129090154140942. case: 1
8206900633647566924. ROOT: 1


# Visualizing Parts Of Speech

In [13]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

In [14]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [15]:
displacy.render(doc, style = "dep", jupyter = True, options = {"distance":110})

In [16]:
#displacy.serve(doc, style = "dep",options = {"distance":110})

In [17]:
for token in doc:
    print(f'{token.text:{10}} {token.pos_:{7}} {token.dep_:{7}} {spacy.explain(token.dep_)}')

The        DET     det     determiner
quick      ADJ     amod    adjectival modifier
brown      ADJ     amod    adjectival modifier
fox        NOUN    nsubj   nominal subject
jumped     VERB    ROOT    None
over       ADP     prep    prepositional modifier
the        DET     det     determiner
lazy       ADJ     amod    adjectival modifier
dog        NOUN    poss    possession modifier
's         PART    case    case marking
back       NOUN    pobj    object of preposition
.          PUNCT   punct   punctuation


In [18]:
doc2 = nlp(u"This is a sentence. This is another, possibly longer sentence.")

# Create spans from Doc.sents:
spans = list(doc2.sents)

displacy.render(spans, style='dep',jupyter = True, options={'distance': 110})

In [19]:
# Rendering options for the dependency visualizer. `compact` is an on/off flag,
# so pass the boolean True: the string 'True' only worked because any non-empty
# string (even 'False') is truthy in Python.
options = {'distance': 110, 'compact': True, 'color': 'yellow', 'bg': '#09a3d5', 'font': 'Times'}

displacy.render(spans, style='dep',jupyter = True, options=options)

# Named Entity Recognition (NER)

In [20]:
def show_ents(doc):
    """Print each named entity of ``doc`` as ``text - label - explanation``.

    When the document has no entities, a single fallback message is printed.
    """
    if not doc.ents:
        print('No named entities found.')
        return
    for entity in doc.ents:
        explanation = str(spacy.explain(entity.label_))
        print(entity.text + ' - ' + entity.label_ + ' - ' + explanation)

In [21]:
doc = nlp(u'May I go to Washington, DC next May to see the Washington Monument?')

show_ents(doc)

Washington - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [22]:
doc = nlp(u'Can I please borrow 500 dollars from you to buy some Microsoft stock?')

# Columns: entity text, start/end token index, start/end character offset, label.
for ent in doc.ents:
    print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

500 dollars 4 6 20 31 MONEY
Microsoft 11 12 53 62 ORG


In [23]:
doc = nlp(u'Tesla to build a U.K. factory for $6 million')

show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [24]:
from spacy.tokens import Span

# Look up the "ORG" label in the vocab's string store.
ORG = doc.vocab.strings[u"ORG"]

# Span over tokens [0, 1), i.e. the single token "Tesla", labelled as ORG.
new_ent = Span(doc, 0, 1, label=ORG)

# Keep the entities the model already found and append the new one.
doc.ents = list(doc.ents) + [new_ent]

In [25]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


---

In [26]:
doc = nlp(u'Our company plans to introduce a new vacuum cleaner. '
          u'If successful, the vacuum cleaner will be our first product.')

In [27]:
show_ents(doc)

first - ORDINAL - "first", "second", etc.


In [28]:
# Goal: find where the phrases occur in the doc,
# using the start and end token indices of each match.

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(doc.vocab)

In [29]:
# Phrases to search for; each one is run through the pipeline so the
# PhraseMatcher receives Doc objects rather than raw strings.
phrase_list = ["vacuum cleaner","vacuum-cleaner"]
phrase_pattern = [nlp(text) for text in phrase_list]

In [30]:
# Register every phrase pattern under the single match name "newproduct"
# (the second positional argument is left as None), then run the matcher.
matcher.add("newproduct",None,*phrase_pattern)
matches = matcher(doc)
# Each match is a 3-tuple; the last two items are used later as span start/end.
print(matches)

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]


In [31]:
from spacy.tokens import Span

# Look up the "PRODUCT" label in the vocab's string store.
PROD = doc.vocab.strings[u"PRODUCT"]
# match[1] and match[2] are the start and end token indices of each matched phrase.
new_ents = [Span(doc,match[1],match[2], label = PROD) for match in matches]

# Keep the model's own entities and add the matched phrases as PRODUCT entities.
doc.ents = list(doc.ents) + new_ents

In [32]:
doc.ents

(vacuum cleaner, vacuum cleaner, first)

In [33]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
first - ORDINAL - "first", "second", etc.


In [34]:
doc = nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars.')

show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [35]:
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

# For each noun chunk print: the chunk text, its root token, the root's
# dependency label, and the text of the root's head token.
for chunk in doc.noun_chunks:
    print(chunk.text+' - '+chunk.root.text+' - '+chunk.root.dep_+' - '+chunk.root.head.text)

Autonomous cars - cars - nsubj - shift
insurance liability - liability - dobj - shift
manufacturers - manufacturers - pobj - toward


In [36]:
len(list(doc.noun_chunks))

3

# Visualizing Named Entities

In [37]:
import spacy
from spacy.matcher import Matcher,PhraseMatcher
from spacy.tokens import Span
from spacy import displacy

nlp = spacy.load('en_core_web_sm')

In [38]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, Sony sold only 7 thousand Walkman music players.')

displacy.render(doc, style='ent', jupyter=True)

In [39]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)

In [40]:
doc2 = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, my kids sold a lot of lemonade.')

In [41]:
for sent in doc2.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)

  "__main__", mod_spec)


In [42]:
for sent in doc2.sents:
    # Parse each sentence on its own so entities can be checked per sentence.
    docx = nlp(sent.text)
    if docx.ents:
        # Render the already-parsed sentence instead of running the pipeline again.
        displacy.render(docx, style='ent', jupyter=True)
    else:
        # No entities in this sentence: print the plain text instead of rendering.
        print(docx.text)

By contrast, my kids sold a lot of lemonade.


In [43]:
# Restriction: limit the visualization to the listed entity labels.
options = {'ents': ['ORG', 'PRODUCT']}

displacy.render(doc, style='ent', jupyter=True, options=options)

In [44]:
colors = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', 'PRODUCT': 'radial-gradient(yellow, green)'}

options = {'ents': ['ORG', 'PRODUCT'], 'colors':colors}

displacy.render(doc, style='ent', jupyter=True, options=options)

# Sentence Segmentation

In [45]:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [46]:
# doc.sents cannot be indexed directly: doing so raises TypeError. Catch and
# show the error so the demonstration does not stop a top-to-bottom run.
try:
    doc.sents[1]
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: ignored

In [47]:
doc_sents = [sent for sent in doc.sents]
doc_sents[1]

This is another sentence.

In [48]:
print(doc_sents[2])

This is the last sentence.


In [49]:
type(doc_sents[1])

spacy.tokens.span.Span

In [50]:
print(doc_sents[1].start, doc_sents[1].end)

6 11


In [51]:
doc2 = nlp(u'This is a sentence. This is a sentence. This is a sentence.')

# Show each token's is_sent_start flag next to its text; the flag is True on
# the first token of every sentence.
for token in doc2:
    print(token.is_sent_start, " " + token.text)

True  This
None  is
None  a
None  sentence
None  .
True  This
None  is
None  a
None  sentence
None  .
True  This
None  is
None  a
None  sentence
None  .


In [52]:
doc3 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc3.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things."
-Peter
Drucker


In [53]:
def set_custom_boundaries(doc):
    """Pipeline component: start a new sentence right after every semicolon.

    The last token is skipped because nothing follows it. The same ``doc`` is
    modified in place and returned.
    """
    semicolon_positions = [tok.i for tok in doc[:-1] if tok.text == ";"]
    for position in semicolon_positions:
        doc[position + 1].is_sent_start = True
    return doc

# `before` names the existing pipeline component that the new one is inserted in front of.

nlp.add_pipe(set_custom_boundaries, before = "parser")
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [54]:
doc4 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc4.sents:
    print(sent)

"Management is doing things right;
leadership is doing the right things."
-Peter
Drucker


In [55]:
for sent in doc3.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things."
-Peter
Drucker


In [56]:
doc3[7].text

'leadership'

In [57]:
# The pipeline component above was allowed to set sentence starts, but assigning
# is_sent_start by hand on the already-processed doc3 raises ValueError. Catch
# and show the error so the demonstration does not stop a top-to-bottom run.
try:
    doc3[7].is_sent_start = True
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: ignored

In [62]:
# Reload the model so the pipeline no longer contains the
# set_custom_boundaries component added earlier.
nlp = spacy.load('en_core_web_sm')
mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."
doc = nlp(mystring)

# Show the default sentence split, token by token (newline tokens included).
for sent in doc.sents:
    print([token.text for token in sent])

['This', 'is', 'a', 'sentence', '.']
['This', 'is', 'another', '.', '\n\n']
['This', 'is', 'a', '\n', 'third', 'sentence', '.']


In [63]:
def split_on_newlines(doc):
    r"""Yield sentence slices of ``doc``, starting a new sentence after each newline token.

    A token counts as a newline when its text starts with "\n". The split is
    placed before the first non-whitespace token that follows it, so a run of
    whitespace tokens stays attached to the end of the preceding sentence
    instead of opening the next one (and a second consecutive newline token is
    no longer silently consumed as the start of a sentence).

    Args:
        doc: token sequence supporting iteration, ``len()`` and slicing; each
            token exposes ``text`` and its index ``i``.

    Yields:
        ``doc[start:end]`` slices, one per sentence. Nothing is yielded for an
        empty ``doc``.
    """
    start = 0
    seen_newline = False
    for word in doc:
        # Example: if the first non-whitespace token after a newline is at
        # index 8, emit doc[0:8] now; the next sentence begins at index 8.
        if seen_newline and not word.text.isspace():
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith("\n"):
            seen_newline = True
    # Emit the trailing sentence, but never an empty slice for an empty doc.
    if start < len(doc):
        yield doc[start:]

In [64]:
from spacy.pipeline import SentenceSegmenter
# Wrap the custom splitting strategy in a pipeline component and add it to the pipeline.
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
nlp.add_pipe(sbd)

In [65]:
nlp.pipe_names

['tagger', 'parser', 'ner', 'sentencizer_hook']

In [66]:
doc = nlp(mystring)
for sent in doc.sents:
    print([token.text for token in sent])

['This', 'is', 'a', 'sentence', '.', 'This', 'is', 'another', '.', '\n\n']
['This', 'is', 'a', '\n']
['third', 'sentence', '.']
