In [1]:
import spacy 

# Data Structure 1: string, hash, vocab, lexeme

In [10]:
nlp = spacy.load('en_core_web_sm')

In [11]:
doc = nlp('I love coffee')

In [12]:
coffee_hash = nlp.vocab.strings['coffee']

In [13]:
coffee_string = nlp.vocab.strings[coffee_hash]

In [6]:
string = nlp.vocab.strings[3197928453018144401]

KeyError: "[E018] Can't retrieve string for hash '3197928453018144401'."

In [14]:
doc.vocab.strings['coffee']

3197928453018144401

In [15]:
doc.vocab.strings[3197928453018144401]

'coffee'

In [29]:
lexeme = nlp.vocab['12']

In [30]:
lexeme

<spacy.lexeme.Lexeme at 0x7f9ee21dfb88>

In [31]:
lexeme.text, lexeme.is_alpha, lexeme.sentiment, lexeme.orth

('12', False, 0.0, 1124146173557384544)

In [32]:
bowie_id = nlp.vocab.strings['Bowie']

# Data Structure 2: Doc, Token, Span

In [34]:
from spacy.tokens import Doc 

In [51]:
words = ['Hello', 'London', '!']
spaces = [True, False, False]

In [52]:
doc = Doc(nlp.vocab, words=words, spaces=spaces)

In [53]:
for tkn in doc: 
    print(f'{tkn.text}')

Hello
London
!


In [54]:
from spacy.tokens import Span

In [55]:
span = Span(doc, 0, 2)

In [56]:
span.text

'Hello London'

In [73]:
span_with_label = Span(doc, 0, 3, label='GREETING')

In [74]:
span_with_label.text 

'San Francisco considers'

In [77]:
doc.ents.add([span_with_label])

AttributeError: 'tuple' object has no attribute 'add'

In [70]:
for ent in doc.ents: 
    print(f'{ent.text}: {ent.label_}')

San Francisco considers: GREETING


In [71]:
doc = nlp(u"San Francisco considers banning sidewalk delivery robots")

In [72]:
for ent in doc.ents: 
    print(f'{ent.text}: {ent.label_}')

San Francisco: GPE


# Vector and Semantic Similarities

In [81]:
nlp = spacy.load('en_core_web_lg')

In [100]:
doc1 = nlp('I like BYRON!')
doc2 = nlp('I eat HONEST burger three times a week')

In [101]:
similarity = doc1.similarity(doc2)
print(similarity)

0.6928556422326514


# Combining models and rules 

In [102]:
from spacy.matcher import Matcher 

In [103]:
matcher = Matcher(nlp.vocab)

In [105]:
pattern = [{'LEMMA': 'love', 'POS': 'VERB'}, {'LOWER': 'cats'}]
matcher.add('LOVE_CATS', None, pattern)

In [110]:
pattern = [{'TEXT': 'very', 'OP': '+'}, {'TEXT': 'happy'}]
matcher.add('VERY_HAPPY', None, pattern)

In [111]:
doc = nlp("I love cats and I'm very very happy")
matches = matcher(doc)

In [115]:
for match_id, start, end in matches: 
    print(f'match id: {match_id}')
    match_span = doc[start:end]
    print(f'matched results: {match_span.text}')
    print(f'matched results: {match_span.root.text}')
    print(f'matched results: {match_span.root.head.text}')

match id: 9137535031263442622
matched results: love cats
matched results: love
matched results: love
match id: 2447047934687575526
matched results: very happy
matched results: happy
matched results: 'm
match id: 2447047934687575526
matched results: very very happy
matched results: happy
matched results: 'm


In [116]:
from spacy.matcher import PhraseMatcher 

In [117]:
matcher = PhraseMatcher(nlp.vocab)

In [118]:
pattern = nlp("Golden Retriever")
matcher.add('DOG', None, pattern)
doc = nlp("I have a Golden Retriever")

In [119]:
for match_id, start, end in matcher(doc):
    # Get the matched span
    span = doc[start:end]
    print('Matched span:', span.text)

Matched span: Golden Retriever


In [120]:
spacy.explain('PROPN')

'proper noun'

In [125]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

# Create the match patterns
pattern1 = [{"LOWER": "Amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
pattern2 = [{"LOWER": "ad-free"}, {"POS": "NOUN"}]

# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", None, pattern1)
matcher.add("PATTERN2", None, pattern2)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)

In [126]:
len(matcher(doc))

0

In [127]:
[token.text for token in nlp("ad-free viewing")]

['ad', '-', 'free', 'viewing']

In [128]:
[token.pos_ for token in nlp("ad-free viewing")]

['NOUN', 'PUNCT', 'ADJ', 'NOUN']

In [129]:
*[token.pos_ for token in nlp("ad-free viewing")]

SyntaxError: can't use starred expression here (<ipython-input-129-20018f4df100>, line 4)