In [7]:
import spacy
nlp = spacy.load('en_core_web_sm')

doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')
for token in doc:
    print(token.text,token.pos_,token.dep_) # pos gives part of speech/dep stands for syntax dependencies

print("\n")
print(nlp.pipeline)
print(nlp.pipe_names)
print("\n")

doc2 = nlp(u"Tesla isn't looking into startups anymore")
for token in doc2:
    print(token.text,token.pos_,token.dep_)

print("\n")
print(doc2[0])
print(doc2[0].pos_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x000001DC6F86F8F0>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x000001DC6F86D9D0>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x000001DC6E7CA7A0>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x000001DC712B3E10>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x000001DC6D89D050>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x000001DC6E7CA880>)]
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod


Tesla
PROPN


In [10]:
doc3 = nlp(u"Although commonly attributed to John Lennon from his song 'Beautiful Boy',\
           'Life is what happens to us while we are making other plans' was written by \
           cartoonistAllen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.")

life_quote = doc3[16:29]
print(life_quote)
print(type(life_quote))
print(type(doc3))

Life is what happens to us while we are making other plans'
<class 'spacy.tokens.span.Span'>
<class 'spacy.tokens.doc.Doc'>


In [15]:
doc4 = nlp("This is the first sentence. This is another sentence. This is the last sentence.")
for sentence in doc.sents:
    print(sentence)

print(doc4[6])

print(doc4[6].is_sent_start)
print(doc4[7])
print(doc4[8].is_sent_start)
print(doc4[9].is_sent_end)

Tesla is looking at buying U.S. startup for $6 million
This
True
is
False
False


In [21]:
import spacy

nlp = spacy.load('en_core_web_sm')
mystring = '"We\'re moving to L.A.!"'
print(mystring)

# doc = nlp(mystring)
# for token in doc:
#     print(token.text)

# print("\n")
# doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")
# for t in doc2:
#     print(t)

# print("\n")
# doc3 = nlp(u"A 5km NYC cab ride costs $10.30")
# for t in doc3:
#     print(t)

# doc4 = nlp(u"Let's visit St.Louis in the U.S. next year")
# for t in doc4:
#     print(t)
# print(len(doc4))

print(doc4.vocab)
print(len(doc4.vocab))

"We're moving to L.A.!"
<spacy.vocab.Vocab object at 0x000001DC65A00160>
794


In [23]:
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')
for token in doc8:
    print(token.text,end='|')

Apple|to|build|a|Hong|Kong|factory|for|$|6|million|

In [25]:
for entity in doc8.ents:
    print(entity,entity.label_)
    print(str(spacy.explain(entity.label_)))
    print('\n')

Apple ORG
Companies, agencies, institutions, etc.


Hong Kong GPE
Countries, cities, states


$6 million MONEY
Monetary values, including unit




In [27]:
doc9 = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')

for chunk in doc9.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


In [30]:
from spacy import displacy
doc = nlp(u"Apple is going to build a U.K factory for $6 million.")
displacy.render(doc,style='dep',options={'distance':120}) # here the distance is used to adjust the size of the image
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousands iPods for a profit of $6 million.")
displacy.render(doc,style='ent')

In [35]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

p_stemmer = PorterStemmer()
s_stemmer = SnowballStemmer(language='english')
words = ['run','runner','ran','runs','easily','fairly','fairness']

for word in words:
    print(word+'---->',p_stemmer.stem(word))

print('\n')

words = ['generous','generation','generously','generate']

print('\n')
for word in words:
    print(word+'---->'+s_stemmer.stem(word))

run----> run
runner----> runner
ran----> ran
runs----> run
easily----> easili
fairly----> fairli
fairness----> fair




generous---->generous
generation---->generat
generously---->generous
generate---->generat


In [39]:
import spacy 
nlp = spacy.load('en_core_web_sm')
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

for token in doc1:
    print(token.text,'\t',token.pos_,'\t',token.lemma,'\t',token.lemma_)

def show_lemmas(text):
    for token in text:
        print(f"{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}")

print('\n')
doc2 = nlp(u'I saw ten mice today!')
show_lemmas(doc2)

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


I            PRON   4690420944186131903    I
saw          VERB   11925638236994514241   see
ten          NUM    7970704286052693043    ten
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


In [49]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Check current stop words and count
print(nlp.Defaults.stop_words)
print(len(nlp.Defaults.stop_words))

# Check if specific words are stop words
print(nlp.vocab['is'].is_stop)  # 'is' is a stop word
print(nlp.vocab['mystery'].is_stop)  # 'mystery' is not a stop word

# Add a new stop word
nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop = True  # Mark 'btw' as a stop word in the vocab
print(nlp.vocab['btw'].is_stop)  # Now 'btw' is a stop word

# Check if 'beyond' is a stop word
print(nlp.vocab['beyond'].is_stop)  # 'beyond' is originally a stop word

# Safely remove 'beyond' from stop words if it's there
if 'beyond' in nlp.Defaults.stop_words:
    nlp.Defaults.stop_words.remove('beyond')
    nlp.vocab['beyond'].is_stop = False  # Mark 'beyond' as not a stop word

# Check if 'beyond' is still a stop word
print(nlp.vocab['beyond'].is_stop)


{'they', '’ll', 'least', 'could', 'put', 'themselves', 'whose', 're', 'seem', 'becomes', 'been', 'beside', 'sometimes', 'eight', 'whenever', 'me', 'now', 'either', 'about', '’m', 'nor', 'whom', 'someone', 'everywhere', 'however', 'its', 'hereupon', 'give', 'whatever', 'ten', 'whence', 'we', 'four', 'anyone', 'was', 'part', 'using', 'alone', 'being', 'our', 'five', 'take', 'make', 'off', 'with', 'than', 'be', 'still', 'while', 'some', 'where', 'towards', 'top', 'had', 'my', "'m", 'thereafter', 'onto', 'another', 'ourselves', 'latter', 'afterwards', 'becoming', 'whole', 'am', 'from', 'what', 'when', 'a', 'besides', 'third', 'over', 'no', 'further', 'mostly', 'do', '’s', 'or', 'since', 'well', 'hence', 'rather', 'two', 'everything', 'call', 'such', 'six', 'moreover', 'thru', 'if', 'serious', 'please', 'twenty', 'and', 'also', 'because', 'show', 'n‘t', 'seeming', 'namely', 'who', 'those', 'by', 'sixty', 'hers', 'whereby', 'hereby', 'see', 'may', 'out', 'them', 'during', 'ours', 'meanwhile'

In [54]:
import spacy
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
# SolarPower 
# Solar-power
# Solar power

# the lower is to get the lowercase form of the token text
# the length is used to get the length of a text,orth the exact verbatim text of a token,IS_PUNCT,IS_ASCII,IS_DIGIT,IS_UPPER,IS_STOP,IS_LOWER - are all used for the pattern purposes read more in the course

pattern1 = [{'LOWER':'solarpower'}]
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}]
pattern3 = [{'LOWER':'solar'},{'LOWER':'power'}]
matcher.add('SolarPower',[pattern1,pattern2,pattern3])  # here SolarPower is the name of the matcher

doc = nlp(u'The Solar Power industry continues to grow a solarpower increases. Solar-power is amazing')
found_matches=matcher(doc)
print(found_matches)

for match_id,start,end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id,string_id,start,end,span.text)


print("\n")
matcher.remove('SolarPower')
# solarpower,SolarPower
pattern1 = [{'LOWER':'solarpower'}]
# solar--power  / solar.power etc
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True,'OP':'*'},{'LOWER':'power'}]
matcher.add('SolarPower',[pattern1,pattern2])

doc2 = nlp(u'Solar--power is solarpower yay!')

found_matches=matcher(doc2)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]
8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [59]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

with open(r"C:\Users\Vaibhav\OneDrive\Documents\FolderPython\Artificial_Intelligence\Natural_Language_Processing\Nlp_revision\TextFiles\reaganomics.txt") as f:
    doc = nlp(f.read())

phrase_list =['voodoo economics','supply-side economics','trickle-down economics','free-market economics']
phrase_matcher = PhraseMatcher(nlp.vocab)
phrase_patterns = [nlp(text) for text in phrase_list]
phrase_matcher.add('EconMatcher',None,*phrase_patterns)
found_matches = phrase_matcher(doc)

for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # Get the string representation
    span = doc[start:end]  # The matched span
    # span = doc[start-2:end+2] # this gives +-2 words
    print(f'Match ID: {string_id}, Text: {span.text}, Start: {start}, End: {end}')


Match ID: EconMatcher, Text: supply-side economics, Start: 41, End: 45
Match ID: EconMatcher, Text: trickle-down economics, Start: 49, End: 53
Match ID: EconMatcher, Text: voodoo economics, Start: 54, End: 56
Match ID: EconMatcher, Text: free-market economics, Start: 61, End: 65
Match ID: EconMatcher, Text: supply-side economics, Start: 673, End: 677
Match ID: EconMatcher, Text: trickle-down economics, Start: 2987, End: 2991
