# Spacy Demo01

In [None]:
# import libraries
!pip install spacy
import spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/




In [None]:
# Download the English language model.
# There are 3 model sizes: small, medium, large.
# The small model takes around 4 secs to load
# The large model takes around 3 mins
# Note that some functions work only in the medium or large models.
# This notebook downloads `en_core_web_md`; `%time` reports how long the
# download command takes.

%time !python -m spacy download en_core_web_md

2023-02-16 02:47:17.017547: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-16 02:47:17.017705: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-16 02:47:20.102139: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download

In [None]:
# Instantiate the English NLP pipeline by loading the `en_core_web_md` model.
# `nlp` is reused by every cell below.
nlp = spacy.load("en_core_web_md")

In [None]:
# Tokenisation: run the text through the pipeline and print one row per
# token showing its tag, part of speech (POS), raw text and lemma.
# Shorter alternative input:
# doc = nlp('Drinking a glass of wine is good for your wellbeing!')
sample_text = '''Such an analysis can reveal features that are not easily visible from the variation in the individual genes and can lead to a picture of expression that is more biologically transparent and accessible to interpretation
'''
doc = nlp(sample_text)
for token in doc:
    row = f"token:{token}\t tag:{token.tag_}\t\tPOS:{token.pos_}\t\t text:'{token.text}' \tlemma:{token.lemma_}\t "
    print(row)

token:Such	 tag:PDT		POS:DET		 text:'Such' 	lemma:such	 
token:an	 tag:DT		POS:DET		 text:'an' 	lemma:an	 
token:analysis	 tag:NN		POS:NOUN		 text:'analysis' 	lemma:analysis	 
token:can	 tag:MD		POS:AUX		 text:'can' 	lemma:can	 
token:reveal	 tag:VB		POS:VERB		 text:'reveal' 	lemma:reveal	 
token:features	 tag:NNS		POS:NOUN		 text:'features' 	lemma:feature	 
token:that	 tag:WDT		POS:PRON		 text:'that' 	lemma:that	 
token:are	 tag:VBP		POS:AUX		 text:'are' 	lemma:be	 
token:not	 tag:RB		POS:PART		 text:'not' 	lemma:not	 
token:easily	 tag:RB		POS:ADV		 text:'easily' 	lemma:easily	 
token:visible	 tag:JJ		POS:ADJ		 text:'visible' 	lemma:visible	 
token:from	 tag:IN		POS:ADP		 text:'from' 	lemma:from	 
token:the	 tag:DT		POS:DET		 text:'the' 	lemma:the	 
token:variation	 tag:NN		POS:NOUN		 text:'variation' 	lemma:variation	 
token:in	 tag:IN		POS:ADP		 text:'in' 	lemma:in	 
token:the	 tag:DT		POS:DET		 text:'the' 	lemma:the	 
token:individual	 tag:JJ		POS:ADJ		 text:'individual' 	lemma:in

In [None]:
# Named Entity Recognition: print every entity found in the sentence
# together with its entity label.
# Alternative input:
# doc = nlp("He was born in Canberra, Australia in 14/1/1974")
sentence = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(sentence)
for ent in doc.ents:
    entity_row = f"Entity: {ent.text} \t\t type:{ent.label_}"
    print(entity_row)

Entity: Apple 		 type:ORG
Entity: U.K. 		 type:GPE
Entity: $1 billion 		 type:MONEY


In [None]:
# Render the sentence with each recognised entity highlighted and
# labelled inline (displaCy 'ent' style).
from spacy import displacy

sentence = 'I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ'
doc = nlp(sentence)
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Noun-phrase chunking: for every noun chunk print its text, its label
# and the text of its root token.
sentence = "Wall Street Journal just published an interesting piece on crypto currencies"
doc = nlp(sentence)
for chunk in doc.noun_chunks:
    chunk_row = f"Text:{chunk.text},\t label:{chunk.label_},\t root:{chunk.root.text}"
    print(chunk_row)

Text:Wall Street Journal,	 label:NP,	 root:Journal
Text:an interesting piece,	 label:NP,	 root:piece
Text:crypto currencies,	 label:NP,	 root:currencies


In [None]:
# Parse the sentence and draw its grammatical dependency tree
# (displaCy 'dep' style).
from spacy import displacy

sentence = 'Wall Street Journal just published an interesting piece on crypto currencies'
render_options = {'distance': 90}
doc = nlp(sentence)
displacy.render(doc, style='dep', jupyter=True, options=render_options)

In [None]:
# Rule-based matcher
# import spaCy Matcher
from spacy.matcher import Matcher

# Create a matcher that shares the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)
# Register the rule once, here, rather than inside the function:
# Matcher.add extends the patterns stored under an existing key, so adding
# the rule on every call would keep growing this shared matcher with
# duplicate copies of the same pattern.
matcher.add('FULL_NAME', [[{'POS': 'PROPN'}, {'POS': 'PROPN'}]])


def extract_full_name(text: str) -> list:
    """Return the text of every match the shared ``matcher`` finds in ``text``.

    The text is processed with ``nlp`` and matched against the rules
    registered on ``matcher``; the 'FULL_NAME' rule matches two consecutive
    tokens whose POS is PROPN (proper noun).

    Args:
        text: The sentence or passage to search.

    Returns:
        A list with the text of each matched span, in the order the matcher
        reports the matches. Empty when nothing matches.
    """
    doc = nlp(text)
    return [doc[start:end].text for _match_id, start, end in matcher(doc)]

In [None]:
# Extract the full names mentioned in a sample sentence and show them.
sentence = "I met John Richardson almost a year after Daniel Zhang married Lucy Khan"
full_names = extract_full_name(sentence)
print(f"Full names: {full_names}")

Full names: ['John Richardson', 'Daniel Zhang', 'Lucy Khan']


In [None]:
# Removing stop words
from spacy.lang.en.stop_words import STOP_WORDS
text = """He determined to drop his litigation with the monastry, and relinquish his claims to the wood-cutting and
fishery rights at once. He was the more ready to do this because the rights had become much less valuable, and he had
indeed the vaguest idea where the wood and river in question were."""
nlp_text = nlp(text)
# Raw text of every token, in document order.
token_list = [token.text for token in nlp_text]
# Look each word up in the vocabulary and keep it only when its lexeme is
# not flagged as a stop word (punctuation and newline tokens are kept).
filtered_text = [word for word in token_list if not nlp.vocab[word].is_stop]
print(f"Token list: \n{token_list}")
print(f"\nFiltered text: \n{filtered_text}")

Token list: 
['He', 'determined', 'to', 'drop', 'his', 'litigation', 'with', 'the', 'monastry', ',', 'and', 'relinquish', 'his', 'claims', 'to', 'the', 'wood', '-', 'cutting', 'and', '\n', 'fishery', 'rights', 'at', 'once', '.', 'He', 'was', 'the', 'more', 'ready', 'to', 'do', 'this', 'because', 'the', 'rights', 'had', 'become', 'much', 'less', 'valuable', ',', 'and', 'he', 'had', '\n', 'indeed', 'the', 'vaguest', 'idea', 'where', 'the', 'wood', 'and', 'river', 'in', 'question', 'were', '.']

Filtered text: 
['determined', 'drop', 'litigation', 'monastry', ',', 'relinquish', 'claims', 'wood', '-', 'cutting', '\n', 'fishery', 'rights', '.', 'ready', 'rights', 'valuable', ',', '\n', 'vaguest', 'idea', 'wood', 'river', 'question', '.']


In [None]:
# Word vectors: for each token report whether it has a vector, the vector
# norm, and whether the token is out-of-vocabulary (OOV).
tokens = nlp("dog cat banana afskfsd")
for token in tokens:
    # Each attribute sits inside {} so that its value (not its name) is printed.
    print(f"Token:\t{token.text}, has vector:\t{token.has_vector}, vector norm:\t{token.vector_norm}, is OOV:\t{token.is_oov}")
# Show the full vector of the first token as an example.
print(f"\nToken 1: {tokens[0]}\n Vector:\n{tokens[0].vector}")

Token:	dog, has vector:	True, token.vector_norm, token.is_oov
Token:	cat, has vector:	True, token.vector_norm, token.is_oov
Token:	banana, has vector:	True, token.vector_norm, token.is_oov
Token:	afskfsd, has vector:	False, token.vector_norm, token.is_oov

Token 1: dog
 Vector:
[ 1.2330e+00  4.2963e+00 -7.9738e+00 -1.0121e+01  1.8207e+00  1.4098e+00
 -4.5180e+00 -5.2261e+00 -2.9157e-01  9.5234e-01  6.9880e+00  5.0637e+00
 -5.5726e-03  3.3395e+00  6.4596e+00 -6.3742e+00  3.9045e-02 -3.9855e+00
  1.2085e+00 -1.3186e+00 -4.8886e+00  3.7066e+00 -2.8281e+00 -3.5447e+00
  7.6888e-01  1.5016e+00 -4.3632e+00  8.6480e+00 -5.9286e+00 -1.3055e+00
  8.3870e-01  9.0137e-01 -1.7843e+00 -1.0148e+00  2.7300e+00 -6.9039e+00
  8.0413e-01  7.4880e+00  6.1078e+00 -4.2130e+00 -1.5384e-01 -5.4995e+00
  1.0896e+01  3.9278e+00 -1.3601e-01  7.7732e-02  3.2218e+00 -5.8777e+00
  6.1359e-01 -2.4287e+00  6.2820e+00  1.3461e+01  4.3236e+00  2.4266e+00
 -2.6512e+00  1.1577e+00  5.0848e+00 -1.7058e+00  3.3824e+00  3.

In [None]:
# Word similarity: compare every token with every token (itself included)
# and print the similarity score of each ordered pair.
tokens = nlp("dog cat banana apple")
token_pairs = [(left, right) for left in tokens for right in tokens]
for left, right in token_pairs:
    score = left.similarity(right)
    print(f"Token 1:\t{left.text},\t token 2:{right.text},\t similarity:{score}")

Token 1:	dog,	 token 2:dog,	 similarity:1.0
Token 1:	dog,	 token 2:cat,	 similarity:0.8220816850662231
Token 1:	dog,	 token 2:banana,	 similarity:0.2090904712677002
Token 1:	dog,	 token 2:apple,	 similarity:0.22881005704402924
Token 1:	cat,	 token 2:dog,	 similarity:0.8220816850662231
Token 1:	cat,	 token 2:cat,	 similarity:1.0
Token 1:	cat,	 token 2:banana,	 similarity:0.2235882580280304
Token 1:	cat,	 token 2:apple,	 similarity:0.2036806046962738
Token 1:	banana,	 token 2:dog,	 similarity:0.2090904712677002
Token 1:	banana,	 token 2:cat,	 similarity:0.2235882580280304
Token 1:	banana,	 token 2:banana,	 similarity:1.0
Token 1:	banana,	 token 2:apple,	 similarity:0.6646699905395508
Token 1:	apple,	 token 2:dog,	 similarity:0.22881005704402924
Token 1:	apple,	 token 2:cat,	 similarity:0.2036806046962738
Token 1:	apple,	 token 2:banana,	 similarity:0.6646699905395508
Token 1:	apple,	 token 2:apple,	 similarity:1.0


In [None]:
# Sentiment analysis with textBlob
!pip install textBlob
from textblob import TextBlob

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Detect the sentiment of a text and print the resulting sentiment value.
# Alternative input:
# text = "Textblob is amazingly simple to use. What great fun!"
text = "I am so happy with my progress in the data science course"
textBlob = TextBlob(text)
print(textBlob.sentiment)

Sentiment(polarity=0.8, subjectivity=1.0)




---



---



> > > > > > > > > © 2024 Institute of Data


---



---



