In [26]:
import spacy

nlp = spacy.load("en_core_web_sm") #trained 

doc = nlp("Pacific ocean") #nlp tokenizes text and creates doc object
token = doc[0]  #select the first token Pacific 

print(token.text)

Pacific


In [27]:
print(f"Token: {token.text}")
print(f"Lemma: {token.lemma}, Dependency: {token.dep_}")
print(f"Part of Speech: {token.tag_}, Shape: {token.shape_}, Entity Type: {token.ent_type_}")

Token: Pacific
Lemma: 7543293588541783153, Dependency: compound
Part of Speech: NNP, Shape: Xxxxx, Entity Type: LOC


In [23]:
print(f" Vector: {token.vector}")

 Vector: [ 7.39085555e-01  5.25682569e-01  2.57804394e-01 -8.24920654e-01
 -9.66312289e-02  2.25829601e+00 -4.07572150e-01  4.88600016e-01
 -1.12417674e+00 -2.66576946e-01 -7.24266529e-01  5.63909829e-01
 -4.29322943e-03  3.18455517e-01 -4.00206178e-01 -7.42534757e-01
  8.94871354e-02  6.93928957e-01  1.18385804e+00  2.33339190e+00
 -5.19754529e-01 -1.10967648e+00  2.37156451e-02  6.94226086e-01
  3.46426904e-01 -5.54768026e-01  2.58785248e-01 -5.09009361e-01
  1.41621363e+00 -6.01309359e-01 -8.77647251e-02  2.28351325e-01
 -6.57674134e-01  2.96291947e-01  8.46246839e-01 -1.17196918e+00
 -1.40711880e+00  1.40986252e+00 -5.13012826e-01  1.65633166e+00
 -1.25865281e-01 -8.83003950e-01  1.00630736e+00 -2.52338082e-01
 -9.54092801e-01  1.23219691e-01 -1.02663487e-02 -5.84007621e-01
  1.21982050e+00 -3.68187338e-01  4.74482924e-01  6.45294189e-02
 -1.59020126e-01 -7.70856023e-01  3.05951953e-01  1.36112913e-01
 -5.84416151e-01 -6.22123837e-01 -8.22847784e-01 -6.19708002e-02
 -1.85400867e+00

In [51]:
#Using rule based matching to capture: null hypothesis, p-value
#import matcher

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm") #trained 

doc = nlp("We failed to reject the null hypothesis with a p-value of .01")

#Printing each token
for token in doc:
    print(f"token: {token}")

token: We
token: failed
token: to
token: reject
token: the
token: null
token: hypothesis
token: with
token: a
token: p
token: -
token: value
token: of
token: .01


In [50]:
#initialize Matcher with shared vocab
matcher = Matcher(nlp.vocab)

#create the pattern to capture instances of null hypothesis and p-value/p value
patterns = [[{"TEXT": "null"}, {"TEXT": "hypothesis"}],[{"TEXT": "p"}, {}, {"TEXT": "value"}]]

# Add the pattern to the matcher
matcher.add("HYP_PATTERN", patterns)


# Use the matcher on the doc
matches = matcher(doc)
print("Matches:", [doc[start:end].text for match_id, start, end in matches])



Matches: ['null hypothesis', 'p-value']


In [2]:
#Using PhraseMatcher and a custom list

import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
doc = nlp("This study pointed to the assessment of the chemical composition (F, Ca, Mg, Na, K, Fe, Mn, Zn, Cu, Ni,         \
        Co, Cr, Cd, and carbohydrate) of different marine seaweeds (red, green, and brown) from the                     \
        Egyptian Mediterranean Sea coast.")

metals = ["F", "Ca", "Mg", "Na", "K", "Fe", "Mn", "Zn", "Cu", "Ni", "Co", "Cr", "Cd"]

# Initialize the PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

# Create pattern Doc objects and add them to the matcher
#Use nlp.pipe() when processing large volumes of text
patterns = list(nlp.pipe(metals)) 
matcher.add("METALS_PATTERN", patterns)

# Call the matcher on the test document and print the result
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])

[F, Ca, Mg, Na, K, Fe, Mn, Zn, Cu, Ni, Co, Cr, Cd]


In [45]:
# Iterate over the matches
for match_id, start, end in matches:
    # Create a Span with the label for "metals"
    span = Span(doc, start, end, label="Metal")   

    # Overwrite the doc.ents and add the span
    doc.ents = [span] 


    # Get the span's root head token
    span_root_head = span.root.head
    # Print the text of the span root's head token and the span text
    #print(span_root_head.text, "-->", span.text)

    # Print the entities in the document
    print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "Metal"])

[('F', 'Metal')]
[('Ca', 'Metal')]
[('Mg', 'Metal')]
[('Na', 'Metal')]
[('K', 'Metal')]
[('Fe', 'Metal')]
[('Mn', 'Metal')]
[('Zn', 'Metal')]
[('Cu', 'Metal')]
[('Ni', 'Metal')]
[('Co', 'Metal')]
[('Cr', 'Metal')]
[('Cd', 'Metal')]


In [52]:
#retrieve the indices (called spans) of each match in the text
for mid, start, end in matches:
    print(start, end, doc[start:end])

11 12 F
13 14 Ca
15 16 Mg
17 18 Na
19 20 K
21 22 Fe
23 24 Mn
25 26 Zn
27 28 Cu
29 30 Ni
31 32 Co
33 34 Cr
35 36 Cd


In [53]:

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

doc = nlp("This study pointed to the assessment of the chemical composition (F, Ca, Mg, Na, K, Fe, Mn, Zn, Cu, Ni, Co, Cr, Cd, and carbohydrate) of different marine seaweeds (red, green, and brown) from the Egyptian Mediterranean Sea coast. The results showed that green seaweeds supplied better calcium sources than the red and brown ones. Also, red and brown seaweeds showed higher averages of Na and K than that in green species and these seaweeds could play an important role in the electrolyte balance in humans.")


displacy.serve(doc, style= "ent") #dep style parameter

filename = "cat_line.svg" 
output_path = Path ("/Users/tdubon/Learning/spaCy" + file_name)
output_path.open("w", encoding = "utf-8").write(svg)




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


NameError: name 'Path' is not defined