# **Intro**

In [3]:
!pip install -U spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_md")


In [5]:
text = "The quick brown fox jumps over the lazy dog."
doc = nlp(text)

for token in doc:
    print(token.text, token.dep_, token.head.text)

displacy.serve(doc, style="dep")

The det fox
quick amod fox
brown amod fox
fox nsubj jumps
jumps ROOT jumps
over prep jumps
the det dog
lazy amod dog
dog pobj over
. punct jumps





Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [6]:
text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)

print(f"{'TEXT':<12} | {'DEP':<10} | {'HEAD TEXT':<12} | {'HEAD POS':<8} | CHILDREN")
print("-" * 70)

for token in doc:
    children = [child.text for child in token.children]
    print(f"{token.text:<12} | {token.dep_:<10} | {token.head.text:<12} | {token.head.pos_:<8} | {children}")


TEXT         | DEP        | HEAD TEXT    | HEAD POS | CHILDREN
----------------------------------------------------------------------
Apple        | nsubj      | looking      | VERB     | []
is           | aux        | looking      | VERB     | []
looking      | ROOT       | looking      | VERB     | ['Apple', 'is', 'at']
at           | prep       | looking      | VERB     | ['buying']
buying       | pcomp      | at           | ADP      | ['startup']
U.K.         | compound   | startup      | NOUN     | []
startup      | dobj       | buying       | VERB     | ['U.K.', 'for']
for          | prep       | startup      | NOUN     | ['billion']
$            | quantmod   | billion      | NUM      | []
1            | compound   | billion      | NUM      | []
billion      | pobj       | for          | ADP      | ['$', '1']


In [7]:
text = "The cat chased the mouse and the dog watched them."
doc = nlp(text)

for token in doc:
    if token.pos_ == "VERB":
        verb = token.text
        subject = ""
        obj = ""

        for child in token.children:
            if child.dep_ == "nsubj":
                subject = child.text
            if child.dep_ == "dobj":
                obj = child.text

        if subject and obj:
            print(f"Found Triplet: ({subject}, {verb}, {obj})")


Found Triplet: (cat, chased, mouse)
Found Triplet: (dog, watched, them)


In [8]:
text = "The big, fluffy white cat is sleeping on the warm mat."
doc = nlp(text)

for token in doc:
    if token.pos_ == "NOUN":
        adjectives = []
        for child in token.children:
            if child.dep_ == "amod":
                adjectives.append(child.text)

        if adjectives:
            print(f"Danh từ '{token.text}' được bổ nghĩa bởi các tính từ: {adjectives}")


Danh từ 'cat' được bổ nghĩa bởi các tính từ: ['big', 'fluffy', 'white']
Danh từ 'mat' được bổ nghĩa bởi các tính từ: ['warm']


# **BT1**

In [10]:
def find_main_verb(doc):
    for token in doc:
        if token.dep_ == "ROOT":
            return token
    return None

doc = nlp("The quick brown fox jumps over the lazy dog.")
print(find_main_verb(doc))


jumps


# **BT2**

In [12]:
def extract_noun_chunks(doc):
    chunks = []
    for token in doc:
        if token.pos_ == "NOUN":
            left_mods = []
            for child in token.children:
                if child.dep_ in ["det", "amod", "compound"]:
                    left_mods.append(child)
            # sắp xếp theo thứ tự xuất hiện trong câu
            chunk_tokens = sorted(left_mods + [token], key=lambda x: x.i)
            chunk_text = " ".join([t.text for t in chunk_tokens])
            chunks.append(chunk_text)
    return chunks

doc = nlp("The big white cat ate the warm food.")
print(extract_noun_chunks(doc))

['The big white cat', 'the warm food']


# **BT3**

In [13]:
def get_path_to_root(token):
    path = [token]
    while token.dep_ != "ROOT":
        token = token.head
        path.append(token)
    return path

doc = nlp("The cat chased the mouse.")
tok = doc[1]   # cat
path = get_path_to_root(tok)
print([t.text for t in path])


['cat', 'chased']
