In [7]:
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install spacy_stanza

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting spacy_stanza
  Downloading spacy_stanza-1.0.4-py3-none-any.whl (9.7 kB)
Collecting stanza<1.7.0,>=1.2.0 (from spacy_stanza)
  Downloading stanza-1.6.1-py3-none-any.whl (881 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m881.2/881.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting emoji (from stanza<1

In [3]:
import spacy
def dependency_parse(sentence):
  nlp = spacy.load("en_core_web_sm")
  doc = nlp(sentence)
  dependencies = []
  for token in doc:
    dependencies.append((token.text, token.dep_, token.head.text))
  return dependencies

sentence = "The quick brown fox jumps over the lazy dog."
dependencies = dependency_parse(sentence)
for dep in dependencies:
  print(dep)

('The', 'det', 'fox')
('quick', 'amod', 'fox')
('brown', 'amod', 'fox')
('fox', 'nsubj', 'jumps')
('jumps', 'ROOT', 'jumps')
('over', 'prep', 'jumps')
('the', 'det', 'dog')
('lazy', 'amod', 'dog')
('dog', 'pobj', 'over')
('.', 'punct', 'jumps')


In [4]:
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

doc = nlp(sentence)
displacy.render(doc, style="dep", jupyter=True)

In [5]:
dependency_rules = {
    'jumps': {'nsubj': 'fox', 'prep': 'over'},
    'fox': {'det': 'The', 'amod': 'quick', 'amod': 'brown'},
    'over': {'det': 'the', 'amod': 'lazy', 'probj': 'dog'},
}

def extract_dependencies(sentence, dependency_rules):
  dependencies = []
  words = sentence.split()
  for i, word in enumerate(words):
    if word in dependency_rules:
      for dep, dep_word in dependency_rules[word].items():
        dependencies.append((dep_word, dep, word))
  return dependencies


dependencies = extract_dependencies(sentence, dependency_rules)

for dep in dependencies:
  print(dep)

('The', 'det', 'fox')
('brown', 'amod', 'fox')
('fox', 'nsubj', 'jumps')
('over', 'prep', 'jumps')
('the', 'det', 'over')
('lazy', 'amod', 'over')
('dog', 'probj', 'over')


<h1><i>Assignment</i></h1>

In [9]:
import spacy_stanza

# Load spaCy with stanza models for English and Hindi
nlp_en = spacy_stanza.load_pipeline("en")
nlp_hi = spacy_stanza.load_pipeline("hi")

def parse_multilingual_sentence(sentence):
    # Detect the language of the sentence
    lang = detect_language(sentence)

    # Parse the sentence based on detected language
    if lang == "en":
        doc = nlp_en(sentence)
    elif lang == "hi":
        doc = nlp_hi(sentence)
    else:
        raise ValueError("Unsupported language")

    # Extract dependency relations
    dependencies = [(token.text, token.dep_, token.head.text) for token in doc]

    return dependencies

def detect_language(text):
    # Simple language detection based on characters
    # This is a naive approach and might not be very accurate
    en_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
    hi_characters = set("ँंःअआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहॠ।॥")

    if any(char in en_characters for char in text):
        return "en"
    elif any(char in hi_characters for char in text):
        return "hi"
    else:
        raise ValueError("Language detection failed")

# Example sentences
sentences = [
    "Hello may thik hoon.",
    "I ate my food now.",
    "मैं सो जाएगा"
]

for sentence in sentences:
    print("Sentence:", sentence)
    print("Dependencies:")
    print(parse_multilingual_sentence(sentence))
    print()


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.6.0/models/depparse/combined_charlm.pt:   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.6.0/models/sentiment/sstplus.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.6.0/models/ner/ontonotes_charlm.pt:   0%| …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.6.0/models/pretrain/fasttextcrawl.pt:   0%…

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.6.0/models/backward_charlm/1billion.pt:   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.6.0/models/forward_charlm/1billion.pt:   0…

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.6.0/models/pretrain/conll17.pt:   0%|     …

INFO:stanza:Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: constituency
INFO:stanza:Loading: depparse
INFO:stanza:Loading: sentiment
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-hi/resolve/v1.6.0/models/tokenize/hdtb.pt:   0%|        …

Downloading https://huggingface.co/stanfordnlp/stanza-hi/resolve/v1.6.0/models/pos/hdtb_charlm.pt:   0%|      …

Downloading https://huggingface.co/stanfordnlp/stanza-hi/resolve/v1.6.0/models/lemma/hdtb_nocharlm.pt:   0%|  …

Downloading https://huggingface.co/stanfordnlp/stanza-hi/resolve/v1.6.0/models/depparse/hdtb_charlm.pt:   0%| …

Downloading https://huggingface.co/stanfordnlp/stanza-hi/resolve/v1.6.0/models/forward_charlm/oscar.pt:   0%| …

Downloading https://huggingface.co/stanfordnlp/stanza-hi/resolve/v1.6.0/models/backward_charlm/oscar.pt:   0%|…

Downloading https://huggingface.co/stanfordnlp/stanza-hi/resolve/v1.6.0/models/pretrain/conll17.pt:   0%|     …

INFO:stanza:Loading these models for language: hi (Hindi):
| Processor | Package       |
-----------------------------
| tokenize  | hdtb          |
| pos       | hdtb_charlm   |
| lemma     | hdtb_nocharlm |
| depparse  | hdtb_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


Sentence: Hello may thik hoon.
Dependencies:
[('Hello', 'nsubj', 'may'), ('may', 'root', 'may'), ('thik', 'discourse', 'hoon'), ('hoon', 'vocative', 'may'), ('.', 'punct', 'may')]

Sentence: I ate my food now.
Dependencies:
[('I', 'nsubj', 'ate'), ('ate', 'root', 'ate'), ('my', 'nmod:poss', 'food'), ('food', 'obj', 'ate'), ('now', 'advmod', 'ate'), ('.', 'punct', 'ate')]

Sentence: मैं सो जाएगा
Dependencies:
[('मैं', 'nsubj', 'सो'), ('सो', 'root', 'सो'), ('जाएगा', 'aux', 'सो')]



In [10]:
sentence = 'Hello mai theek nahi hoon.I have been studying my entire day. Now, मैं अब और पढ़ाई नहीं कर सकता.'
dependency_rules = {
    'Hello': {},
    'mai': {'nsubj': 'hoon'},
    'theek': {'amod': 'mai'},
    'nahi': {'advmod': 'hoon'},
    'hoon': {'nsubj': 'mai', 'advmod': 'nahi', 'ROOT': True},
    'I': {'nsubj': 'studying'},
    'have': {'aux': 'been'},
    'been': {'aux': 'studying'},
    'studying': {'nsubj': 'I', 'dobj': 'day', 'poss': 'my', 'aux': 'been'},
    'my': {'poss': 'day'},
    'entire': {'amod': 'day'},
    'day': {'dobj': 'studying', 'poss': 'my', 'amod': 'entire'},
    'Now,': {},
    'मैं': {'nsubj': 'सकता'},
    'अब': {},
    'और': {},
    'पढ़ाई': {'dobj': 'कर', 'amod': 'और'},
    'नहीं': {'advmod': 'सकता'},
    'कर': {'dobj': 'पढ़ाई'},
    'सकता': {'aux': 'नहीं', 'nsubj': 'मैं', 'ROOT': True},
    '.': {},
}


def extract_dependencies(sentence,dependency_rules):
    dependencies = []
    words = sentence.split()
    for i , word in enumerate(words):
        if word in dependency_rules:
            for dep,dep_word in dependency_rules[word].items():
                dependencies.append((dep_word,dep,word))
    return dependencies

dependencies = extract_dependencies(sentence,dependency_rules)
for dep in dependencies:
    print(dep)


nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)
displacy.render(doc, style = "dep", jupyter = True)

('hoon', 'nsubj', 'mai')
('mai', 'amod', 'theek')
('hoon', 'advmod', 'nahi')
('been', 'aux', 'have')
('studying', 'aux', 'been')
('I', 'nsubj', 'studying')
('day', 'dobj', 'studying')
('my', 'poss', 'studying')
('been', 'aux', 'studying')
('day', 'poss', 'my')
('day', 'amod', 'entire')
('सकता', 'nsubj', 'मैं')
('कर', 'dobj', 'पढ़ाई')
('और', 'amod', 'पढ़ाई')
('सकता', 'advmod', 'नहीं')
('पढ़ाई', 'dobj', 'कर')
