# Utility Functions for Dependency Parsers

In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Make the parent directory importable (presumably where the local
# `treesimi` package lives -- confirm against the repository layout).
# Guarded so that re-running this cell does not append the same entry
# to `sys.path` again.
if ".." not in sys.path:
    sys.path.append("..")

In [3]:
%%capture
!pip install datasketch
!pip install -U spacy
!python -m spacy download de_dep_news_trf
!pip install stanza
!python -c "import stanza; stanza.download(lang='de')"
!pip install trankit
!python -c "import trankit; trankit.Pipeline(lang='german', gpu=False, cache_dir='../cache')"

## Toy Data

In [4]:
# Toy corpus: each list entry is one text example and may hold one or more
# sentences. Long literals are split via adjacent-string concatenation; the
# resulting strings are unchanged.
examples = [
    (
        "Als Ada Lovelace auf einem Ball den Mathematiker Charles Babbage traf, "
        "der sie einlud, die von ihm erfundene „Differenzmaschine“ anzusehen, "
        "war sie hellauf begeistert."
    ),
    (
        "Die Maschine konnte selbstständig addieren und subtrahieren, "
        "doch Ada war klar, dass die Möglichkeiten damit noch lange nicht erschöpft waren."
    ),
    (
        "Sie träumte davon, dass eine solche Maschine eines Tages sogar Musik abspielen könnte, "
        "und ersann so die Idee eines modernen Computers. "
        "1845 legte sie den ersten Algorithmus zur maschinellen Berechnung der Bernoulli-Zahlen vor "
        "und wird daher von vielen als erste Computerprogrammiererin der Welt gefeiert."
    ),
]

## spaCy

In [5]:
# Load a SpaCy model
import de_dep_news_trf
model = de_dep_news_trf.load()

# parse and shingle the dependency trees
import treesimi
%time all_stringified = treesimi.examples_to_shingles(examples, model)

# create MinHashes
import datasketch
minhash = []
for stringified in all_stringified:
    m = datasketch.MinHash(num_perm=256)
    for s in stringified:
        m.update(s)
    minhash.append(m)

# Jaccard similarities
for i, mh in enumerate(minhash):
    for j, mh in enumerate(minhash):
        if i < j:
            simi = minhash[i].jaccard(minhash[j])
            print(i, j, simi)

CPU times: user 773 ms, sys: 69 ms, total: 842 ms
Wall time: 1.12 s
0 1 0.0390625
0 2 0.0546875
1 2 0.03125


## Stanza

In [6]:
# Load a Stanza model
import stanza
model = stanza.Pipeline(
    lang='de', processors='tokenize,mwt,pos,lemma,depparse',
    tokenize_pretokenized=False)

# parse and shingle the dependency trees
import treesimi
%time all_stringified = treesimi.examples_to_shingles(examples, model)

# create MinHashes
import datasketch
minhash = []
for stringified in all_stringified:
    m = datasketch.MinHash(num_perm=256)
    for s in stringified:
        m.update(s)
    minhash.append(m)

# Jaccard similarities
for i, mh in enumerate(minhash):
    for j, mh in enumerate(minhash):
        if i < j:
            simi = minhash[i].jaccard(minhash[j])
            print(i, j, simi)

2021-12-01 22:23:07 INFO: Loading these models for language: de (German):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |

2021-12-01 22:23:07 INFO: Use device: cpu
2021-12-01 22:23:07 INFO: Loading: tokenize
2021-12-01 22:23:07 INFO: Loading: mwt
2021-12-01 22:23:07 INFO: Loading: pos
2021-12-01 22:23:08 INFO: Loading: lemma
2021-12-01 22:23:08 INFO: Loading: depparse
2021-12-01 22:23:09 INFO: Done loading processors!


CPU times: user 1.19 s, sys: 114 ms, total: 1.31 s
Wall time: 1.6 s
0 1 0.0703125
0 2 0.04296875
1 2 0.11328125


## Trankit

In [7]:
# Load a trankit model
import trankit
model = trankit.Pipeline(lang='german', gpu=False, cache_dir='../cache')

# parse and shingle the dependency trees
import treesimi
cfg = {'use_trunc_leaves': True, 'use_drop_nodes': False, 'use_replace_attr': False}
%time all_stringified = treesimi.examples_to_shingles(examples, model, cfg)

# create MinHashes
import datasketch
minhash = []
for stringified in all_stringified:
    m = datasketch.MinHash(num_perm=256)
    for s in stringified:
        m.update(s)
    minhash.append(m)

# Jaccard similarities
for i, mh in enumerate(minhash):
    for j, mh in enumerate(minhash):
        if i < j:
            simi = minhash[i].jaccard(minhash[j])
            print(i, j, simi)

Loading pretrained XLM-Roberta, this may take a while...
Loading tokenizer for german
Loading tagger for german
Loading multi-word expander for german
Loading lemmatizer for german
Loading NER tagger for german
Active language: german
CPU times: user 9.54 s, sys: 1.2 s, total: 10.7 s
Wall time: 10.8 s
0 1 0.0625
0 2 0.04296875
1 2 0.09375
