# Jaccard Similarities between Dependency Trees (using spaCy)
See https://github.com/satzbeleg/treesimi



In [1]:
%%capture
!pip install "treesimi>=0.2.0" datasketch

In [2]:
%%capture
!pip install -U pip setuptools wheel 
!pip install datasketch
!pip install -U spacy
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

In [3]:
# Standard library
import json

# Third-party
import datasketch
import spacy
import treesimi

# Load the German spaCy pipeline with the listed components switched off.
# NOTE(review): presumably these components are not needed for dependency
# parsing, which is what the notebook title says it uses -- confirm.
model = spacy.load(
    "de_core_news_sm",
    disable=[
        "morphologizer",
        "attribute_ruler",
        "ner",
        "lemmatizer",
    ],
)



In [4]:
# German example sentences whose dependency trees are compared below.
examples = [
    # "The child carried the wooden block around the living room for half the afternoon."
    "Das Kind trug den Holzklotz den halben Nachmittag im Wohnzimmer herum.",
    # "The mouse ran around the garage with the piece of cheese all winter."
    "Die Maus lief mit dem Käsestück den ganzen Winter in der Garage herum.",
    # "The wooden block was carried around the living room by the child in the afternoon."
    "Der Holzklotz wurde vom Kind nachmittags im Wohnzimmer herumgetragen.",
]

In [5]:
# Shingling options, passed as `cfg=` to treesimi.examples_to_shingles.
# NOTE(review): the exact meaning of each flag is defined by treesimi and is
# not visible in this notebook -- see the treesimi documentation.
cfg = dict(
    use_trunc_leaves=True,
    use_drop_nodes=False,
    use_replace_attr=False,
)

In [7]:
# Number of permutations used for every MinHash signature. All signatures
# must share the same value, otherwise they cannot be compared.
NUM_PERM = 256

# Parse the example sentences and shingle their dependency trees.
# NOTE(review): each shingle is passed unchanged to MinHash.update(), which
# expects bytes -- presumably examples_to_shingles already returns encoded
# shingles; confirm against the treesimi documentation.
all_stringified = treesimi.examples_to_shingles(examples, model, cfg=cfg)

# Build one MinHash signature per example from its shingles.
minhash = []
for stringified in all_stringified:
    m = datasketch.MinHash(num_perm=NUM_PERM)
    for s in stringified:
        m.update(s)
    minhash.append(m)

# Estimated Jaccard similarity for every unordered pair of examples (i < j).
# Indexing the pairs directly avoids walking the full n x n grid and the
# unused, shadowed loop variable of the nested enumerate() version.
for i in range(len(minhash)):
    for j in range(i + 1, len(minhash)):
        simi = minhash[i].jaccard(minhash[j])
        print(f"({i}, {j}): {simi}")

# Alternative: always compare against the first example sentence only.
# for i in range(len(minhash)):
#     print(minhash[0].jaccard(minhash[i]))

(0, 1): 0.33203125
(0, 2): 0.19921875
(1, 2): 0.16796875
