# Create subtrees as shingle sets

In [1]:
import sys
# Add the parent directory to the module search path. Presumably this lets the
# notebook import the local `treesimi` package without installing it -- TODO confirm.
sys.path.append("..")

In [2]:
import conllu
import treesimi as ts
import copy
import json

## Load Dataset

In [3]:
%%capture
!mkdir "../data"
!wget -O "../data/de_hdt-ud-dev.conllu" "https://raw.githubusercontent.com/UniversalDependencies/UD_German-HDT/master/de_hdt-ud-dev.conllu"

In [4]:
# Read the CoNLL-U file explicitly as UTF-8 (the platform default encoding may
# differ) and close the file handle as soon as the content has been parsed.
with open("../data/de_hdt-ud-dev.conllu", encoding="utf-8") as fp:
    dat = conllu.parse(fp.read())
# Number of parsed sentences
len(dat)

18434

## Convert Adjacency List to Nested Set Table

In [5]:
# Adjacency list model: one (id, head, deprel) triple per token of sentence j
j = 1
fields = ('id', 'head', 'deprel')
adjac = [tuple(token[name] for name in fields) for token in dat[j]]
adjac

[(1, 11, 'advcl'),
 (2, 3, 'case'),
 (3, 1, 'obl'),
 (4, 7, 'case'),
 (5, 7, 'det'),
 (6, 7, 'amod'),
 (7, 3, 'nmod'),
 (8, 9, 'case'),
 (9, 7, 'nmod'),
 (10, 1, 'punct'),
 (11, 0, 'root'),
 (12, 13, 'det'),
 (13, 11, 'nsubj'),
 (14, 15, 'det'),
 (15, 13, 'nmod'),
 (16, 17, 'det'),
 (17, 11, 'obj'),
 (18, 11, 'compound:prt'),
 (19, 11, 'punct')]

In [6]:
# Convert the adjacency list to the nested set model, then strip the node IDs
nested = ts.remove_node_ids(
    ts.adjac_to_nested_with_attr(adjac)
)
nested

[[1, 38, 0, 'root'],
 [2, 21, 1, 'advcl'],
 [3, 18, 2, 'obl'],
 [4, 5, 3, 'case'],
 [6, 17, 3, 'nmod'],
 [7, 8, 4, 'case'],
 [9, 10, 4, 'det'],
 [11, 12, 4, 'amod'],
 [13, 16, 4, 'nmod'],
 [14, 15, 5, 'case'],
 [19, 20, 2, 'punct'],
 [22, 29, 1, 'nsubj'],
 [23, 24, 2, 'det'],
 [25, 28, 2, 'nmod'],
 [26, 27, 3, 'det'],
 [30, 33, 1, 'obj'],
 [31, 32, 2, 'det'],
 [34, 35, 1, 'compound:prt'],
 [36, 37, 1, 'punct']]

## Extract Subtrees
The code below is wrapped in the `treesimi.shingleset` function.

In [7]:
def expand(forest, transform):
    """Extend `forest` with `transform(tree)` for each tree it currently holds,
    then return the result of passing it through `ts.unique_trees`."""
    # Iterate over a deep copy because `forest` itself grows inside the loop
    for tree in copy.deepcopy(forest):
        forest.extend(transform(tree))
    return ts.unique_trees(forest)


# Extract full subtrees and add the original tree as well
trees = ts.extract_subtrees(nested)
trees.append(nested)
trees = ts.unique_trees(trees)
print(f"#num subtrees: {len(trees)}")

# Truncate leaves
trees = expand(trees, ts.trunc_leaves)
print(f"#num subtrees: {len(trees)}")

# Drop nodes
trees = expand(trees, ts.drop_nodes)
print(f"#num subtrees: {len(trees)}")

# Mask data attributes
trees = expand(trees, lambda tree: ts.replace_attr(tree, placeholder='[MASK]'))
print(f"#num subtrees: {len(trees)}")

#num subtrees: 13
#num subtrees: 24
#num subtrees: 118
#num subtrees: 1204


## Create Hashable Shingles from Subtrees

In [8]:
# Serialise every tree to a JSON string, then encode it as UTF-8 bytes
json_texts = (json.dumps(tree) for tree in trees)
stringified = [text.encode('utf-8') for text in json_texts]
len(stringified)

1204