In [1]:
import sys
# Add the parent directory to the module search path
# (presumably so the kshingle package imported below resolves to the
# repository checkout rather than an installed copy -- TODO confirm).
sys.path.append("..")

In [2]:
import itertools
import os
from collections import Counter

import kshingle as ks

### Download and read text data

In [3]:
%%capture
!wget -nc --no-check-certificate https://pcai056.informatik.uni-leipzig.de/downloads/corpora/deu_newscrawl-public_2019_10K.tar.gz
!tar -xvf deu_newscrawl-public_2019_10K.tar.gz
!mv deu_newscrawl-public_2019_10K/deu_newscrawl-public_2019_10K-sentences.txt sentences.txt
!rm -rf deu_newscrawl-public_2019_10K
!rm deu_newscrawl-public_2019_10K.tar.gz

In [4]:
# Read the corpus file. The encoding is given explicitly because the default
# depends on the platform locale, and this German text contains non-ASCII
# characters that would otherwise be mis-decoded (or raise) on non-UTF-8 locales.
with open("sentences.txt", "r", encoding="utf-8") as fp:
    dat = fp.readlines()

# `text`: the first 1000 lines joined by single spaces. For each line the
# second tab-separated field is used, cut at the first newline.
dat = dat[:1000]
text = " ".join(s.split('\t')[1].split('\n')[0] for s in dat)

# `text2`: the same construction restricted to the first 50 lines
# (the smaller sample used for shingle selection further down).
dat = dat[:50]
text2 = " ".join(s.split('\t')[1].split('\n')[0] for s in dat)

### Select shingles with CEWS

In [5]:
%%time
# Accumulate occurrence counts of every shingle found in the sample text.
db = Counter()
for doc in (text2,):
    # shingle sequences for this document with k=5 (no wildcards);
    # original note: bump k up to 8
    shingles = ks.shingleseqs_k(doc, k=5)
    # flatten the per-length sequences and tally each shingle
    db.update(itertools.chain.from_iterable(shingles))

# hand a plain dict to the later steps and report the number of unique shingles
db = dict(db)
print(len(db))

8638
CPU times: user 21.2 ms, sys: 3.93 ms, total: 25.2 ms
Wall time: 24.3 ms


In [6]:
%%time
# Select shingles from the counted database with CEWS.
# This is the slow step of the notebook: the recorded run took about
# 3 minutes of wall time.
memo = ks.cews(db, max_wildcards=1, min_samples_leaf=10, threshold=0.9)


CPU times: user 2min 48s, sys: 1.1 s, total: 2min 49s
Wall time: 3min 1s


### Encode with hashing
Convert selected shingles to hashes

In [7]:
%%time
# Convert the selected shingles into hash lookups.
HASHES = ks.shingles_to_hashes(memo, wildcard='\uFFFF')
# Total number of hashes over all entries of HASHES; passed as `unkid` to the
# encoder below (presumably the ID assigned to unmatched shingles -- confirm).
unkid = sum(map(len, HASHES.values()))

CPU times: user 8.61 ms, sys: 1.23 ms, total: 9.83 ms
Wall time: 9.23 ms


Generate all shingles

In [8]:
%%time
# Build the hashed shingle sequences (k=5) for `text`, the larger sample made
# from the first 1000 lines of sentences.txt; the result feeds
# ks.encode_multi_match in the next cell.
multiseq = ks.shingleseqs_hashes(text, k=5)

CPU times: user 34.1 s, sys: 747 ms, total: 34.8 s
Wall time: 35.6 s


Encode data

In [9]:
%%time
# Encode the hashed shingle sequences against the selected hashes.
allseqs = ks.encode_multi_match(
    multiseq,
    HASHES=HASHES,
    unkid=unkid,
    num_matches=3,
)

CPU times: user 37.2 s, sys: 428 ms, total: 37.6 s
Wall time: 39.8 s


Wall times (generate all shingles + encode data) for each variant tried:

- strings: 10.1 s + 33.8 s
- digest: 34.7 s + 32.1 s
- hexdigest: 35.6 s + 39.8 s

### Encode with patterns (deprecated)

In [10]:
%%time
# Convert the selected shingles into patterns (deprecated encoding path).
PATTERNS = ks.shingles_to_patterns(memo, wildcard='\uFFFF')
# NOTE: this rebinds `unkid`, which the hashing section above also sets.
# After running this cell, re-run the hashing cell before re-running
# ks.encode_multi_match, otherwise it will see the pattern-based value.
unkid = sum(map(len, PATTERNS.values()))

CPU times: user 90 ms, sys: 5.86 ms, total: 95.9 ms
Wall time: 123 ms


  """Entry point for launching an IPython kernel.


Encode data

In [11]:
%%time
# Encode the one-document corpus against the patterns (deprecated path).
encoded = ks.encode_multi_match_corpus(
    [text],
    k=5,
    PATTERNS=PATTERNS,
    unkid=unkid,
    num_matches=3,
)

  


CPU times: user 32.7 s, sys: 172 ms, total: 32.9 s
Wall time: 33 s
