In [4]:
import datrie
from collections import defaultdict

In [5]:
def invert_counts(d, sort=True, reverse=True):
    i = {}
    for (k, v) in d.items():
        if k[0] == '_' or k == 'trait_names':
            continue
        i.setdefault(v, []).append(k)
    if not sort:
        return i
    else:
        keys = [ k for k in sorted(i.keys(), reverse=reverse) ]
        return [ (key, value) for key in keys for value in i[key] ]

class Stats(defaultdict):
    def __init__(self, typename=int):
        defaultdict.__init__(self, typename)

    def __getattr__(self, name):
        return self.__getitem__(name)

    def __setattr__(self, name, value):
        return self.__setitem__(name, value)

    def keys(self):
        return [
            k for k in defaultdict.keys(self)
                if k[0] != '_' and k != 'trait_names'
        ]

    def _to_dict(self):
        return { k: self[k] for k in self.keys() }

    def _to_json(self):
        return json.dumps(self)

    def _save(self, path):
        with open(path, 'w') as f:
            json.dump(f, self)

    def _invert(self):
        return invert_counts(self)

class KeyedStats(Stats):
    def __init__(self):
        Stats.__init__(self, typename=lambda: Stats())

    def _invert(self):
        return { k: self[k]._invert() for k in self.keys() }

In [6]:
from tqdm import tqdm

In [16]:
import string
import pickle

In [8]:
allowed = (
    string.printable +
    string.punctuation
)

In [9]:
pagecounts_filename = 'en-pagecounts-20150205-150000.txt'

In [10]:
#num_lines = !wc -l en-pagecounts-20150205-150000.txt
num_lines = 823008

In [11]:
freq_titles_trie = datrie.BaseTrie(allowed)
freq_stats = Stats()

In [12]:
duplicates = 0
max_failed = 10
failed = []
def process(trie, stats, filename):
    with open(filename, 'rb') as f:
        for bytes_line in tqdm(f, total=num_lines, leave=True):
            try:
                line = bytes_line.decode('utf-8', 'replace')
                (title, freq) = line.split('\t')
                freq = int(freq)
                stats[freq] += 1
                if title in trie:
                    if trie[title] > 1:
                        duplicates += 1
                        continue
                trie[title] = freq

                lower_title = title.lower()
                if lower_title not in trie:
                    trie[lower_title] = -freq
            except Exception as e:
                failed.append((e, line))
                if len(failed) >= max_failed:
                    raise

In [13]:
process(freq_titles_trie, freq_stats, pagecounts_filename)

|##########| 823008/823008 100% [elapsed: 04:01 left: 00:00, 3403.75 iters/sec]

In [14]:
len(freq_titles_trie)

1629361

In [15]:
freq_titles_trie.save('freq_titles.trie')

In [19]:
t = freq_titles_trie

In [23]:
ft1 = datrie.Trie(allowed)

In [24]:
for c in tqdm(allowed, total=len(allowed), leave=True):
    if c not in t:
        continue
    l = t.items(c)
    l.sort(key=lambda e: e[1], reverse=True)
    ft1[c] = l

|##########| 132/132 100% [elapsed: 00:03 left: 00:00, 40.65 iters/sec]

In [25]:
ft1.save('freq_titles_1c.trie')

In [26]:
ft2 = datrie.Trie(allowed)

In [27]:
for c1 in tqdm(allowed, total=len(allowed), leave=True):
    for c2 in allowed:
        k = c1 + c2
        if k not in t:
            continue
        l = t.items(k)
        l.sort(key=lambda e: e[1], reverse=True)
        ft2[k] = l
        

|##########| 132/132 100% [elapsed: 00:02 left: 00:00, 50.13 iters/sec]

In [28]:
len(ft2)

1350

In [29]:
ft2.save('freq_titles_2c.trie')

In [30]:
ft3 = datrie.Trie(allowed)

In [31]:
for c1 in tqdm(allowed, total=len(allowed), leave=True):
    for c2 in allowed:
        for c3 in allowed:
            k = ''.join((c1, c2, c3))
            if k not in t:
                continue
            l = t.items(k)
            l.sort(key=lambda e: e[1], reverse=True)
            ft3[k] = l


|##########| 132/132 100% [elapsed: 00:02 left: 00:00, 47.69 iters/sec]

In [32]:
ft3.save('freq_titles_3c.trie')

In [33]:
ft4 = datrie.Trie(allowed)
for c1 in tqdm(allowed, total=len(allowed), leave=True):
    for c2 in allowed:
        for c3 in allowed:
            for c4 in allowed:
                k = ''.join((c1, c2, c3, c4))
                if k not in t:
                    continue
                l = t.items(k)
                l.sort(key=lambda e: e[1], reverse=True)
                ft4[k] = l



|##########| 132/132 100% [elapsed: 01:41 left: 00:00,  1.30 iters/sec]

In [34]:
ft4.save('freq_titles_4c.trie')

In [36]:
keys = ft4.keys()

In [37]:
len(keys)

16969

In [38]:
keys.sort()

In [39]:
ft5 = datrie.Trie(allowed)
for c1c2c3c4 in tqdm(keys, total=len(keys), leave=True):
    for c5 in allowed:
        k = c1c2c3c4 + c5
        if k not in t:
            continue
        l = t.items(k)
        l.sort(key=lambda e: e[1], reverse=True)
        ft5[k] = l


|##########| 16969/16969 100% [elapsed: 00:00 left: 00:00, 18033.00 iters/sec]

In [41]:
ft5.save('freq_titles_5c.trie')

In [45]:
keys = ft5.keys()

In [47]:
ft6 = datrie.Trie(allowed)
for c1c2c3c4c5 in tqdm(keys, total=len(keys), leave=True):
    for c6 in allowed:
        k = c1c2c3c4c5 + c6
        if k not in t:
            continue
        l = t.items(k)
        l.sort(key=lambda e: e[1], reverse=True)
        ft6[k] = l


|##########| 4691/4691 100% [elapsed: 00:00 left: 00:00, 20305.96 iters/sec]

In [49]:
ft6.save('freq_titles_6c.trie')

In [50]:
len(ft2)

1350

In [51]:
len(ft1)

72

In [52]:
ft1['!']

[('!!!', 18),
 ('!!Destroy-Oh-Boy!!', 6),
 ('!!!Fuck You!!! and Then Some', 5),
 ('! (disambiguation)', 3),
 ('!', 2),
 ('!!! (album)', 2),
 ('!!!fuck you!!! and then some', -5),
 ('!!destroy-oh-boy!!', -6)]

In [54]:
ft4['IOCP']

KeyError: 'IOCP'