In [None]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../title_maker_pro")
sys.path.append("../website")

In [2]:
import re
import stanza
from collections import Counter
import itertools
import datasets
import pickle
import torch
from transformers import AutoModelWithLMHead, AutoTokenizer
stanza.download('en')  

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 116kB [00:00, 10.6MB/s]                    
2020-05-05 00:50:35 INFO: Downloading default packages for language: en (English)...
2020-05-05 00:50:36 INFO: File exists: /home/tdimson/stanza_resources/en/default.zip.
2020-05-05 00:50:39 INFO: Finished downloading models and saved to /home/tdimson/stanza_resources.


In [3]:
def print_words(words, f):
    """Write a human-readable listing of word entries to the stream ``f``.

    For every entry this emits:
      * a header line: the word, followed by ``/pos/`` and ``[topic]`` when
        those attributes are truthy, joined by single spaces;
      * a tab-indented definition line, suffixed with `` |n| `` when the
        entry's ``example`` is ``None``;
      * a tab-indented, double-quoted example line (only when ``example`` is
        truthy), suffixed with `` |e|`` when ``from_example_expansion`` is
        truthy;
      * an empty line separating it from the next entry.

    Args:
        words: iterable of objects exposing ``word``, ``pos``, ``topic``,
            ``definition``, ``example`` and ``from_example_expansion``.
        f: writable text stream, handed to ``print`` as ``file=``.
    """
    for entry in words:
        # Optional header tags are kept as 0- or 1-element lists so they can
        # be spliced straight into the join below.
        pos_tag = [f"/{entry.pos}/"] if entry.pos else []
        topic_tag = [f"[{entry.topic}]"] if entry.topic else []
        print(" ".join([entry.word, *pos_tag, *topic_tag]), file=f)

        definition = entry.definition
        no_example_marker = " |n| " if entry.example is None else ""
        print(f"\t{definition}{no_example_marker}", file=f)

        if entry.example:
            expansion_marker = " |e|" if entry.from_example_expansion else ""
            print(f"\t\"{entry.example}\"{expansion_marker}", file=f)

        print("", file=f)

In [4]:
# Stanza English pipeline requesting the tokenize, mwt and pos processors;
# go() passes it to generate_words as `example_match_pos_pipeline`.
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos')
# Pretrained "gpt2" tokenizer, extended with the project's special tokens.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens(datasets.SpecialTokens.special_tokens_dict())
# Blacklist loaded from a local build artifact.
# NOTE(review): the .pickle extension suggests this unpickles the file — only
# load it from a trusted source; confirm against datasets.Blacklist.load.
blacklist = datasets.Blacklist.load("../build/blacklist.pickle")
# NOTE(review): absolute, machine-specific checkpoint path and a hard-coded
# "cuda:0" device — this line will not run unchanged on another machine or
# without a GPU.
model = AutoModelWithLMHead.from_pretrained("/mnt/evo/projects/title-maker-pro/models/en_dictionary_parsed_lr_00001_creativity/checkpoint-120000/").to("cuda:0")

2020-05-05 00:50:39 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |

2020-05-05 00:50:39 INFO: Use device: gpu
2020-05-05 00:50:39 INFO: Loading: tokenize
2020-05-05 00:50:40 INFO: Loading: pos
2020-05-05 00:50:41 INFO: Done loading processors!


In [None]:
# Benchmark evaluate_creativity with the loaded tokenizer, model and blacklist.
# NOTE(review): the meaning of the positional 100 and 50 is defined in
# datasets.ParsedDictionaryDefinitionDataset — confirm there before tuning.
%timeit datasets.ParsedDictionaryDefinitionDataset.evaluate_creativity(tokenizer, model, blacklist, 100, 50, max_length=512)

In [5]:
def no_weird(w):
    """Return True when a generated entry looks clean enough to keep.

    An entry is rejected when any of these holds:
      * the word is missing/empty or ends with "-";
      * "<|" occurs in the definition, the example or the part of speech
        (presumably leaked special-token markup — confirm against the
        project's special tokens);
      * the word has more than three whitespace-separated tokens;
      * the definition or the example has fewer than three tokens.

    Missing (``None``) or empty fields lead to rejection instead of an
    ``IndexError``/``TypeError``/``AttributeError``, so a single malformed
    candidate cannot raise out of the filter.

    Args:
        w: object exposing ``word``, ``definition``, ``example`` and ``pos``.

    Returns:
        bool: True to keep the entry, False to discard it.
    """
    # Normalise absent fields to "" so every check below is a plain string op.
    word = w.word or ""
    definition = w.definition or ""
    example = w.example or ""
    pos = w.pos or ""

    if not word or word.endswith("-"):
        return False
    if any("<|" in field for field in (definition, example, pos)):
        return False
    return (
        len(word.split()) <= 3
        and len(definition.split()) >= 3
        and len(example.split()) >= 3
    )
         
def go(**kwargs):
    """Run word generation with this notebook's fixed sampling configuration.

    Delegates to ``datasets.ParsedDictionaryDefinitionDataset.generate_words``
    using the notebook-level ``tokenizer``, ``model``, ``blacklist`` and
    ``nlp`` objects (looked up when the function is called) and ``no_weird``
    as the user filter.

    Args:
        **kwargs: extra keyword arguments forwarded unchanged to
            ``generate_words`` (the notebook uses ``use_custom_generate=True``).

    Returns:
        Whatever ``generate_words`` returns; the notebook unpacks it as
        ``(words, stats)``.
    """
    # Sampling options for the main generation pass.
    sampling_args = {
        "top_k": 200,
        "num_return_sequences": 50,
        "max_length": 375,
        "do_sample": True,
    }
    # Overrides passed for example expansion (expansion itself is switched
    # off below via do_example_expansion=False).
    expansion_overrides = {
        "top_k": 50,
        "num_return_sequences": 25,
        "do_sample": True,
    }
    generate = datasets.ParsedDictionaryDefinitionDataset.generate_words
    return generate(
        tokenizer,
        model,
        num=20000,
        max_iterations=1000000,
        blacklist=blacklist,
        do_example_expansion=False,
        example_match_pos_pipeline=nlp,
        generation_args=sampling_args,
        expansion_generation_overrides=expansion_overrides,
        num_expansion_candidates=25,
        filter_proper_nouns=True,
        user_filter=no_weird,
        **kwargs
    )

# words, stats = go()
# print(stats)
# print()
# print_words(words, sys.stdout)

In [6]:
# Full generation run (go() requests num=20000). The recorded progress bar for
# this cell shows roughly 2h40m, so avoid re-running it casually.
words, stats = go(use_custom_generate=True)

100%|██████████| 20000/20000 [2:39:57<00:00,  1.63it/s]

In [None]:
# Spot check: ask the blacklist whether it contains "foolage".
blacklist.contains("foolage")

In [None]:
# Number of items in the blacklist's `blacklist_set` attribute.
len(blacklist.blacklist_set)

In [7]:
# Preview the first 50 generated entries on stdout.
print_words(words[:50], sys.stdout)

carolier /noun/ [British]
	a kind of fast-flowing lace dress suitable for dancing
	"a stylish, elegant carolier"

arbitrariff /noun/
	a lawyer involved in the arbitrariff's work
	"the arbitrariff decided that the law should be binding"

penarium /noun/
	a central compartment of an insect, reptile, or other living organism under the wings at the base of the thorax
	"the wings were enlarged, forcing the embryo to fly into penarium"

undergradualization /noun/
	the slow development of a technique, resulting from an abnormal development
	"excessive undergradualization of the techniques"

unreimbursed /adjective/ [archaic]
	(of a book or index) having a story or descriptive narrative that is not easily recounted
	"no readers loved stories without unreimbursed evidence"

inchester /noun/
	(in an architecture school) a chapel with rectangular, molding-like moldings made of varnish, typically used for religious or decorative decoration
	"a wall set was built in the Inchester of Joochwood"

bez

In [None]:
import math
from transformers import activations
import transformers

def gelu_new(x):
    """Tanh-based GELU approximation applied elementwise to tensor ``x``.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.

    Args:
        x: a torch tensor.

    Returns:
        The activated values as a torch tensor.
    """
    scale = math.sqrt(2 / math.pi)
    inner = x + 0.044715 * torch.pow(x, 3.0)
    return 0.5 * x * (1 + torch.tanh(scale * inner))

# Point transformers' 'gelu_new' activation entry at the gelu_new function
# defined earlier in this cell.
activations.ACT2FN['gelu_new'] = gelu_new

# NOTE(review): this rebinds the notebook-level `model` (previously the CUDA
# checkpoint) to a CPU model. go() looks `model` up at call time, so any go()
# call made after this cell uses the CPU model instead.
model = AutoModelWithLMHead.from_pretrained("../build/forward-dictionary-model-v1").to("cpu")
# Dynamic qint8 quantization over the listed module types.
# NOTE(review): `quantized_model` is not referenced by any other visible cell.
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear, torch.nn.Embedding, transformers.modeling_utils.Conv1D}, dtype=torch.qint8
)

In [None]:
# NOTE(review): `go2` is not defined in any visible cell; on a fresh kernel
# this raises NameError unless it is defined elsewhere — confirm or remove.
a = go2()

In [None]:
# Decode the first element of `a` with the tokenizer and print the text.
print(tokenizer.decode(a[0]))

In [None]:
# Benchmark go2().
# NOTE(review): `go2` is not defined in any visible cell — confirm it exists.
%timeit go2()

In [10]:
from words import WordIndex, Word
def clean_example(w, example):
    """Normalise the casing of ``w`` wherever it occurs in ``example``.

    Every case-insensitive occurrence of the literal string ``w`` in
    ``example`` is replaced by ``w`` exactly as given. Matching is on the raw
    substring, not on word boundaries.

    Args:
        w: the word, used both as the (escaped) search pattern and as the
            replacement text.
        example: the example sentence, or ``None``.

    Returns:
        The rewritten example, or ``None`` when ``example`` is ``None``.
    """
    if example is None:
        # Entries without an example are passed through instead of raising.
        return None
    # A callable replacement is inserted verbatim. Passing `w` directly would
    # treat it as a replacement template, so a backslash in the word would be
    # interpreted as an escape/group reference (wrong output or re.error).
    return re.sub(re.escape(w), lambda _match: w, example, flags=re.IGNORECASE)

In [11]:
# NOTE(review): mid-notebook import; consider moving it to the import cell.
from hyphen import Hyphenator
# en_US hyphenator, used below to split each word into syllables.
h_en = Hyphenator('en_US')

# Convert every generated entry in `words` into a website `Word`, normalising
# the word's casing inside its example and attaching its syllable split.
wi = WordIndex(
    [
        Word(
            word=w.word,
            definition=w.definition,
            pos=w.pos,
            topic=w.topic,
            example=clean_example(w.word, w.example),
            syllables=h_en.syllables(w.word),
        ) for w in words
        
    ]
)
# Dump the index to words2.json under the website data directory.
wi.dump("../website/data/words2.json")

In [None]:

# NOTE(review): the result of this call is discarded (it is not the cell's
# last expression) — looks like a leftover probe; confirm and remove.
h_en.syllables('fancccwe')
# Load the existing words.json index and rebuild each entry with the example
# casing normalised and the syllables recomputed.
# NOTE(review): this repeats the Word(...) construction from the previous
# cell; a shared helper would keep the two in sync.
wi2 = WordIndex.load("../website/data/words.json")
wi_p = WordIndex(
    [
        Word(
            word=w.word,
            definition=w.definition,
            pos=w.pos,
            topic=w.topic,
            example=clean_example(w.word, w.example),
            syllables=h_en.syllables(w.word)
        )
        for w in wi2.words
    ]
)

In [None]:
# Dump the rebuilt index to words.json — the same path `wi2` was loaded from,
# so this presumably overwrites the source file in place; keep a backup.
wi_p.dump("../website/data/words.json")