In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
# Put the sibling project directories on the import path.
# NOTE(review): presumably this is where the `datasets` and `words` modules
# used by later cells come from — confirm.
sys.path.append("../title_maker_pro")
sys.path.append("../website")

In [2]:
import re
import stanza
from collections import Counter
import itertools
import datasets
import pickle
import torch
from transformers import AutoModelWithLMHead, AutoTokenizer
# Fetch the default English stanza packages; the captured output below shows
# the download is skipped when the archive already exists locally.
stanza.download('en')  

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 116kB [00:00, 7.96MB/s]                    
2020-05-04 20:47:49 INFO: Downloading default packages for language: en (English)...
2020-05-04 20:47:50 INFO: File exists: /home/tdimson/stanza_resources/en/default.zip.
2020-05-04 20:47:53 INFO: Finished downloading models and saved to /home/tdimson/stanza_resources.


In [11]:
def print_words(words, f):
    """Write a plain-text dictionary entry for every item in ``words`` to ``f``.

    Each entry is laid out as:

    * a header line with the word, then ``/pos/`` and ``[topic]`` when those
      attributes are truthy, joined by single spaces;
    * a tab-indented definition line, suffixed with `` |n| `` when the entry's
      ``example`` is ``None``;
    * when ``example`` is truthy, a tab-indented, double-quoted example line,
      suffixed with `` |e|`` when ``from_example_expansion`` is truthy;
    * one empty line as a separator.

    Args:
        words: iterable of objects exposing ``word``, ``pos``, ``topic``,
            ``definition``, ``example`` and ``from_example_expansion``.
        f: file-like object handed to ``print(..., file=f)``.
    """
    for entry in words:
        # Header: word plus the optional part-of-speech / topic decorations.
        header = [entry.word]
        if entry.pos:
            header.append("/{}/".format(entry.pos))
        if entry.topic:
            header.append("[{}]".format(entry.topic))
        print(" ".join(header), file=f)

        # Definition line; the marker flags entries that have no example at all.
        definition = entry.definition
        no_example_marker = " |n| " if entry.example is None else ""
        print("\t{}{}".format(definition, no_example_marker), file=f)

        # Example line, only for non-empty examples.
        if entry.example:
            expansion_marker = " |e|" if entry.from_example_expansion else ""
            print('\t"{}"{}'.format(entry.example, expansion_marker), file=f)

        print("", file=f)

In [3]:
# English stanza pipeline requesting the tokenize, mwt and pos processors.
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos')
# GPT-2 tokenizer, extended with the special tokens supplied by
# datasets.SpecialTokens. This must happen before any text is encoded.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens(datasets.SpecialTokens.special_tokens_dict())
# NOTE(review): the file is a .pickle — presumably unpickled by Blacklist.load;
# only load it from a trusted source. Confirm.
blacklist = datasets.Blacklist.load("../build/blacklist.pickle")
# NOTE(review): absolute, machine-specific checkpoint path and a hard-coded
# "cuda:0" device — this cell will not run unchanged on another machine.
model = AutoModelWithLMHead.from_pretrained("/mnt/evo/projects/title-maker-pro/models/en_dictionary_parsed_lr_00001_creativity/checkpoint-120000/").to("cuda:0")

2020-05-04 11:52:01 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |

2020-05-04 11:52:01 INFO: Use device: gpu
2020-05-04 11:52:01 INFO: Loading: tokenize
2020-05-04 11:52:03 INFO: Loading: pos
2020-05-04 11:52:03 INFO: Done loading processors!


In [None]:
# Benchmark evaluate_creativity with the tokenizer, model and blacklist loaded above.
# NOTE(review): the meaning of the positional 100 and 50 is not visible here — confirm
# against ParsedDictionaryDefinitionDataset.evaluate_creativity.
%timeit datasets.ParsedDictionaryDefinitionDataset.evaluate_creativity(tokenizer, model, blacklist, 100, 50, max_length=512)

In [13]:
def no_weird(w):
    """Return True when a generated word entry looks clean enough to keep.

    An entry is rejected when any of the following holds:

    * ``w.word`` is empty or ends with ``"-"``;
    * the marker ``"<|"`` appears in ``w.definition``, ``w.example`` or
      ``w.pos``;
    * ``w.word`` consists of more than three whitespace-separated tokens.

    ``definition``, ``example`` and ``pos`` may be ``None`` or empty; a missing
    field cannot contain the marker and therefore never causes rejection.
    (Previously a ``None`` example raised ``TypeError`` and an empty word
    raised ``IndexError``.)

    Args:
        w: object exposing ``word``, ``definition``, ``example`` and ``pos``.

    Returns:
        bool: True to keep the entry, False to filter it out.
    """
    marker = "<|"

    # An empty word or a dangling prefix such as "pre-" is never a usable entry.
    if not w.word or w.word.endswith("-"):
        return False

    # Any text field that still contains the marker is rejected.
    for text in (w.definition, w.example, w.pos):
        if text and marker in text:
            return False

    return len(w.word.split()) <= 3
         

# Generate candidate words with gradient tracking disabled.
# Relies on `tokenizer`, `model`, `blacklist` and `nlp` from the setup cell
# and on `no_weird` defined above.
with torch.no_grad():
    words, stats = datasets.ParsedDictionaryDefinitionDataset.generate_words(
        tokenizer, model,
        num=5,
        max_iterations=10000, 
        blacklist=blacklist, 
        do_example_expansion=False, 
        # Sampling settings for the main generation call.
        generation_args=dict(
            top_k=200,
            num_return_sequences=100,
            max_length=375,
            do_sample=True,
        ),
        # NOTE(review): presumably only consulted when do_example_expansion=True,
        # which is disabled above — confirm against generate_words.
        expansion_generation_overrides=dict(
            top_k=50,
            num_return_sequences=25,
            do_sample=True,
        ),
        num_expansion_candidates=25,
        filter_proper_nouns=True,
        user_filter=no_weird,
        example_match_pos_pipeline=nlp,
    )

# `stats` prints as a single summary line of per-filter counters (see the cell output).
print(stats)
print()
# print_words(words, sys.stdout)


  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:09,  2.49s/it][A
 80%|████████  | 4/5 [00:06<00:02,  2.12s/it][A

iterations=5 | items_considered 1.00@494, failed_match 0.00@0, blacklist_filtered 0.87@431, seen_filtered 0.00@0, proper_noun_filtered 0.03@16, example_filtered 0.09@42, example_expansions 0.00@0, example_expansion_success 0.00@0, example_expansion_hail_maries 0.00@0, user_filtered 0.00@0, returned 0.01@5



In [14]:
# Show at most the first 100 generated entries on stdout.
print_words(words[:100], sys.stdout)

poppery /adjective/
	giving (someone) no pleasure or pleasure
	"the children shrieked poppery laughter"

transclude /verb [with object]/
	cause (a place) to become separated from public or commercial resources; disperse
	"cocaine had been adopted entirely as an illegal drug, even completely Transcluded"

prist /verb/
	strike or lay asunder one's leg down
	"his badly injured leg was to Prist for surgery"

seebout /noun/
	an instance of leaving behind something attractive or elegant
	"a seebout show"

resculpture /noun/
	a series of thin struts forming a roof on which food, drink, or other liquid is preserved
	"a sauce with lots of fresh vineyards and resculpture"



In [None]:
import math
from transformers import activations
import transformers

def gelu_new(x):
    """Apply the tanh-based GELU formula element-wise to tensor ``x``.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    using only ``torch.pow``, ``torch.tanh`` and plain arithmetic.

    Args:
        x: a ``torch.Tensor``.

    Returns:
        A tensor produced by the formula above, one output per input element.
    """
    cubic_term = 0.044715 * torch.pow(x, 3.0)
    tanh_arg = math.sqrt(2 / math.pi) * (x + cubic_term)
    return 0.5 * x * (1 + torch.tanh(tanh_arg))

# Register the gelu_new defined above under the 'gelu_new' key of
# transformers' ACT2FN table, replacing any existing entry for that key.
activations.ACT2FN['gelu_new'] = gelu_new

# NOTE(review): this rebinds the global `model` (a CUDA model in the setup cell)
# to a different checkpoint on CPU; cells run afterwards see this one.
model = AutoModelWithLMHead.from_pretrained("../build/forward-dictionary-model-v1").to("cpu")
# Dynamic qint8 quantization requested for Linear, Embedding and transformers' Conv1D modules.
# NOTE(review): confirm quantize_dynamic actually handles every listed module type with qint8.
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear, torch.nn.Embedding, transformers.modeling_utils.Conv1D}, dtype=torch.qint8
)

In [None]:
# NOTE(review): `go2` is not defined in any cell visible in this notebook;
# on a fresh kernel this presumably raises NameError — confirm or restore its definition.
a = go2()

In [None]:
# Decode the first element of `a` back to text with the tokenizer and print it.
print(tokenizer.decode(a[0]))

In [None]:
# Time repeated calls to go2(). NOTE(review): go2 has no visible definition in this notebook.
%timeit go2()

In [3]:
from words import WordIndex, Word
def clean_example(w, example):
    """Rewrite every case-insensitive occurrence of ``w`` in ``example`` as ``w``.

    The word is matched literally: it is escaped before being used as a
    pattern, so regex metacharacters in the word (``.``, ``+``, ``(`` ...)
    neither raise nor match unrelated text. The replacement is inserted
    verbatim, so backslashes in the word are not treated as template escapes.
    Occurrences inside longer words are rewritten as well.

    Args:
        w: the word whose spelling/casing should be enforced.
        example: the example sentence, or ``None`` when the entry has none.

    Returns:
        The example with all occurrences normalised to ``w``, or ``example``
        unchanged when it is ``None``.
    """
    if example is None:
        return example
    # A callable replacement bypasses re's backslash/group-reference template
    # processing of the replacement text.
    return re.sub(re.escape(w), lambda _match: w, example, flags=re.IGNORECASE)

In [10]:

    
# Re-wrap every generated entry in `words` as a website `Word`, passing its
# example through clean_example, and collect them in a WordIndex.
# Relies on `words` produced by the generation cell.
wi = WordIndex(
    [
        Word(
            word=w.word,
            definition=w.definition,
            pos=w.pos,
            topic=w.topic,
            example=clean_example(w.word, w.example),
        ) for w in words
    ]
)
# Written to words2.json, a different file from the words.json used by later cells.
wi.dump("../website/data/words2.json")

In [6]:
from hyphen import Hyphenator
# en_US hyphenator used to split words into syllables.
h_en = Hyphenator('en_US')
# NOTE(review): the result of this call is discarded and, not being the cell's last
# expression, is never displayed — looks like a leftover probe; confirm before removing.
h_en.syllables('fancccwe')
# Load the existing website index and rebuild every entry with a cleaned example
# and a `syllables` field computed from its word.
wi2 = WordIndex.load("../website/data/words.json")
wi_p = WordIndex(
    [
        Word(
            word=w.word,
            definition=w.definition,
            pos=w.pos,
            topic=w.topic,
            example=clean_example(w.word, w.example),
            syllables=h_en.syllables(w.word)
        )
        for w in wi2.words
    ]
)

In [8]:
# NOTE(review): this writes to the same path that `wi2` was loaded from, so the
# source file is replaced by the rebuilt index.
wi_p.dump("../website/data/words.json")