In [1]:
%load_ext autoreload
%autoreload 2
import sys
from collections import defaultdict
import pandas as pd
import pickle
import re
import itertools 
sys.path.append("../")
sys.path.append("../title_maker_pro")
from title_maker_pro import datasets
from collections import OrderedDict
import torch
from transformers import AutoModelWithLMHead, AutoTokenizer
import copy

In [None]:
dataset_path = "/mnt/evo/projects/title-maker-pro/data/urban_dictionary_words.pickle"
with open(dataset_path, 'rb') as f:
    dataset = pickle.load(f)

In [None]:
blacklist = datasets.Blacklist.load("/mnt/evo/projects/title-maker-pro/models/blacklist.pickle")

In [None]:
pd_dataset = pd.DataFrame(
    (
        (d.word, d.meaning, d.examples[0], d.upvotes, d.downvotes, d.creation_epoch) 
        for d in itertools.chain.from_iterable(e.definitions for e in dataset.values())
    ),
    columns=["word", "meaning", "example", "upvotes", "downvotes", "creation_epoch"]
)

In [None]:
def cut(original, f, name):
    n = original[f]
    print(f"{name} cut by {100 * len(n) / (len(original)):.2f}% ({len(original)} -> {len(n)})")
    return n

In [None]:
t = pd_dataset.copy()
t = cut(t, ~(pd_dataset["word"].apply(blacklist.contains)), name="blacklist")
t = cut(t, ((pd_dataset["example"].str.len() + pd_dataset["meaning"].str.len() + pd_dataset["word"].str.len()) < 250), name="length")
t = cut(t, (pd_dataset["upvotes"] >= 4), name="upvotes")

In [None]:
cleaned_dataset = OrderedDict()
i = 0
num_defns = 0
for k, ud_word in dataset.items():
    good_defns = []
    for d in ud_word.definitions:
        if i in valid_indexes:
            good_defns.append(d)
            num_defns += 1
        i += 1
    
    if good_defns:
        new = copy.deepcopy(ud_word)
        new.definitions = good_defns
        cleaned_dataset[k] = new

In [None]:
cleaned_dataset_path = "/mnt/evo/projects/title-maker-pro/data/urban_dictionary_250_cleaned.pickle"
with open(cleaned_dataset_path, "wb") as f:
    pickle.dump(cleaned_dataset, f, pickle.HIGHEST_PROTOCOL)

In [2]:
# nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos', use)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens(datasets.SpecialTokens.special_tokens_dict())
blacklist = datasets.Blacklist.load("/mnt/evo/projects/title-maker-pro/models/blacklist_urban_dictionary.pickle")
model = AutoModelWithLMHead.from_pretrained("/mnt/evo/projects/title-maker-pro/models/urban_dictionary_250_cleaned_lr_00005_b9_seed4/checkpoint-140000").to("cuda:0")

In [3]:
words, stats = datasets.UrbanDictionaryDataset.generate_words(
    tokenizer, model,
    num=10000,
    max_iterations=500, 
    blacklist=blacklist, 
    generation_args=dict(
        top_k=200,
        num_return_sequences=250,
        max_length=250,
        do_sample=True,
    ),
    dedupe_titles=True,
    # filter_proper_nouns=True,
    min_definition_words=3,
)

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))

In [11]:
import os
from title_maker_pro.bad_words import ULTRA_BAD_REGEX
from website.words import WordIndex, Word
from word_service.word_service_proto import wordservice_pb2

def clean_example(w, example):
    return re.sub(re.escape(w), w, example, flags=re.IGNORECASE)

def word_filter(words):
    filters = defaultdict(int)
    ret = []
    def run_over_all_text(pat, word):
        return (
            re.search(pat, word.word.strip(), flags=re.IGNORECASE) 
            or re.search(pat, word.definition.strip(), flags=re.IGNORECASE) 
            or re.search(pat, word.example.strip(), flags=re.IGNORECASE)
        )
    
    for word in words:
        if re.search(r"(^|\b)nig+", word.word.strip()):
            filters["nig"] += 1
        elif re.search(r"(^|\b)mex+", word.word.strip()):
            filters["mex"] += 1
        elif run_over_all_text(r"(\b|^)fagg+ots*", word):
            filters["fggot"] += 1
        elif run_over_all_text(r"(\b|^)f+a+g+", word):
            filters["fg"] += 1
        elif run_over_all_text(r"ghettos?", word):
            filters["ghetto"] += 1
        elif run_over_all_text(r"skanks*", word):
            filters["sknk"] += 1
        elif run_over_all_text(r"(^|\b)p+a+k+i+(\b|$)", word):
            filters["pki"]
        elif run_over_all_text(r"(^|\b)cunt+", word):
            filters["cnt"] += 1
        elif run_over_all_text(r"(^|\b)indian($|\b)", word):
            filters['indian'] += 1
        elif run_over_all_text(r"c+h+i+n+k+", word):
            filters['chnk'] += 1
        elif run_over_all_text(r"nigga+s*", word):
            filters['ngga'] += 1
        elif run_over_all_text(r"(^|\b)slap+s*(^|\b)", word):
            filters['slap'] += 1
        elif run_over_all_text(r"(^|\b)r+a+p+e+s*(^|\b)", word):
            filters['rape'] += 1
        elif ULTRA_BAD_REGEX.search(word.word.strip()):
            filters["ultra_bad_word"] += 1
        elif ULTRA_BAD_REGEX.search(word.definition.strip()):
            filters["ultra_bad_def"] += 1
        elif ULTRA_BAD_REGEX.search(word.example.strip()):
            filters["ultra_bad_example"] += 1
        else:
            ret.append(word)
            
    for k,v in sorted(filters.items()):
        print(f"Filter '{k}' removed {100 * v / len(words):.2f}%")
        
    print(f"Total removed {100 * (1 - len(ret) / len(words)):.2f}%")
        
    return ret
        

from hyphen import Hyphenator
h_en = Hyphenator('en_US')

wi = WordIndex(
    [
        Word(
            word=w.word,
            definition=w.definition,
            pos=w.pos,
            topic=w.topic,
            example=clean_example(w.word, w.example),
            syllables=h_en.syllables(w.word),
            probably_exists=False,
            dataset_type=wordservice_pb2.DatasetType.UD_UNFILTERED,
            
        ) for w in word_filter(words)
        
    ]
)
wi.dump_encrypted("../website/data/words_ud_filtered.enc.gz", fernet_key=os.environ.get("FERNET_ENCRYPTION_KEY"))

Filter 'chnk' removed 0.08%
Filter 'cnt' removed 1.31%
Filter 'fg' removed 0.80%
Filter 'fggot' removed 0.75%
Filter 'ghetto' removed 0.57%
Filter 'indian' removed 0.18%
Filter 'mex' removed 0.15%
Filter 'ngga' removed 1.25%
Filter 'nig' removed 0.42%
Filter 'pki' removed 0.00%
Filter 'rape' removed 0.11%
Filter 'sknk' removed 0.32%
Filter 'slap' removed 0.21%
Filter 'ultra_bad_def' removed 0.24%
Filter 'ultra_bad_example' removed 0.03%
Filter 'ultra_bad_word' removed 0.14%
Total removed 6.58%


2

In [None]:
from word_service.word_service_proto import wordservice_pb