# Generate Interesting Improv bot suggestions

In [114]:
from tqdm.notebook import trange, tqdm
import itertools
import more_itertools
import sys
from collections import defaultdict

In [161]:
def load_word_list(filename):
    """Read a one-word-per-line file into a list of ``(word, rank)`` tuples.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of ``(word, rank)`` tuples, where ``word`` is the line with
        surrounding whitespace stripped and ``rank`` is the zero-based line
        number of that word in the file.

    Blank lines are skipped: an empty word would later produce a token-less
    document and crash the tagging step. Skipped lines still consume a rank,
    so every remaining word keeps the rank it had in the file.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(filename, encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [(word, idx) for idx, word in enumerate(stripped_lines) if word]

# Build the candidate pool from the first 2000 medium-length entries and the
# first 1500 long entries; each entry is a (word, rank) tuple.
words = []
words.extend(load_word_list('google-10000-english-usa-no-swears-medium.txt')[:2000])
words.extend(load_word_list('google-10000-english-usa-no-swears-long.txt')[:1500])
#words = words + load_word_list('google-10000-english-usa-no-swears-short.txt') # note: these are quite boring
print('Loaded {} words'.format(len(words)))
    

Loaded 3500 words


## Load Spacy for NLP
This part can be slow: we load spaCy and run part-of-speech tagging on every word.

In [116]:
import spacy
from spacy_wordnet.wordnet_annotator import WordnetAnnotator 

# Load the small English model package, then register the WordNet annotator
# so that it runs right after the tagger component.
nlp = spacy.load("en_core_web_sm")
wordnet_annotator = WordnetAnnotator(nlp.lang)
nlp.add_pipe(wordnet_annotator, after='tagger')

In [163]:
# Tag every word with its part of speech. Each result row is
# [word, coarse POS, fine-grained tag, rank].
tagged_words = []
for word, rank in tqdm(words):
    doc = nlp(word)
    # An empty entry produces a document with no tokens, in which case
    # doc[0] would raise IndexError; skip such entries instead of crashing.
    if len(doc) == 0:
        continue
    # Only the first token is inspected.
    # part of speech is most general, tag is more specific
    first_token = doc[0]
    tagged_words.append([word, first_token.pos_, first_token.tag_, rank])


  0%|          | 0/3500 [00:00<?, ?it/s]

In [164]:
# Build a tag -> description lookup from the tab-separated pos_tags.txt file.
# Unknown tags resolve to an empty string thanks to defaultdict(str).
pos_descriptions = defaultdict(str)
with open('pos_tags.txt') as tag_file:
    for row in tag_file:
        fields = row.split("\t")
        # Only rows with exactly four tab-separated fields are used.
        if len(fields) != 4:
            continue
        # First field is the lookup key, fourth is its description.
        pos_descriptions[fields[0]] = fields[3].strip()

In [165]:
# Output words in each POS so we can see which ones we like
def clamp(n, smallest, largest):
    """Restrict ``n`` to the range bounded by ``smallest`` and ``largest``.

    The upper bound is applied first and the lower bound second, so the
    lower bound wins if the two bounds are inconsistent.
    """
    upper_bounded = min(n, largest)
    return max(smallest, upper_bounded)

def _pos_and_tag(entry):
    """Return the (coarse POS, fine-grained tag) pair of a tagged-word row."""
    return (entry[1], entry[2])

# groupby only merges adjacent rows, so sort by the grouping key first.
words_by_pos = sorted(tagged_words, key=_pos_and_tag)
by_pos = itertools.groupby(words_by_pos, _pos_and_tag)
# groupby yields one-shot iterators; turn each group into a list so it can be
# measured and sliced below.
materialized_by_pos = [(group, list(items)) for group, items in by_pos]

# Largest groups first: print a header line, then the first 40 words.
for (pos, tag), members in sorted(materialized_by_pos, key=lambda p: -len(p[1])):
    header = ''.join([
        'Group: ', pos, ' - ', tag,
        ' (', pos_descriptions[tag], ') ', str(len(members)),
    ])
    print(header)

    # Every word is followed by a single space, including the last one.
    sys.stdout.write(''.join(entry[0] + ' ' for entry in members[:40]))
    sys.stdout.write("\n\n")

sys.stdout.flush()

Group: NOUN - NN (noun, singular or mass) 1030
search contact business service price state email music product system policy number support message software video school review order privacy company group research program hotel travel report member office design internet address phone shipping forum family website index today project 

Group: PROPN - NNP (noun, proper singular) 678
health world january united center store national reserved south house county american download north white account digital reply december canada english windows thread october november series model forums march yahoo server april street standard mobile party login social august america 

Group: NOUN - NNS (noun, plural) 653
services people products rights books links years items reviews games comments details terms hotels results states prices women pages sports members systems posts media pictures children students times sites events hours tools movies sales photos features articles issues users things 

G

In [166]:
# Keep only the rows tagged NOUN / NN (listed above as "noun, singular or mass").
filtered_words = [
    entry
    for entry in tagged_words
    if (entry[1], entry[2]) == ('NOUN', 'NN')
]


In [169]:
# Trying it out
import random
# Show one uniformly chosen suggestion row; random.choice replaces the
# hand-rolled randint index arithmetic.
random.choice(filtered_words)


['aircraft', 'NOUN', 'NN', 1743]

In [174]:
# Export the suggestions as one line of comma-separated, double-quoted words.
quoted_words = ['"{}"'.format(entry[0]) for entry in filtered_words]
text = ", ".join(quoted_words)
with open('suggies.txt', 'w') as out_file:
    out_file.write(text)
print("Wrote {} suggestions".format(len(filtered_words)))

Wrote 1030 suggestions
