# Generate Interesting Improv bot suggestions

In [55]:
from tqdm.notebook import trange, tqdm
import itertools
import more_itertools
import sys
from collections import defaultdict

In [36]:
def load_word_list(filename):
    """Read a one-word-per-line text file and return the words as a list.

    Surrounding whitespace (including the trailing newline) is removed from
    every line. Lines that are empty after stripping are dropped, because an
    empty string is not a word and would produce a token-less spaCy Doc when
    tagged later in this notebook.

    Args:
        filename: Path of the word list to read.

    Returns:
        list[str]: The non-empty, stripped lines in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(filename, encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [word for word in stripped_lines if word]

# Combine the medium-length and long word lists (in that order) into one
# flat list of candidate words.
words = []
for word_file in (
    'google-10000-english-usa-no-swears-medium.txt',
    'google-10000-english-usa-no-swears-long.txt',
):
    words = words + load_word_list(word_file)

# Report how many words were loaded in total.
print('Loaded {} words'.format(len(words)))
    

Loaded 7700 words


## Load spaCy for NLP
This part can be slow: we load spaCy and run part-of-speech tagging on every word.

In [6]:
import spacy
# Load the "en_core_web_sm" model package. The resulting `nlp` object is what
# the tagging cell below uses to obtain part-of-speech tags for each word.
nlp = spacy.load("en_core_web_sm")


In [37]:
# Tag every word with spaCy. Each entry of `tagged_words` is a list
# [word, pos_, tag_]: part of speech (pos_) is the most general label,
# tag (tag_) is more specific.
tagged_words = []
# nlp.pipe processes the texts as a stream in batches, which spaCy documents
# as more efficient than calling nlp() separately on each text. It yields
# Docs in input order, so zipping with `words` pairs each Doc with its word.
for word, doc in tqdm(zip(words, nlp.pipe(words)), total=len(words)):
    # A Doc with no tokens (e.g. from an empty string) has no doc[0];
    # skip it instead of raising IndexError.
    if len(doc) == 0:
        continue
    # Only the first token is inspected; entries are expected to be single words.
    first_token = doc[0]
    tagged_words.append([word, first_token.pos_, first_token.tag_])

  0%|          | 0/7700 [00:00<?, ?it/s]

In [58]:
# Load POS tags for reading
# Map a tag (first column of pos_tags.txt) to its human-readable description
# (fourth column). Unknown tags resolve to '' because of defaultdict(str).
pos_descriptions = defaultdict(str)
with open('pos_tags.txt') as pos_file:
    for row in pos_file:
        fields = row.split("\t")
        # Only rows with exactly four tab-separated columns are used.
        if len(fields) != 4:
            continue
        tag_name = fields[0]
        description = fields[3].strip()
        pos_descriptions[tag_name] = description

In [60]:
# Output words in each POS so we can see which ones we like
def clamp(n, smallest, largest):
    """Limit ``n`` to the range bounded by ``smallest`` and ``largest``.

    The upper bound is applied first, then the lower bound, so when the
    bounds are inverted (``smallest > largest``) the result is ``smallest``.
    """
    capped = min(n, largest)
    return max(smallest, capped)

def _pos_key(tagged):
    """Grouping key for a tagged word: its (part of speech, tag) pair."""
    return (tagged[1], tagged[2])


# groupby only merges adjacent entries, so sort on the same key first.
words_by_pos = sorted(tagged_words, key=_pos_key)
by_pos = itertools.groupby(words_by_pos, _pos_key)
# Copy each group's members into a list while iterating, since the group
# iterators are consumed as groupby advances.
materialized_by_pos = [(key, list(members)) for key, members in by_pos]

# Print the groups from largest to smallest. reverse=True keeps the sort
# stable, so groups of equal size stay in (pos, tag) order.
for group, items in sorted(materialized_by_pos, key=lambda p: len(p[1]), reverse=True):
    pos, tag = group
    print('Group: ', pos, ' - ', tag, ' (', pos_descriptions[tag], ') ', len(items), sep='')

    # Show at most the first 40 words of the group, each followed by a space.
    sample = items[:40]
    sys.stdout.write(''.join(item[0] + ' ' for item in sample))
    sys.stdout.write("\n\n")

sys.stdout.flush()

Group: PROPN - NNP (noun, proper singular) 2134
health world january united center store national reserved south house county american download north white account digital reply december canada english windows thread october november series model forums march yahoo server april street standard mobile party login social august america 

Group: NOUN - NN (noun, singular or mass) 1804
search contact business service price state email music product system policy number support message software video school review order privacy company group research program hotel travel report member office design internet address phone shipping forum family website index today project 

Group: NOUN - NNS (noun, plural) 1385
services people products rights books links years items reviews games comments details terms hotels results states prices women pages sports members systems posts media pictures children students times sites events hours tools movies sales photos features articles issues users things 
