In [19]:
str = """Dear all, our next All-hands meeting will be Friday, April 26th, 11-1 (Pacific), 
and will be focused on developing a prioritized list of research questions that the summer 
camp will use topic modeling to address. (Please also see my debriefing report from our 
last all-hands meeting, which includes other ongoing modeled tasks.)

Important: In advance of the meeting, can each of the collection teams (and others who are 
interested) please use this Google sheet to suggest high-priority research questions that 
WE1S can begin to work on this summer? We will discuss and prioritize these at our meeting.

Please see the links in the sheet to a definition of a well-formed question, and also to 
previously suggested questions (which can be put on the new sheet if teams feel they are 
important enough). Also, for context, see what we promised as outcomes for the Mellon grant."""

import re
str = re.sub('\s+', ' ', str)

import spacy
from collections import Counter
nlp = spacy.load('en_core_web_sm')
doc = nlp(str)
# all tokens that arent stop words or punctuations
words = [token.text for token in doc if token.is_stop != True and token.is_punct != True]

# noun tokens that arent stop words or punctuations
nouns = [token.text for token in doc if token.is_stop != True and token.is_punct != True and token.pos_ == "NOUN"]

# Bagify!
word_freq = Counter(words)
print('Bag of Words in a Dict')
print(dict(word_freq))
print('\n')

# Fun Stuff
print('All words except stop words')
print(words)
print('\n')

print('All nouns except stop words')
print(nouns)
print('\n')

print('Five most common words')
print(word_freq.most_common(5))
print('\n')

print('Noun frequencies')
noun_freq = Counter(nouns)
print(dict(noun_freq))
print('\n')
print('Five most common nouns')

print(noun_freq.most_common(5))

Bag of Words in a Dict
{'Dear': 1, 'All': 1, 'hands': 2, 'meeting': 4, 'Friday': 1, 'April': 1, '26th': 1, '11': 1, '1': 1, 'Pacific': 1, 'focused': 1, 'developing': 1, 'prioritized': 1, 'list': 1, 'research': 2, 'questions': 3, 'summer': 2, 'camp': 1, 'use': 2, 'topic': 1, 'modeling': 1, 'address': 1, 'Please': 2, 'debriefing': 1, 'report': 1, 'includes': 1, 'ongoing': 1, 'modeled': 1, 'tasks': 1, 'Important': 1, 'In': 1, 'advance': 1, 'collection': 1, 'teams': 2, 'interested': 1, 'Google': 1, 'sheet': 3, 'suggest': 1, 'high': 1, 'priority': 1, 'WE1S': 1, 'begin': 1, 'work': 1, 'We': 1, 'discuss': 1, 'prioritize': 1, 'links': 1, 'definition': 1, 'formed': 1, 'question': 1, 'previously': 1, 'suggested': 1, 'new': 1, 'feel': 1, 'important': 1, 'Also': 1, 'context': 1, 'promised': 1, 'outcomes': 1, 'Mellon': 1, 'grant': 1}


All words except stop words
['Dear', 'All', 'hands', 'meeting', 'Friday', 'April', '26th', '11', '1', 'Pacific', 'focused', 'developing', 'prioritized', 'list', 'res

In [23]:
import nltk
from nltk.stem.porter import *  
from nltk.stem.snowball import SnowballStemmer

def stemify(bag, stemmer='porter'):
    stems = {}
    if stemmer == 'snowball':
        stemmer = SnowballStemmer(language='english')
    else:
        stemmer = PorterStemmer()
    for key, value in bag.items():
        stem = stemmer.stem(key)
        if stem not in stems:
            stems[stem] = value
        else:
            stems[stem] = stems[stem] + value
    return stems

# Get a bag of stems from a bag of words
bag = dict(word_freq)
stems = stemify(bag)
stems

{'dear': 1,
 'all': 1,
 'hand': 2,
 'meet': 4,
 'friday': 1,
 'april': 1,
 '26th': 1,
 '11': 1,
 '1': 1,
 'pacif': 1,
 'focus': 1,
 'develop': 1,
 'priorit': 2,
 'list': 1,
 'research': 2,
 'question': 4,
 'summer': 2,
 'camp': 1,
 'use': 2,
 'topic': 1,
 'model': 2,
 'address': 1,
 'pleas': 2,
 'debrief': 1,
 'report': 1,
 'includ': 1,
 'ongo': 1,
 'task': 1,
 'import': 2,
 'In': 1,
 'advanc': 1,
 'collect': 1,
 'team': 2,
 'interest': 1,
 'googl': 1,
 'sheet': 3,
 'suggest': 2,
 'high': 1,
 'prioriti': 1,
 'we1': 1,
 'begin': 1,
 'work': 1,
 'We': 1,
 'discuss': 1,
 'link': 1,
 'definit': 1,
 'form': 1,
 'previous': 1,
 'new': 1,
 'feel': 1,
 'also': 1,
 'context': 1,
 'promis': 1,
 'outcom': 1,
 'mellon': 1,
 'grant': 1}