In [187]:
# Required imports
import pprint
from collections import Counter

import nltk
import pandas
from gensim import corpora
from nltk.corpus import stopwords
from six import iteritems

In [221]:
# Configuration
# Number of rows requested via `n=` when sampling the comments for development.
SAMPLE_SIZE = 1000


In [223]:
# Read the comments CSV (path is relative to the notebook's working directory)
# into a DataFrame.
allComments = pandas.read_csv(filepath_or_buffer="./data/comments.csv")

In [224]:
# Peek at the first five rows to check what was loaded.
allComments.head(n=5)


Unnamed: 0,document_id,tracking_number,date_posted,retrieved,has_attachments,comment,document_url
0,DOI-2017-0002-0002,1k1-8wbs-ucnh,2017-05-11,2017-05-27 01:43:49.443154,False,Our national monuments are a national treasure...,https://www.regulations.gov/document?D=DOI-201...
1,DOI-2017-0002-0003,1k1-8wbs-1cws,2017-05-11,2017-05-26 21:35:25.550530,False,1.We do not want National Monument protection ...,https://www.regulations.gov/document?D=DOI-201...
2,DOI-2017-0002-0004,1k1-8wbs-oj39,2017-05-11,2017-05-30 10:14:25.162305,False,The monuments must be preserved. the precedent...,https://www.regulations.gov/document?D=DOI-201...
3,DOI-2017-0002-0005,1k1-8wbs-9rjp,2017-05-11,2017-05-30 10:14:31.861017,False,My name is Ryan Erik Benally and I'm from Mont...,https://www.regulations.gov/document?D=DOI-201...
4,DOI-2017-0002-0006,1k1-8wbs-umhr,2017-05-11,2017-05-27 04:10:25.339717,False,all protections and preservations for the enti...,https://www.regulations.gov/document?D=DOI-201...


In [225]:
# Convert comment field to string.
# `astype` returns a new Series rather than modifying the column in place, so
# the result has to be assigned back for the conversion to take effect.
# Missing comments are replaced with "" first; otherwise `astype('str')` would
# turn NaN into the literal text "nan", which would later be counted as a word.
allComments['comment'] = allComments['comment'].fillna('').astype('str')
allComments.dtypes

document_id        object
tracking_number    object
date_posted        object
retrieved          object
has_attachments      bool
comment            object
document_url       object
dtype: object

In [226]:
# Sample (scalable development)
# - `n` is capped at the number of available rows because DataFrame.sample
#   raises ValueError when asked for more rows than exist (without replacement).
# - A fixed `random_state` makes the draw repeatable across kernel restarts,
#   so downstream outputs can be reproduced.
someComments = allComments.sample(n=min(SAMPLE_SIZE, len(allComments)), random_state=42)
# To run on the full data set instead, use: someComments = allComments
someComments.head()

Unnamed: 0,document_id,tracking_number,date_posted,retrieved,has_attachments,comment,document_url
69323,DOI-2017-0002-25109,1k1-8wfa-9t7u,2017-05-17,2017-05-26 21:55:47.096718,False,"I am a native of Colorado, born and raised to ...",https://www.regulations.gov/document?D=DOI-201...
17061,DOI-2017-0002-114588,1k1-8wlz-wwmm,2017-05-27,2017-05-30 18:20:16.859381,False,National lands belong to all of us. They are A...,https://www.regulations.gov/document?D=DOI-201...
9938,DOI-2017-0002-108084,1k1-8wlr-h1ge,2017-05-26,2017-05-30 11:28:27.125953,False,Please rescind Bears Ears National Monument!,https://www.regulations.gov/document?D=DOI-201...
85830,DOI-2017-0002-40126,1k1-8wgd-1hzi,2017-05-18,2017-05-26 22:45:33.874416,False,"Dear Secretary Ryan Zinke,\n\nI travel a lot f...",https://www.regulations.gov/document?D=DOI-201...
22802,DOI-2017-0002-119827,1k1-8wmq-rmod,2017-05-28,2017-05-30 19:17:00.509387,False,I am writing you in support of our national mo...,https://www.regulations.gov/document?D=DOI-201...


In [227]:
# Tokenize: run NLTK's word tokenizer over every sampled comment, producing
# one list of tokens per row.
comment_column = someComments["comment"]
tokenized_someComments = comment_column.apply(nltk.word_tokenize)
tokenized_someComments.head()

69323    [I, am, a, native, of, Colorado, ,, born, and,...
17061    [National, lands, belong, to, all, of, us, ., ...
9938     [Please, rescind, Bears, Ears, National, Monum...
85830    [Dear, Secretary, Ryan, Zinke, ,, I, travel, a...
22802    [I, am, writing, you, in, support, of, our, na...
Name: comment, dtype: object

In [236]:
# Let's start with a small stopword list: NLTK's English list plus a few extras.
stop = list(stopwords.words('english'))
stop.extend(["to", "from", "'s"])

# Set copy for fast membership tests; the list itself is kept so the report
# printed below is unchanged.
stop_set = set(stop)

# using NLTK stopword list
print("We removed a total of " + str(len(stop)) + " stopwords: " + str(stop))

# Lowercase each token BEFORE comparing it with the (all-lowercase) stopword
# list. Checking the original-case token let capitalised stopwords such as
# "The", "We" or "Our" slip through the filter.
# Tokens of length 1 (single letters, lone punctuation marks) are dropped too.
tokenized_someComments = tokenized_someComments.apply(
    lambda tokens: [
        lowered
        for lowered in (token.lower() for token in tokens)
        if lowered not in stop_set and len(lowered) > 1
    ]
)
tokenized_someComments.head()

We removed a total of 156 stopwords: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can',

69323    [native, colorado, born, raised, love, outdoor...
17061    [national, lands, belong, us, america, greates...
9938     [please, rescind, bears, ears, national, monum...
85830    [dear, secretary, ryan, zinke, travel, lot, wo...
22802    [writing, support, national, monuments, commen...
Name: comment, dtype: object

In [238]:
# Stemming: reduce every token to its Porter stem, row by row.
porterStemmer = nltk.stem.porter.PorterStemmer()
stemmed_someComments = tokenized_someComments.apply(
    lambda tokens: [porterStemmer.stem(token) for token in tokens]
)
stemmed_someComments.head()

69323    [nativ, colorado, born, rais, love, outdoor, w...
17061    [nation, land, belong, us, america, greatest, ...
9938         [pleas, rescind, bear, ear, nation, monument]
85830    [dear, secretari, ryan, zink, travel, lot, wor...
22802    [write, support, nation, monument, comment, re...
Name: comment, dtype: object

In [239]:
# Lemmatization: map every token to its WordNet lemma, row by row.
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_someComments = tokenized_someComments.apply(
    lambda tokens: [wordnet_lemmatizer.lemmatize(token) for token in tokens]
)
lemmatized_someComments.head()

69323    [native, colorado, born, raised, love, outdoor...
17061    [national, land, belong, u, america, greatest,...
9938      [please, rescind, bear, ear, national, monument]
85830    [dear, secretary, ryan, zinke, travel, lot, wo...
22802    [writing, support, national, monument, comment...
Name: comment, dtype: object

In [240]:
# Apply the Porter stemmer on top of the already lemmatized tokens.
stemmed_and_lemmatized_someComments = lemmatized_someComments.apply(
    lambda lemmas: [porterStemmer.stem(lemma) for lemma in lemmas]
)
stemmed_and_lemmatized_someComments.head()

69323    [nativ, colorado, born, rais, love, outdoor, w...
17061    [nation, land, belong, u, america, greatest, i...
9938         [pleas, rescind, bear, ear, nation, monument]
85830    [dear, secretari, ryan, zink, travel, lot, wor...
22802    [write, support, nation, monument, comment, re...
Name: comment, dtype: object

In [241]:
# Get a frequency count using Counter (a dict subclass that tallies hashable items).
# Counter comes from `collections` and must be imported in the imports cell.
c = Counter()

# Build the counter.
# Iterating a Series yields its values (here: one token list per comment), which
# avoids Series.iteritems() -- that method was removed in pandas 2.0.
# Counter.update adds one count per element of the iterable it is given.
for tokens in lemmatized_someComments:
    c.update(tokens)

In [243]:
# Show the 100 most frequent lemmas; the narrow width forces roughly one
# (word, count) pair per line.
pp = pprint.PrettyPrinter(compact=True, width=20)
top_words = c.most_common(100)
pp.pprint(top_words)


[('monument', 3722),
 ('national', 3107),
 ('land', 1833),
 ('public', 1296),
 ('water', 667),
 ('support', 665),
 ('bear', 658),
 ('ear', 654),
 ('bird', 633),
 ('protection',
  574),
 ('zinke', 572),
 ('secretary', 564),
 ('dear', 534),
 ('protected', 533),
 ('urge', 512),
 ('designation',
  495),
 ('ryan', 489),
 ('country', 471),
 ('would', 462),
 ('habitat', 452),
 ('canyon', 435),
 ('legacy', 434),
 ('president', 431),
 ('future', 424),
 ('american', 423),
 ('please', 419),
 ('protect', 415),
 ('community', 405),
 ('act', 403),
 ('roosevelt', 403),
 ('--', 397),
 ('place', 384),
 ('important', 377),
 ('generation',
  375),
 ('review', 367),
 ('heritage', 364),
 ('year', 361),
 ('peak', 352),
 ('landscape', 350),
 ('natural', 342),
 ('antiquity', 335),
 ('sincerely', 321),
 ('need', 314),
 ('park', 310),
 ('change', 302),
 ('like', 301),
 ('nation', 296),
 ('area', 294),
 ('strongly', 293),
 ('value', 287),
 ('wildlife', 284),
 ('one', 262),
 ('current', 260),
 ('across', 252),
 (