In [122]:
# Required imports
import nltk
import pandas
from nltk.corpus import stopwords
from six import iteritems
from gensim import corpora

In [123]:
# Read the comments CSV into a DataFrame.
# The path is relative to the notebook's working directory.
allComments = pandas.read_csv(
    "./data/comments.csv",
)

In [124]:
# Quick sanity check: preview the first five rows of what was loaded.
allComments.head(5)


Unnamed: 0,document_id,tracking_number,date_posted,retrieved,has_attachments,comment,document_url
0,DOI-2017-0002-0002,1k1-8wbs-ucnh,2017-05-11,2017-05-27 01:43:49.443154,False,Our national monuments are a national treasure...,https://www.regulations.gov/document?D=DOI-201...
1,DOI-2017-0002-0003,1k1-8wbs-1cws,2017-05-11,2017-05-26 21:35:25.550530,False,1.We do not want National Monument protection ...,https://www.regulations.gov/document?D=DOI-201...
2,DOI-2017-0002-0004,1k1-8wbs-oj39,2017-05-11,2017-05-30 10:14:25.162305,False,The monuments must be preserved. the precedent...,https://www.regulations.gov/document?D=DOI-201...
3,DOI-2017-0002-0005,1k1-8wbs-9rjp,2017-05-11,2017-05-30 10:14:31.861017,False,My name is Ryan Erik Benally and I'm from Mont...,https://www.regulations.gov/document?D=DOI-201...
4,DOI-2017-0002-0006,1k1-8wbs-umhr,2017-05-11,2017-05-27 04:10:25.339717,False,all protections and preservations for the enti...,https://www.regulations.gov/document?D=DOI-201...


In [125]:
# Convert comment field to string.
# Series.astype returns a new Series instead of converting in place, so the
# result must be assigned back to the column for the conversion to take effect.
# Missing comments are replaced with an empty string first; otherwise NaN
# would be stringified to the literal text 'nan' and later tokenized as a word.
allComments['comment'] = allComments['comment'].fillna('').astype('str')
allComments.dtypes

document_id        object
tracking_number    object
date_posted        object
retrieved          object
has_attachments      bool
comment            object
document_url       object
dtype: object

In [126]:
# Sample (scalable development): iterate on a small random subset first.
# random_state pins the draw so a fresh kernel run selects the same rows;
# without it every execution samples different comments and the downstream
# outputs cannot be reproduced.
SAMPLE_SIZE = 10
SAMPLE_SEED = 42
# Cap n at the frame length: sample() raises when asked for more rows than
# exist (and replace=False, the default).
someComments = allComments.sample(
    n=min(SAMPLE_SIZE, len(allComments)),
    random_state=SAMPLE_SEED,
)
# To process every comment instead of a sample, use:
# someComments = allComments
someComments.head()

Unnamed: 0,document_id,tracking_number,date_posted,retrieved,has_attachments,comment,document_url
137686,DOI-2017-0002-87304,1k1-8wl4-6wcj,2017-05-25,2017-05-27 08:18:56.524964,False,"Dear Secretary Ryan Zinke,\nBears Ears Nationa...",https://www.regulations.gov/document?D=DOI-201...
34840,DOI-2017-0002-131487,1k1-8wqr-8s1y,2017-06-03,2017-06-07 20:41:17.417558,False,Please all empowered to do so protect our Nati...,https://www.regulations.gov/document?D=DOI-201...
45928,DOI-2017-0002-14353,1k1-8wen-nggi,2017-05-16,2017-05-27 04:12:16.929094,False,Administrations in the past have done their be...,https://www.regulations.gov/document?D=DOI-201...
41399,DOI-2017-0002-137903,1k1-8wts-d8i5,2017-06-07,2017-06-07 19:15:40.769942,False,"Dear Ryan Zinke,\n\nOur national monuments and...",https://www.regulations.gov/document?D=DOI-201...
114065,DOI-2017-0002-65813,1k1-8wj8-wios,2017-05-23,2017-05-27 13:19:15.538264,False,"Dear Secretary Ryan Zinke,\n\nAs a supporter o...",https://www.regulations.gov/document?D=DOI-201...


In [127]:
# Step 1 - tokenization: run NLTK's word tokenizer over each sampled comment,
# producing one list of tokens per row.
tokenized_someComments = (
    someComments['comment']
    .apply(nltk.word_tokenize)
)
tokenized_someComments.head()

137686    [Dear, Secretary, Ryan, Zinke, ,, Bears, Ears,...
34840     [Please, all, empowered, to, do, so, protect, ...
45928     [Administrations, in, the, past, have, done, t...
41399     [Dear, Ryan, Zinke, ,, Our, national, monument...
114065    [Dear, Secretary, Ryan, Zinke, ,, As, a, suppo...
Name: comment, dtype: object

In [128]:
# Let's start with a small stopword list (entries are lowercase).
stop = {'and', 'or', 'not'}

# Lowercase every token, dropping stopwords and single-character tokens
# (which also removes lone punctuation marks such as ',').
# The stopword test is done on the lowercased token: comparing the
# original-case token would let "And", "NOT", "Or", ... slip past the
# lowercase stop list and reappear as 'and', 'not', 'or' in the result.
tokenized_someComments = tokenized_someComments.apply(
    lambda tokens: [
        token.lower()
        for token in tokens
        if len(token) > 1 and token.lower() not in stop
    ]
)
tokenized_someComments.head()

137686    [dear, secretary, ryan, zinke, bears, ears, na...
34840     [please, all, empowered, to, do, so, protect, ...
45928     [administrations, in, the, past, have, done, t...
41399     [dear, ryan, zinke, our, national, monuments, ...
114065    [dear, secretary, ryan, zinke, as, supporter, ...
Name: comment, dtype: object

In [129]:
# Stemming: map each token of each comment to its Porter stem.
porterStemmer = nltk.stem.porter.PorterStemmer()
stemmed_someComments = tokenized_someComments.apply(
    lambda tokens: [porterStemmer.stem(token) for token in tokens]
)
stemmed_someComments.head()

137686    [dear, secretari, ryan, zink, bear, ear, natio...
34840     [pleas, all, empow, to, do, so, protect, our, ...
45928     [administr, in, the, past, have, done, their, ...
41399     [dear, ryan, zink, our, nation, monument, publ...
114065    [dear, secretari, ryan, zink, as, support, of,...
Name: comment, dtype: object

In [130]:
# Lemmatization: map each token of each comment to its WordNet lemma.
# lemmatize() is called without a part-of-speech argument, so NLTK falls back
# to its default (noun); that is why a token such as 'as' comes back as 'a'.
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_someComments = tokenized_someComments.apply(
    lambda tokens: [wordnet_lemmatizer.lemmatize(token) for token in tokens]
)
lemmatized_someComments.head()

137686    [dear, secretary, ryan, zinke, bear, ear, nati...
34840     [please, all, empowered, to, do, so, protect, ...
45928     [administration, in, the, past, have, done, th...
41399     [dear, ryan, zinke, our, national, monument, p...
114065    [dear, secretary, ryan, zinke, a, supporter, o...
Name: comment, dtype: object