### Flow of data preparation:
- extract a text stream from (raw) data
- clean up texts through stopword removal and lemmatization
- break sanitized text into words and collocations (features)

In [1]:
# Load in the data from CSV

import pandas

allComments = pandas.read_csv("./data/comments-labelled.csv")
allComments.head()

Unnamed: 0.1,Unnamed: 0,document_id,tracking_number,date_posted,retrieved,has_attachments,comment,document_url,ID,Sentiment,labelled_by,pos,neg
0,0,DOI-2017-0002-0002,1k1-8wbs-ucnh,2017-05-11,2017-05-27 01:43:49.443154,False,Our national monuments are a national treasure...,https://www.regulations.gov/document?D=DOI-201...,,,,0,0
1,1,DOI-2017-0002-0003,1k1-8wbs-1cws,2017-05-11,2017-05-26 21:35:25.550530,False,1.We do not want National Monument protection ...,https://www.regulations.gov/document?D=DOI-201...,,,,0,0
2,2,DOI-2017-0002-0004,1k1-8wbs-oj39,2017-05-11,2017-05-30 10:14:25.162305,False,The monuments must be preserved. the precedent...,https://www.regulations.gov/document?D=DOI-201...,,,,0,0
3,3,DOI-2017-0002-0005,1k1-8wbs-9rjp,2017-05-11,2017-05-30 10:14:31.861017,False,My name is Ryan Erik Benally and I'm from Mont...,https://www.regulations.gov/document?D=DOI-201...,,,,0,0
4,4,DOI-2017-0002-0006,1k1-8wbs-umhr,2017-05-11,2017-05-27 04:10:25.339717,False,all protections and preservations for the enti...,https://www.regulations.gov/document?D=DOI-201...,,,,0,0


In [2]:
# Leave out unused columns 

allComments.drop(allComments.columns[[0,1,2,3,4,5,7,8,11,12]], axis=1, inplace=True) 
allComments.head()

Unnamed: 0,comment,Sentiment,labelled_by
0,Our national monuments are a national treasure...,,
1,1.We do not want National Monument protection ...,,
2,The monuments must be preserved. the precedent...,,
3,My name is Ryan Erik Benally and I'm from Mont...,,
4,all protections and preservations for the enti...,,


In [3]:
# Split text into sentences
# Ref: https://stackoverflow.com/a/31505798

import re

caps = "([A-Z])"
digits = "([0-9])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + caps + "[.] "," \\1<prd> ",text)
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + caps + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = [s.strip() for s in sentences if len(s.strip())>1]
    return sentences


sentences = allComments['comment'].map(lambda x: split_into_sentences(x))

In [4]:
# Build a sentence-frequency dictionary from text

from collections import defaultdict

frequency = defaultdict(int)
for sent_list in sentences:
    for sent in sent_list:
        frequency[sent] += 1

uniqueSentences = list(frequency.keys())
print("Number of unique sentences:", len(uniqueSentences))

Number of unique sentences: 508294


In [5]:
# (Optional) Sort the sentence-frequency dictionary by frequency and write to CSV

import operator
import csv

sorted_frequency_list = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True)
print("Top 10 common sentences:")
print(sorted_frequency_list[:10])

with open('sentence_frequency.csv', 'w') as csv_output:
    fieldnames = ['frequency', 'sentence']
    writer = csv.DictWriter(csv_output, fieldnames=fieldnames)
    writer.writeheader()
    for item in sorted_frequency_list:
        writer.writerow({'frequency': item[1], 'sentence': item[0]})
   

Top 10 common sentences:
[('He and all fifteen subsequent presidents--of both parties--have recognized the need and value of protecting these public lands with the Antiquities Act.', 23034), ('These monuments are a legacy of Teddy Roosevelt.', 23033), ('Reversing any of these designations would be a tragic mistake with harmful consequences for all that depend on our magnificent public lands.', 23027), ("I urge you to uphold Roosevelt's legacy and maintain these monuments for current and future generations.", 22996), ('The national monuments created in the past twenty years have protected vital bird habitat, helped safeguard our heritage, and benefited communities across the country.', 22950), ('From the buttes of Bears Ears that support birds like the Golden Eagle, to the underwater canyons of the Northeast Canyons and Seamounts National Monument that support a critical ecosystem for Atlantic Puffins, to the rocky peaks of the Organ Mountains-Desert Peaks National Monument, the shrub-s

In [6]:
# Tokenize the corpus (all unique sentences) and create a token stream for training a collocation detector

import gensim

def tokenize(text, minimumLength=3):
    return gensim.utils.simple_preprocess(text, deacc=True, min_len=minimumLength)
    
tokenized_sentences = []
for line in uniqueSentences:
    tokenized_sentences.append(tokenize(line, 3))

In [7]:
# Train a collocation detector

from gensim.models import Phrases

bigram = Phrases(tokenized_sentences, min_count=1, threshold=2)
test = tokenize("Our national monuments are a national treasure for all to enjoy", 3)
bigram[test]



['our',
 'national_monuments',
 'are',
 'national_treasure',
 'for',
 'all',
 'enjoy']

In [8]:
# Preprocess text through stopword removal, collocation detection and lemmatization

from gensim.utils import lemmatize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stopwords = set(stopwords.words('english'))

def preprocess_text(tokenized_text):
    
    text = [[word for word in line if word not in stopwords] for line in tokenized_text]
    text = [bigram[line] for line in text]
    
    lemmatizer = WordNetLemmatizer()
    text = [[word for word in lemmatizer.lemmatize(' '.join(line), pos='v').split()] for line in text]
    return text


preprocess_text(tokenized_sentences[:3])



[['keep', 'lands', 'public', 'shrink', 'size', 'listed', 'monuments'],
 ['david_brower',
  'regretted',
  'decision',
  'compromise',
  'move',
  'dam',
  'location',
  'sierra_club',
  'lost',
  'president',
  'politic',
  'land',
  'preservation',
  'changed'],
 ['particular',
  'regularly_visited',
  'san_gabriel',
  'mountains_california',
  'resident']]

In [9]:
# Tokenize and preprocess individual comments 

def text_to_token(text):
    tokens = [tokenize(sent) for sent in split_into_sentences(text)]
    flat_list = [item for sublist in preprocess_text(tokens) for item in sublist]
    return flat_list
        
allComments['token'] = allComments['comment'].map(lambda text: text_to_token(text))
allComments.head()



Unnamed: 0,comment,Sentiment,labelled_by,token
0,Our national monuments are a national treasure...,,,"[national_monuments, national_treasure, enjoy,..."
1,1.We do not want National Monument protection ...,,,"[want, national_monument, protection_removed, ..."
2,The monuments must be preserved. the precedent...,,,"[monuments, must_preserved, precedent_removing..."
3,My name is Ryan Erik Benally and I'm from Mont...,,,"[name, ryan, erik, benally, montezuma_creek, u..."
4,all protections and preservations for the enti...,,,"[protections, preservations, entire_country, a..."


In [10]:
# Finalize the corpus and dictionary for topic modeling 

from gensim.corpora import Dictionary
from gensim import corpora

dictionary = Dictionary(allComments.token)
dictionary.compactify()
corpus = [dictionary.doc2bow(comment) for comment in allComments.token]

corpora.MmCorpus.serialize('./data/monument.mm', corpus)  # store the corpus to disk
dictionary.save('./data/monument.dict')  # store the dictionary

print(dictionary)

Dictionary(139698 unique tokens: ['favoribly', 'customs', 'number_plant', 'grasslands_wildlife', 'either_proposition']...)


In [15]:
# For training the classifer, output labelled instances as training data

allComments['token'] = allComments['token'].map(lambda x: " ".join(x))
labelledComments = allComments.dropna()
labelledComments.to_csv('./data/comments-for-train.csv', index=False)

print(labelledComments.shape)
labelledComments.sample(5)

(71663, 4)


Unnamed: 0,comment,Sentiment,labelled_by,token
52131,I am writing you in support of our national mo...,Positive,template,writing_support national_monuments comment rev...
11021,"Dear Secretary Ryan Zinke,\n\nAs a supporter o...",Positive,template,dear_secretary ryan_zinke supporter_bird conse...
76034,"Dear Secretary Ryan Zinke,\nBears Ears Nationa...",Positive,template,dear_secretary ryan_zinke bears_ears national_...
85226,Our national monuments and public lands and wa...,Positive,template,national_monuments public_lands waters_help de...
31573,Katahdin Woods National Monument is a gem. How...,Positive,hand,katahdin_woods national_monument gem even_cons...


In [17]:
# Prepare unlabelled comments for our classifier to label 

unlabelledComments = allComments[allComments['Sentiment'].notnull() == False]
unlabelledComments = unlabelledComments[unlabelledComments.token != '']
unlabelledComments.labelled_by = 'classifier'
unlabelledComments.to_csv('./data/comments-to-label.csv', index=False) 

print(unlabelledComments.shape)
unlabelledComments.sample(5)

(74646, 4)


Unnamed: 0,comment,Sentiment,labelled_by,token
63469,STOP! STOP! STOP! This is just another example...,,classifier,stop stop stop another_example greed public_op...
121238,Please don't carelessly remove the designation...,,classifier,please carelessly remove designation lands are...
64976,Trump is obsessed with killing every accomplis...,,classifier,trump obsessed killing every accomplishment pr...
25214,The fact that this 'review' is even happening ...,,classifier,fact review even_happening demonstrates short_...
100975,"Grand Staircase, Bears Ears and other national...",,classifier,grand_staircase bears_ears national_monuments ...
