Combine the dataset of classified observations with the dataset of named entities.

This notebook also contains the code used to extract comments for manual validation.

In [87]:
import pandas as pd

# Load the input tables from the data/ directory.
comments = pd.read_csv('data/comments.csv')

# Only the join key and the two label columns are read from the labelled file.
labels = pd.read_csv(
    'data/comments-labelled-complete.csv',
    usecols=['document_id', 'Sentiment', 'labelled_by'],
)

# Per-document named-entity columns and the entity-type lookup table.
ne = pd.read_csv('data/named-entities-grouped.csv')
entity_types = pd.read_csv('data/entity-type-lookup.csv')

# Corrected sentiment labels, applied to `labels` further down.
relabelled = pd.read_csv('data/relabelled.csv')

In [143]:
# Print the row count of each loaded table (`relabelled` is not part of this summary).
print('Dataset Counts')
loaded_tables = [
    ('comments', comments),
    ('labels', labels),
    ('ne', ne),
    ('entity_types', entity_types),
]
for table_name, table in loaded_tables:
    print(table_name, ': ', str(len(table)))


comments :  151631
labels :  146309
ne :  146363
entity_types :  205


In [41]:
# Preview the first two rows of the sentiment labels.
labels.head(n=2)

Unnamed: 0,document_id,Sentiment,labelled_by
0,DOI-2017-0002-0019,Positive,template
1,DOI-2017-0002-0036,Positive,template


In [35]:
# Preview the first two rows of the named-entity table (one column per entity).
ne.head(n=2)

Unnamed: 0,document_id,Basin And Range,Bears Ears National Monument,Berryessa Snow Mountain,Browns Canyon National Monument,Craters Of The Moon National Monument,Canyons Of The Ancients National Monument,Carrizo Plain National Monument,Cascade Siskiyou National Monument,Gold Butte,...,Ute Mountain Ute,Wabanaki,Washington,Western,Wyoming,Yellowstone,Yosemite,Zion,Zion National Park,Zuni
0,DOI-2017-0002-0002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,DOI-2017-0002-0003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
# Preview the first two rows of the relabelled comments.
relabelled.head(n=2)

Unnamed: 0,document_id,date_posted,comment,Predicted Sentiment,Sentiment
0,DOI-2017-0002-0894,5/12/17,Please reverse the executive order by the Obam...,Positive,Negative
1,DOI-2017-0002-104418,5/26/17,Please reduce the size or reverse the decision...,Positive,Negative


In [142]:
# update labels in the labels dataset based on the relabelled items
# NOTE: this cell overwrites `labels` in place, so re-running it without first
# reloading the data prints already-updated counts under 'Before' (Before and
# After then look identical).
print('Before: ', labels['Sentiment'].value_counts())

# One corrected sentiment per document. Rows with a missing document_id can
# never match a label, and if a document was relabelled more than once the
# last entry wins -- the same outcome as applying the rows one by one in order.
corrections = (
    relabelled
    .dropna(subset=['document_id'])
    .drop_duplicates(subset='document_id', keep='last')
    .set_index('document_id')['Sentiment']
)

# Apply every correction in a single vectorised pass instead of rescanning the
# whole `labels` frame once per relabelled row.
to_fix = labels['document_id'].isin(corrections.index)
labels.loc[to_fix, 'Sentiment'] = labels.loc[to_fix, 'document_id'].map(corrections)
labels.loc[to_fix, 'labelled_by'] = 'hand'  # mark these rows as manually labelled
print('After: ', labels['Sentiment'].value_counts())

Before:  Positive    142671
Negative      3414
Neutral        224
Name: Sentiment, dtype: int64
After:  Positive    142671
Negative      3414
Neutral        224
Name: Sentiment, dtype: int64


In [99]:
# Left-join the labels and then the named-entity columns onto the comments,
# keyed on document_id. Every comment row is kept; comments with no match in
# `labels` or `ne` get NaN in the joined columns (presumably why the entity
# columns display as floats after the merge).
df = (
    comments
    .merge(labels, how='left', on='document_id')
    .merge(ne, how='left', on='document_id')
)

In [100]:
# Preview the first five rows of the merged frame.
df.head(n=5)

Unnamed: 0,document_id,tracking_number,date_posted,retrieved,has_attachments,comment,document_url,Sentiment,labelled_by,Basin And Range,...,Ute Mountain Ute,Wabanaki,Washington,Western,Wyoming,Yellowstone,Yosemite,Zion,Zion National Park,Zuni
0,DOI-2017-0002-0002,1k1-8wbs-ucnh,2017-05-11,2017-05-27 01:43:49.443154,False,Our national monuments are a national treasure...,https://www.regulations.gov/document?D=DOI-201...,Positive,classifier,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DOI-2017-0002-0003,1k1-8wbs-1cws,2017-05-11,2017-05-26 21:35:25.550530,False,1.We do not want National Monument protection ...,https://www.regulations.gov/document?D=DOI-201...,Positive,classifier,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DOI-2017-0002-0004,1k1-8wbs-oj39,2017-05-11,2017-05-30 10:14:25.162305,False,The monuments must be preserved. the precedent...,https://www.regulations.gov/document?D=DOI-201...,Positive,classifier,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DOI-2017-0002-0005,1k1-8wbs-9rjp,2017-05-11,2017-05-30 10:14:31.861017,False,My name is Ryan Erik Benally and I'm from Mont...,https://www.regulations.gov/document?D=DOI-201...,Positive,classifier,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DOI-2017-0002-0006,1k1-8wbs-umhr,2017-05-11,2017-05-27 04:10:25.339717,False,all protections and preservations for the enti...,https://www.regulations.gov/document?D=DOI-201...,Positive,classifier,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [140]:
# manually search for some mis-labelled comments

import re

# Phrases used to flag comments for manual review. The alternatives are joined
# into a single regex and matched case-insensitively anywhere in the comment.
# NOTE(review): 'rescind' appears twice in the first alternation -- redundant
# but harmless, left as-is so the pattern text is unchanged.
patterns = [r'please\s(rescind|reduce|rescind|revert|undo|reverse|shrink)',
    r'president trump',
    r'(?<!president )obama',          # "obama" not directly preceded by "president "
    r'landgrab',
    r'land grab',
    r'1\.\d{0,2}\sm',                 # "1.", up to two digits, whitespace, then "m"
    r'(?<!not )(support|favor|encourage|agree with)\srescind']
pattern = r'(' + r'|'.join(patterns) + r')'
prog = re.compile(pattern, re.IGNORECASE)

# Walk only the two columns that are needed rather than materialising every
# (very wide) row with iterrows(). Non-string comments (e.g. NaN for an empty
# comment) are skipped instead of raising TypeError inside prog.search().
matches = [
    doc_id
    for doc_id, text in zip(df['document_id'], df['comment'])
    if isinstance(text, str) and prog.search(text)
]
match_set = set(matches)

In [141]:
# Keep the regex-flagged comments that are currently labelled Positive and
# write them out for manual inspection.
is_flagged = df['document_id'].isin(match_set)
is_positive = df['Sentiment'] == 'Positive'
inspect_list = df[is_flagged & is_positive]

export_columns = ['document_id', 'comment', 'Sentiment', 'labelled_by']
inspect_list.to_csv('inspect_list4.csv', index=False, columns=export_columns)