# Textual analysis of activist campaign information

### MySQL to extract the page name, manually selected tags, and html content of the top mailing for each page

```
SET @rank_pages:=0; SET @rank:=0;
select ranked.page_id, ranked.page_name, tag.name as tag_name, replace(replace(mail.html,',',''),'"','') as html
from
  (select page_id, page_name, mailing_id, count, 
          IF(@rank_pages=page_id,@rank:=@rank+1,@rank:=1) as rank, @rank_pages:=page_id
  from
	(select p.id as page_id, p.name as page_name, a.mailing_id, count(*) as count
	from core_action as a
	join core_page as p on a.page_id = p.id
	where a.mailing_id is not null
	and p.id not in (5,25,28,46,130,525,561,566,761,935,1304,1394,1862,2678,3712,8559,10668)
	and left(p.name,12) <> "controlshift"
	and p.created_at >= "2013-01-01"
	and p.lang_id = 100
	group by p.id, a.mailing_id
	order by p.id, count(*) desc
	) as unranked
  ) as ranked
join core_mailing as mail on mail.id = ranked.mailing_id
join core_page_tags as cpt on cpt.page_id = ranked.page_id
join core_tag as tag on tag.id = cpt.tag_id and tag.id IN (2,8,10,11,13,15,22,23,24,25,29,30,32,33,34,35,36,39,41,43,45,47,48,49,54,59,60,64,67,72,73,75,80,81,82,84,88,89,91,92,93,94,95,96,98,101,104,105,106,107,109,112,114,115,116,117,120,122,123,125,127,130,133,139,141,142,146,148,151,157,160,161,175,177,178,181,183,185,190,193,201,202,206,207,211,213,222,224,226,227,231,234,239,240,242,243,244,246,248,254,258,260,261,265,267,270,273,280,287,288,289,291,297,303,315,316,322,323,325,327,328,334,345,346,347,348,369,383,389,393,394,402,407,410,412,415,443,445,451,452,463,467,468,471,480,481,485,486,488,489,493,508,518,521,549,550,551,564,567,572,573,574,581,583,587,619,621,624,634,641,659,696,804,820,826,898,900,933,934,937,938,940,941,942,943,944,945,946,947,954,966,967,968,969,972,973,974,975,976,977,1000,1012,1036,1046,1071,1078,1128,1130,1132,1140,1208,1248,1282,1739,1746) 
where rank = 1
order by 1,4
```

### import modules

In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import *
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


### read the csv into a DataFrame

In [3]:
camp_txt = pd.read_csv('../capstone/text_fields.csv')
pd.options.display.max_colwidth = 500
camp_txt.head(4)

Unnamed: 0,page_id,page_name,tag_name,html
0,400,time-warner-al-jazeera,us corporation,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>In a blatantly prejudiced move Time Warner Cable dropped CurrentTV the moment it was sold to Al Jazeera.</p>\r\n<p><strong>Tell Time Warner Cable to pick CurrentTV back up and give its new owners a fair shake.<br /><...
1,400,time-warner-al-jazeera,discrimination,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>In a blatantly prejudiced move Time Warner Cable dropped CurrentTV the moment it was sold to Al Jazeera.</p>\r\n<p><strong>Tell Time Warner Cable to pick CurrentTV back up and give its new owners a fair shake.<br /><...
2,401,hbo-animal-cruelty,us corporation,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>A new lawsuit claims that animal abuse by HBO led to the death of four horses in one season of shooting <em>Luck</em>.</p>\r\n<p><strong>Tell&nbsp; HBO to investigate claims of animal abuse and enact measures to prev...
3,401,hbo-animal-cruelty,animal abuse,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>A new lawsuit claims that animal abuse by HBO led to the death of four horses in one season of shooting <em>Luck</em>.</p>\r\n<p><strong>Tell&nbsp; HBO to investigate claims of animal abuse and enact measures to prev...


### flatten tags for each campaign into a list, then turn the list into a string; each campaign is now a single row

In [4]:
camp_txt = pd.DataFrame(camp_txt.groupby(by=('page_id','page_name','html'))['tag_name'].apply(list)).reset_index()
camp_txt['tag_name'] = pd.DataFrame(camp_txt['tag_name'].apply(', '.join))
camp_txt = camp_txt[['page_id','page_name','tag_name','html']]
camp_txt.head(4)

Unnamed: 0,page_id,page_name,tag_name,html
0,400,time-warner-al-jazeera,"us corporation, discrimination",<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>In a blatantly prejudiced move Time Warner Cable dropped CurrentTV the moment it was sold to Al Jazeera.</p>\r\n<p><strong>Tell Time Warner Cable to pick CurrentTV back up and give its new owners a fair shake.<br /><...
1,401,hbo-animal-cruelty,"us corporation, animal abuse",<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>A new lawsuit claims that animal abuse by HBO led to the death of four horses in one season of shooting <em>Luck</em>.</p>\r\n<p><strong>Tell&nbsp; HBO to investigate claims of animal abuse and enact measures to prev...
2,402,gm-strike,"us corporation, working conditions, workers",<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>This man has stitched his lips together and declared a hunger strike demanding that General Motors compensate its Colombian employees for debilitating life-long injuries.</p>\r\n<p><a href=http://action.sumofus.org/a...
3,403,boeing-dreamliner-fire,"us corporation, consumer safety",<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<div>Boeing's new 787 Dreamliners keep catching on fire. Something is clearly wrong with the electrical system.</div>\r\n<div>&nbsp;</div>\r\n<div><strong>Tell Boeing to recall the 787s immediately.</strong></div>\r\n<p...


### use BeautifulSoup to clean up the html

In [5]:
pd.options.display.max_colwidth = 300
dirty = camp_txt['html']
clean = pd.Series()
for index, item in dirty.iteritems():    
    soup = BeautifulSoup(item, "lxml")
    scrubbed = (soup.get_text(strip=True))
    clean.loc[index] = scrubbed 
camp_txt['html'] = clean
camp_txt.head(4)

Unnamed: 0,page_id,page_name,tag_name,html
0,400,time-warner-al-jazeera,"us corporation, discrimination",In a blatantly prejudiced move Time Warner Cable dropped CurrentTV the moment it was sold to Al Jazeera.Tell Time Warner Cable to pick CurrentTV back up and give its new owners a fair shake.{{ user.first_name|capfirst|default:Friend }}On Wednesday Current TV announced that it had been sold to Al...
1,401,hbo-animal-cruelty,"us corporation, animal abuse",A new lawsuit claims that animal abuse by HBO led to the death of four horses in one season of shootingLuck.Tell HBO to investigate claims of animal abuse and enact measures to prevent animal cruelty in the future.{{ user.first_name|capfirst|default:Friend }}Information has come to light ofshoc...
2,402,gm-strike,"us corporation, working conditions, workers",This man has stitched his lips together and declared a hunger strike demanding that General Motors compensate its Colombian employees for debilitating life-long injuries.Tell General Motors to meet with its injured workers and negotiate.{{ user.first_name|capfirst|default:Friend }}Jorge Parra st...
3,403,boeing-dreamliner-fire,"us corporation, consumer safety",Boeing's new 787 Dreamliners keep catching on fire. Something is clearly wrong with the electrical system.Tell Boeing to recall the 787s immediately.{{ user.first_name|capfirst|default:Friend }}From the startthe Boeing 787 Dreamliner has been plagued with problemsbut now a clear pattern is emerg...


### a bit more cleaning

In [25]:
camp_txt['page_name'] = camp_txt['page_name'].str.replace('[^\w\s]',' ') #replaces all punctuation in page_name with spaces
camp_txt['tag_name'] = camp_txt['tag_name'].str.replace('[^\w\s]','') #replaces all punctuation in tag_name with empty string
camp_txt['tag_name'] = camp_txt['tag_name'].str.replace('[_]',' ') #replaces underscores in tag_name with spaces
#camp_txt['html'] = camp_txt['html'].str.replace('[^\w\s]',' ') #replaces all punctuation in page_name with spaces
camp_txt['html'] = camp_txt['html'].str.replace("{(.+)}", ' ') #removes django tags from html
camp_txt['corpus'] = camp_txt['page_name']+' '+camp_txt['tag_name']+' '+camp_txt['html'] # add it all together
camp_txt.drop(['tag_name','html'], axis=1, inplace=True) #remove redundant columns
camp_txt

KeyError: 'tag_name'

In [24]:
#tokenize and remove stopwords
pd.options.display.max_colwidth = 100
stop_words = set(stopwords.words('english'))
camp_txt['tokens'] = camp_txt.apply(lambda row: nltk.word_tokenize(row['corpus']), axis=1)
camp_txt['tokens'] = camp_txt['tokens'].apply(lambda x: [item for item in x if item not in stop_words])
camp_txt.head(4)

AttributeError: 'Series' object has no attribute 'translate'

In [8]:
#Get a list of the most frequent words.  Use this to identify and add additional stop words, iterate w/ step above
token_list = camp_txt['tokens'].sum()
fdist1 = FreqDist(token_list)
fdist1.most_common(25)

[("'s", 5658),
 ('--', 5593),
 ('workers', 1964),
 ('oil', 1701),
 ('government', 1645),
 ("n't", 1543),
 ('Monsanto', 1188),
 ('public', 1179),
 ('pay', 1035),
 ('food', 909),
 ('water', 896),
 ("'", 864),
 ('trade', 860),
 ('climate', 847),
 ('palm', 841),
 ('rights', 791),
 ('tax', 779),
 ('UK', 688),
 ('Rights', 686),
 ('US', 682),
 ('TPP', 671),
 ('Environment', 631),
 ('health', 619),
 ('work', 596),
 ('global', 592)]

In [22]:
pos_tuple = nltk.pos_tag(token_list)
nouns_tuple = [word for word,pos in pos_tuple if pos == 'NN']
print(nouns_tuple)



In [18]:
from gensim import corpora, models

camp_list = camp_txt['tokens'].tolist()
camp_list

dictionary = corpora.Dictionary(camp_list)
corpus = [dictionary.doc2bow(text) for text in camp_list]
#print(corpus[0])

ldamodel = models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary, passes=20)
print(ldamodel.print_topics(num_topics=20,num_words=3))

[(0, "0.014*-- + 0.013*'s + 0.004*Rio"), (1, "0.020*Monsanto + 0.019*tax + 0.015*'s"), (2, "0.021*oil + 0.017*'s + 0.015*--"), (3, '0.011*-- + 0.008*fracking + 0.006*SeaWorld'), (4, '0.028*workers + 0.012*-- + 0.011*pay'), (5, '0.024*water + 0.013*Nestlé + 0.012*--'), (6, "0.013*'s + 0.012*climate + 0.009*--"), (7, '0.022*trade + 0.015*TPP + 0.012*--'), (8, "0.018*-- + 0.010*government + 0.008*'s"), (9, '0.014*-- + 0.008*Facebook + 0.008*police')]
