# Textual analysis of activist campaign information

### MySQL to extract the page name, manually selected tags, and html content of the top mailing for each page

```
SET @rank_pages:=0; SET @rank:=0;
select ranked.page_id, ranked.page_name, tag.name as tag_name, replace(replace(mail.html,',',''),'"','') as html
from
  (select page_id, page_name, mailing_id, count, 
          IF(@rank_pages=page_id,@rank:=@rank+1,@rank:=1) as rank, @rank_pages:=page_id
  from
	(select p.id as page_id, p.name as page_name, a.mailing_id, count(*) as count
	from core_action as a
	join core_page as p on a.page_id = p.id
	where a.mailing_id is not null
	and p.id not in (5,25,28,46,130,525,561,566,761,935,1304,1394,1862,2678,3712,8559,10668)
	and left(p.name,12) <> "controlshift"
	and p.created_at >= "2013-01-01"
	and p.lang_id = 100
	group by p.id, a.mailing_id
	order by p.id, count(*) desc
	) as unranked
  ) as ranked
join core_mailing as mail on mail.id = ranked.mailing_id
join core_page_tags as cpt on cpt.page_id = ranked.page_id
join core_tag as tag on tag.id = cpt.tag_id and tag.id IN (2,8,10,11,13,15,22,23,24,25,29,30,32,33,34,35,36,39,41,43,45,47,48,49,54,59,60,64,67,72,73,75,80,81,82,84,88,89,91,92,93,94,95,96,98,101,104,105,106,107,109,112,114,115,116,117,120,122,123,125,127,130,133,139,141,142,146,148,151,157,160,161,175,177,178,181,183,185,190,193,201,202,206,207,211,213,222,224,226,227,231,234,239,240,242,243,244,246,248,254,258,260,261,265,267,270,273,280,287,288,289,291,297,303,315,316,322,323,325,327,328,334,345,346,347,348,369,383,389,393,394,402,407,410,412,415,443,445,451,452,463,467,468,471,480,481,485,486,488,489,493,508,518,521,549,550,551,564,567,572,573,574,581,583,587,619,621,624,634,641,659,696,804,820,826,898,900,933,934,937,938,940,941,942,943,944,945,946,947,954,966,967,968,969,972,973,974,975,976,977,1000,1012,1036,1046,1071,1078,1128,1130,1132,1140,1208,1248,1282,1739,1746) 
where rank = 1
order by 1,4
```

### import modules

In [12]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import *
from nltk.book import *
from gensim import corpora, models

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


### read the csv into a DataFrame

In [2]:
camp_txt = pd.read_csv('../capstone/text_fields.csv')
pd.options.display.max_colwidth = 500
camp_txt.head(4)

Unnamed: 0,page_id,page_name,tag_name,html
0,400,time-warner-al-jazeera,us corporation,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>In a blatantly prejudiced move Time Warner Cable dropped CurrentTV the moment it was sold to Al Jazeera.</p>\r\n<p><strong>Tell Time Warner Cable to pick CurrentTV back up and give its new owners a fair shake.<br /><...
1,400,time-warner-al-jazeera,discrimination,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>In a blatantly prejudiced move Time Warner Cable dropped CurrentTV the moment it was sold to Al Jazeera.</p>\r\n<p><strong>Tell Time Warner Cable to pick CurrentTV back up and give its new owners a fair shake.<br /><...
2,401,hbo-animal-cruelty,us corporation,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>A new lawsuit claims that animal abuse by HBO led to the death of four horses in one season of shooting <em>Luck</em>.</p>\r\n<p><strong>Tell&nbsp; HBO to investigate claims of animal abuse and enact measures to prev...
3,401,hbo-animal-cruelty,animal abuse,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>A new lawsuit claims that animal abuse by HBO led to the death of four horses in one season of shooting <em>Luck</em>.</p>\r\n<p><strong>Tell&nbsp; HBO to investigate claims of animal abuse and enact measures to prev...


### flatten tags for each campaign into a list, then turn the list into a string; each campaign is now a single row

In [3]:
camp_txt = pd.DataFrame(camp_txt.groupby(by=('page_id','page_name','html'))['tag_name'].apply(list)).reset_index()
camp_txt['tag_name'] = pd.DataFrame(camp_txt['tag_name'].apply(', '.join))
camp_txt = camp_txt[['page_id','page_name','tag_name','html']]
camp_txt.head(4)

Unnamed: 0,page_id,page_name,tag_name,html
0,400,time-warner-al-jazeera,"us corporation, discrimination",<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>In a blatantly prejudiced move Time Warner Cable dropped CurrentTV the moment it was sold to Al Jazeera.</p>\r\n<p><strong>Tell Time Warner Cable to pick CurrentTV back up and give its new owners a fair shake.<br /><...
1,401,hbo-animal-cruelty,"us corporation, animal abuse",<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>A new lawsuit claims that animal abuse by HBO led to the death of four horses in one season of shooting <em>Luck</em>.</p>\r\n<p><strong>Tell&nbsp; HBO to investigate claims of animal abuse and enact measures to prev...
2,402,gm-strike,"us corporation, working conditions, workers",<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>This man has stitched his lips together and declared a hunger strike demanding that General Motors compensate its Colombian employees for debilitating life-long injuries.</p>\r\n<p><a href=http://action.sumofus.org/a...
3,403,boeing-dreamliner-fire,"us corporation, consumer safety",<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<div>Boeing's new 787 Dreamliners keep catching on fire. Something is clearly wrong with the electrical system.</div>\r\n<div>&nbsp;</div>\r\n<div><strong>Tell Boeing to recall the 787s immediately.</strong></div>\r\n<p...


### use BeautifulSoup to clean up the html

In [4]:
pd.options.display.max_colwidth = 300
dirty = camp_txt['html']
clean = pd.Series()
for index, item in dirty.iteritems():    
    soup = BeautifulSoup(item, "lxml")
    scrubbed = (soup.get_text(strip=True))
    clean.loc[index] = scrubbed 
camp_txt['html'] = clean
camp_txt.head(4)

Unnamed: 0,page_id,page_name,tag_name,html
0,400,time-warner-al-jazeera,"us corporation, discrimination",In a blatantly prejudiced move Time Warner Cable dropped CurrentTV the moment it was sold to Al Jazeera.Tell Time Warner Cable to pick CurrentTV back up and give its new owners a fair shake.{{ user.first_name|capfirst|default:Friend }}On Wednesday Current TV announced that it had been sold to Al...
1,401,hbo-animal-cruelty,"us corporation, animal abuse",A new lawsuit claims that animal abuse by HBO led to the death of four horses in one season of shootingLuck.Tell HBO to investigate claims of animal abuse and enact measures to prevent animal cruelty in the future.{{ user.first_name|capfirst|default:Friend }}Information has come to light ofshoc...
2,402,gm-strike,"us corporation, working conditions, workers",This man has stitched his lips together and declared a hunger strike demanding that General Motors compensate its Colombian employees for debilitating life-long injuries.Tell General Motors to meet with its injured workers and negotiate.{{ user.first_name|capfirst|default:Friend }}Jorge Parra st...
3,403,boeing-dreamliner-fire,"us corporation, consumer safety",Boeing's new 787 Dreamliners keep catching on fire. Something is clearly wrong with the electrical system.Tell Boeing to recall the 787s immediately.{{ user.first_name|capfirst|default:Friend }}From the startthe Boeing 787 Dreamliner has been plagued with problemsbut now a clear pattern is emerg...


### a bit more cleaning

In [7]:
camp_txt['page_name'] = camp_txt['page_name'].str.replace('[^\w\s]',' ') #replaces all punctuation in page_name with spaces
camp_txt['tag_name'] = camp_txt['tag_name'].str.replace('[^\w\s]',' ') #replaces all punctuation in tag_name with spaces
camp_txt['tag_name'] = camp_txt['tag_name'].str.replace('[_]',' ') #replaces underscores in tag_name with spaces
camp_txt['html'] = camp_txt['html'].str.replace('[^\w\s]',' ') #replaces all punctuation in page_name with spaces
camp_txt['html'] = camp_txt['html'].str.replace("{(.+)}", ' ') #removes django tags from html

# this is my very unsophisticated way of weighing the tag names x4 to the rest of the words
tag_weight = camp_txt['tag_name']+' '+camp_txt['tag_name']+' '+camp_txt['tag_name']+' '+camp_txt['tag_name']

# put it together and lowercase everything
camp_txt['corpus'] = (camp_txt['page_name']+' '+tag_weight+' '+camp_txt['html']).str.lower()
camp_txt.head(5)

Unnamed: 0,page_id,page_name,tag_name,html,corpus,tokens
0,400,time warner al jazeera,us corporation discrimination,In a blatantly prejudiced move Time Warner Cable dropped CurrentTV the moment it was sold to Al ...,time warner al jazeera us corporation discrimination us corporation discrimination us corporat...,"[discrimination, discrimination, discrimination, discrimination, blatantly, prejudiced, cable, d..."
1,401,hbo animal cruelty,us corporation animal abuse,A new lawsuit claims that animal abuse by HBO led to the death of four horses in one season of s...,hbo animal cruelty us corporation animal abuse us corporation animal abuse us corporation ani...,"[hbo, animal, cruelty, animal, animal, animal, animal, animal, hbo, led, horses, season, shootin..."
2,402,gm strike,us corporation working conditions workers,This man has stitched his lips together and declared a hunger strike demanding that General Moto...,gm strike us corporation working conditions workers us corporation working conditions worker...,"[strike, stitched, lips, declared, hunger, strike, motors, compensate, colombian, debilitating, ..."
3,403,boeing dreamliner fire,us corporation consumer safety,Boeing s new 787 Dreamliners keep catching on fire Something is clearly wrong with the electric...,boeing dreamliner fire us corporation consumer safety us corporation consumer safety us corpor...,"[dreamliner, consumer, safety, consumer, safety, consumer, safety, consumer, safety, 787, dreaml..."
4,405,anz mining,bank Australian corporation,ANZ Bank promised when signing the Equator Principles to not loan money to projects that have a ...,anz mining bank australian corporation bank australian corporation bank australian corporatio...,"[anz, mining, bank, bank, bank, bank, anz, bank, promised, equator, principles, loan, projects, ..."


In [8]:
#tokenize and remove stopwords
pd.options.display.max_colwidth = 100
stop_words = set(stopwords.words('english'))
camp_txt['tokens'] = camp_txt.apply(lambda row: nltk.word_tokenize(row['corpus']), axis=1)
camp_txt['tokens'] = camp_txt['tokens'].apply(lambda x: [item for item in x if item not in stop_words])
camp_txt[['page_id','page_name','corpus','tokens']]

Unnamed: 0,page_id,page_name,corpus,tokens
0,400,time warner al jazeera,time warner al jazeera us corporation discrimination us corporation discrimination us corporat...,"[discrimination, discrimination, discrimination, discrimination, blatantly, prejudiced, cable, d..."
1,401,hbo animal cruelty,hbo animal cruelty us corporation animal abuse us corporation animal abuse us corporation ani...,"[hbo, animal, cruelty, animal, animal, animal, animal, animal, hbo, led, horses, season, shootin..."
2,402,gm strike,gm strike us corporation working conditions workers us corporation working conditions worker...,"[strike, stitched, lips, declared, hunger, strike, motors, compensate, colombian, debilitating, ..."
3,403,boeing dreamliner fire,boeing dreamliner fire us corporation consumer safety us corporation consumer safety us corpor...,"[dreamliner, consumer, safety, consumer, safety, consumer, safety, consumer, safety, 787, dreaml..."
4,405,anz mining,anz mining bank australian corporation bank australian corporation bank australian corporatio...,"[anz, mining, bank, bank, bank, bank, anz, bank, promised, equator, principles, loan, projects, ..."
5,406,drowning whales,drowning whales canada canada canada canada a pod of whales is drowning in the frozen sea of nor...,"[drowning, whales, pod, whales, drowning, frozen, sea, northern, whales, northa, killer, whales,..."
6,407,anglogold,anglogold africa working conditions workers mining africa working conditions workers minin...,"[anglogold, africa, mining, africa, mining, africa, mining, africa, mining, miners, dying, easil..."
7,408,goldcorp,goldcorp workers mining workers mining workers mining workers mining 13 employees of goldcor...,"[mining, mining, mining, mining, mining, dispute, compensate, absolutely, horrendous, shot, guat..."
8,409,newtown walmart,newtown walmart offline action taker walmart gun control offline action taker walmart gun co...,"[newtown, offline, taker, gun, offline, taker, gun, offline, taker, gun, offline, taker, gun, 11..."
9,410,hyper racism,hyper racism racism racism racism racism an apple accessory manufacturer hyper hired models to s...,"[racism, racism, racism, racism, racism, accessory, manufacturer, hired, models, silently, clad,..."


In [9]:
# create and apply a part of speech filter
token_list = camp_txt['tokens'].sum()
pos_tagged = nltk.pos_tag(token_list)
pos_filter = ['JJ','NN','NNS','RB','VB', 'VBD','VBG','VBN','VBP','VBZ']
filtered_list = [word for word,pos in pos_tagged if pos in pos_filter]

camp_txt['filtered'] =''
for index, row in camp_txt.iterrows():
    row_tokens = camp_txt['tokens'][index]  
    row_filtered = list(set(row_tokens).intersection(filtered_list))
    camp_txt.set_value(index,'filtered', row_filtered)
camp_txt[["page_id","tokens","filtered"]]        

Unnamed: 0,page_id,tokens,filtered
0,400,"[discrimination, discrimination, discrimination, discrimination, blatantly, prejudiced, cable, d...","[reading, prejudice, viewpoint, channel, prejudiced, suggests, broadcasting, cable, discriminati..."
1,401,"[hbo, animal, cruelty, animal, animal, animal, animal, animal, hbo, led, horses, season, shootin...","[reading, cruelty, footage, easily, ofshocking, oversaw, institute, forluckother, luck, thronesr..."
2,402,"[strike, stitched, lips, declared, hunger, strike, motors, compensate, colombian, debilitating, ...","[reading, fairly, reintegrate, media, coworkers, compensate, akerson, pieces, pounds, agreed, ha..."
3,403,"[dreamliner, consumer, safety, consumer, safety, consumer, safety, consumer, safety, 787, dreaml...","[fires, aft, airplanes, catching, airlines, dreamliners, reported, carrier, caught, oxygen, bay,..."
4,405,"[anz, mining, bank, bank, bank, bank, anz, bank, promised, equator, principles, loan, projects, ...","[forest, fake, previous, maules, bank, upwiping, amounts, abiding, creek, environmental, promise..."
5,406,"[drowning, whales, pod, whales, drowning, frozen, sea, northern, whales, northa, killer, whales,...","[appears, frozen, turns, rapidly, residents, ecosystem, fragile, creating, ship, arctic, dorob, ..."
6,407,"[anglogold, africa, mining, africa, mining, africa, mining, africa, mining, miners, dying, easil...","[refuse, easily, scarring, amazingly, havealways, filed, compensate, yourob, stages, incorporate..."
7,408,"[mining, mining, mining, mining, mining, dispute, compensate, absolutely, horrendous, shot, guat...","[intimidating, violently, compensate, ton, notices, covered, dispute, justices, valuable, events..."
8,409,"[newtown, offline, taker, gun, offline, taker, gun, offline, taker, gun, offline, taker, gun, 11...","[delivery, hookmembers, momsrising, gun, ctrsvp, addition, capacity, joining, anthony, driving, ..."
9,410,"[racism, racism, racism, racism, racism, accessory, manufacturer, hired, models, silently, clad,...","[models, staffing, amazingly, blatant, theypainted, ces14, fully, hired, vice, whitein, usual, a..."


In [10]:
# stem the filtered tokens
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

snowball = SnowballStemmer('english')
lancaster = LancasterStemmer()
porter = PorterStemmer()

camp_txt['stemmed'] =''
for index, row in camp_txt.iterrows():
    to_stem = camp_txt['tokens'][index]
    for stemmer in (snowball, lancaster, porter):
        stemmed_text = [stemmer.stem(t) for t in to_stem]
        camp_txt.set_value(index,'stemmed', stemmed_text)
camp_txt[["page_id","filtered","stemmed"]] 

Unnamed: 0,page_id,filtered,stemmed
0,400,"[reading, prejudice, viewpoint, channel, prejudiced, suggests, broadcasting, cable, discriminati...","[discrimin, discrimin, discrimin, discrimin, blatantli, prejud, cabl, drop, cabl, owner, shake, ..."
1,401,"[reading, cruelty, footage, easily, ofshocking, oversaw, institute, forluckother, luck, thronesr...","[hbo, anim, cruelti, anim, anim, anim, anim, anim, hbo, led, hors, season, shootingluck, hbo, in..."
2,402,"[reading, fairly, reintegrate, media, coworkers, compensate, akerson, pieces, pounds, agreed, ha...","[strike, stitch, lip, declar, hunger, strike, motor, compens, colombian, debilit, injuri, motor,..."
3,403,"[fires, aft, airplanes, catching, airlines, dreamliners, reported, carrier, caught, oxygen, bay,...","[dreamlin, consum, safeti, consum, safeti, consum, safeti, consum, safeti, 787, dreamlin, catch,..."
4,405,"[forest, fake, previous, maules, bank, upwiping, amounts, abiding, creek, environmental, promise...","[anz, mine, bank, bank, bank, bank, anz, bank, promis, equat, principl, loan, project, neg, envi..."
5,406,"[appears, frozen, turns, rapidly, residents, ecosystem, fragile, creating, ship, arctic, dorob, ...","[drown, whale, pod, whale, drown, frozen, sea, northern, whale, northa, killer, whale, drowningu..."
6,407,"[refuse, easily, scarring, amazingly, havealways, filed, compensate, yourob, stages, incorporate...","[anglogold, africa, mine, africa, mine, africa, mine, africa, mine, miner, die, easili, prevent,..."
7,408,"[intimidating, violently, compensate, ton, notices, covered, dispute, justices, valuable, events...","[mine, mine, mine, mine, mine, disput, compens, absolut, horrend, shot, guatemala, ton, detail, ..."
8,409,"[delivery, hookmembers, momsrising, gun, ctrsvp, addition, capacity, joining, anthony, driving, ...","[newtown, offlin, taker, gun, offlin, taker, gun, offlin, taker, gun, offlin, taker, gun, 110000..."
9,410,"[models, staffing, amazingly, blatant, theypainted, ces14, fully, hired, vice, whitein, usual, a...","[racism, racism, racism, racism, racism, accessori, manufactur, hire, model, silent, clad, bikin..."


In [13]:
#this adds a word freq distribution to each campaign, to help me judge how well a topic fits to the text
camp_txt['freq'] =''
for index, row in camp_txt.iterrows():
    camp_freq = camp_txt['stemmed'][index]
    dist = FreqDist(camp_freq)
    camp_txt.set_value(index,'freq', dist.most_common())
camp_txt[["page_id","stemmed","freq"]]         

Unnamed: 0,page_id,stemmed,freq
0,400,"[discrimin, discrimin, discrimin, discrimin, blatantli, prejud, cabl, drop, cabl, owner, shake, ...","[(cabl, 6), (discrimin, 4), (prejud, 2), (drop, 2), (tv, 2), (middl, 1), (viewpoint, 1), (uniqu,..."
1,401,"[hbo, anim, cruelti, anim, anim, anim, anim, anim, hbo, led, hors, season, shootingluck, hbo, in...","[(anim, 13), (hbo, 11), (hors, 8), (product, 3), (cruelti, 3), (alleg, 3), (investig, 3), (cover..."
2,402,"[strike, stitch, lip, declar, hunger, strike, motor, compens, colombian, debilit, injuri, motor,...","[(injur, 6), (auto, 5), (strike, 5), (negoti, 4), (colombian, 4), (hunger, 4), (parra, 3), (moto..."
3,403,"[dreamlin, consum, safeti, consum, safeti, consum, safeti, consum, safeti, 787, dreamlin, catch,...","[(787, 6), (dreamlin, 5), (electr, 5), (consum, 4), (safeti, 4), (plane, 3), (recal, 2), (proble..."
4,405,"[anz, mine, bank, bank, bank, bank, anz, bank, promis, equat, principl, loan, project, neg, envi...","[(bank, 7), (anz, 7), (loan, 6), (coal, 5), (climat, 3), (promis, 3), (environment, 3), (princip..."
5,406,"[drown, whale, pod, whale, drown, frozen, sea, northern, whale, northa, killer, whale, drowningu...","[(whale, 8), (ice, 4), (sea, 3), (pod, 2), (drown, 2), (ship, 2), (inukjuak, 2), (drowningund, 1..."
6,407,"[anglogold, africa, mine, africa, mine, africa, mine, africa, mine, miner, die, easili, prevent,...","[(mine, 8), (gold, 7), (africa, 7), (ashanti, 6), (anglogold, 5), (protect, 4), (prevent, 3), (e..."
7,408,"[mine, mine, mine, mine, mine, disput, compens, absolut, horrend, shot, guatemala, ton, detail, ...","[(mine, 6), (compens, 2), (shoot, 2), (valuabl, 1), (horrend, 1), (forc, 1), (intimid, 1), (even..."
8,409,"[newtown, offlin, taker, gun, offlin, taker, gun, offlin, taker, gun, offlin, taker, gun, 110000...","[(gun, 4), (weapon, 4), (taker, 4), (offlin, 4), (survivor, 3), (assault, 3), (newtown, 2), (dan..."
9,410,"[racism, racism, racism, racism, racism, accessori, manufactur, hire, model, silent, clad, bikin...","[(racism, 7), (model, 4), (babe, 3), (paint, 3), (clad, 3), (accessori, 3), (electron, 3), (cons..."


In [14]:
#Get most frequent words for all campaigns.  Use this to identify and add additional stop words.
freq = camp_txt['stemmed'].sum()
fdist1 = FreqDist(freq)
fdist1.most_common(500)

[('environ', 2928),
 ('oil', 2891),
 ('food', 2509),
 ('trade', 2016),
 ('bee', 1847),
 ('gmo', 1587),
 ('tpp', 1554),
 ('water', 1419),
 ('privat', 1342),
 ('palm', 1316),
 ('tax', 1309),
 ('protect', 1291),
 ('anim', 1291),
 ('industri', 1190),
 ('climat', 1175),
 ('health', 1037),
 ('pesticid', 1009),
 ('sharehold', 972),
 ('consum', 940),
 ('bank', 909),
 ('econom', 867),
 ('wage', 809),
 ('meddl', 806),
 ('media', 783),
 ('women', 764),
 ('liberti', 717),
 ('frack', 714),
 ('pipelin', 699),
 ('ttip', 665),
 ('farmer', 657),
 ('suppli', 651),
 ('chemic', 637),
 ('environment', 619),
 ('safeti', 604),
 ('invest', 586),
 ('secret', 586),
 ('mine', 585),
 ('energi', 577),
 ('factori', 549),
 ('spill', 529),
 ('toxic', 518),
 ('clean', 515),
 ('lobbi', 508),
 ('organ', 496),
 ('custom', 494),
 ('govern', 486),
 ('neonic', 482),
 ('cancer', 476),
 ('children', 474),
 ('respons', 452),
 ('regul', 450),
 ('fuel', 440),
 ('forest', 423),
 ('threaten', 419),
 ('lgbt', 415),
 ('amazon', 413)

In [15]:
#turns the tokens series into a corpora.Dictionary object, then 
#dictionary = corpora.Dictionary(camp_list)

camp_corpus = camp_txt['stemmed'].tolist()
dictionary = corpora.Dictionary(camp_corpus)
corpus = [dictionary.doc2bow(text) for text in camp_corpus]

In [20]:
lda = models.ldamodel.LdaModel(corpus, num_topics=15, id2word = dictionary, passes=50)
all_tops = lda.print_topics(num_topics=15,num_words=2)

In [21]:
all_tops

[(0, '0.024*anim + 0.010*park'),
 (1, '0.087*bee + 0.043*pesticid'),
 (2, '0.028*oil + 0.023*pipelin'),
 (3, '0.025*sharehold + 0.022*wage'),
 (4, '0.037*climat + 0.021*environ'),
 (5, '0.052*oil + 0.051*water'),
 (6, '0.026*food + 0.023*factori'),
 (7, '0.038*privat + 0.022*nh'),
 (8, '0.055*food + 0.047*gmo'),
 (9, '0.026*internet + 0.019*amazon'),
 (10, '0.020*protect + 0.016*consum'),
 (11, '0.062*trade + 0.044*tpp'),
 (12, '0.031*bank + 0.021*mine'),
 (13, '0.058*tax + 0.033*frack'),
 (14, '0.014*women + 0.012*lgbt')]

In [None]:
#NOW ASSIGN THE TOPICS TO THE CAMPAIGNS

In [24]:
camp_txt['topics'] = " "
for index, row in camp_txt.iterrows():
    t = camp_txt.iloc[index]['stemmed']
    doc_bow = dictionary.doc2bow(t)
    camp_tops = lda.get_document_topics(doc_bow,0.35)
    camp_txt.set_value(index,'topics', camp_tops)

In [25]:
camp_txt['topic_name'] = ""
for index, row in camp_txt.iterrows():
    topics_per_row = camp_txt['topics'][index]
    camp_topic_num = [item[0] for item in topics_per_row]
    temp_list = []
    for num in camp_topic_num:
        for i in all_tops:
            if num == i[0]:
                temp_list.append(i)
                #print(index, num, temp_list)
    camp_txt.set_value(index,'topic_name', temp_list) 
camp_txt[["page_name","freq","topics","topic_name"]][800:900]

Unnamed: 0,page_name,freq,topics,topic_name
800,ABC budget cuts,"[(abc, 15), (cut, 5), (media, 4), (account, 4), (privat, 4), (meddl, 4), (broken, 3), (australia...","[(11, 0.465943782753)]","[(11, 0.062*trade + 0.044*tpp)]"
801,GP fee,"[(tax, 7), (ill, 5), (poor, 5), (healthcar, 5), (visit, 4), (econom, 4), (doctor, 4), (chronic, ...","[(7, 0.388698274945)]","[(7, 0.038*privat + 0.022*nh)]"
802,business handouts,"[(australian, 5), (ordinari, 5), (budget, 5), (econom, 4), (handout, 4), (term, 2), (tough, 2), ...",[],[]
803,corporate welfare,"[(australian, 6), (tax, 5), (econom, 4), (slash, 4), (cut, 4), (ordinari, 4), (ill, 3), (handout...","[(13, 0.439563962347)]","[(13, 0.058*tax + 0.033*frack)]"
804,hockey budget party,"[(budget, 9), (hockey, 9), (joe, 7), (econom, 4), (secret, 4), (cut, 4), (spend, 4), (donor, 4),...",[],[]
805,clean energy,"[(energi, 10), (clean, 7), (arena, 4), (environ, 4), (axe, 3), (climat, 3), (renew, 3), (abolish...","[(4, 0.399934085608), (10, 0.381875081568)]","[(4, 0.037*climat + 0.021*environ), (10, 0.020*protect + 0.016*consum)]"
806,apple prizes corporate excess not factory conditions,"[(factori, 10), (ahrendt, 3), (kunpeng, 3), (poison, 3), (safeti, 3), (excess, 3), (workplac, 3)...","[(6, 0.64826063057)]","[(6, 0.026*food + 0.023*factori)]"
807,cancer causing chemicals on apples,"[(appl, 10), (fda, 7), (chemic, 7), (regul, 5), (food, 5), (consum, 4), (gmo, 4), (protect, 4), ...","[(8, 0.447080463007), (10, 0.442116315892)]","[(8, 0.055*food + 0.047*gmo), (10, 0.020*protect + 0.016*consum)]"
808,monsanto sues vermonts,"[(gmo, 13), (label, 9), (food, 9), (requir, 3), (term, 2), (eat, 2), (sharehold, 2), (start, 2),...","[(8, 0.880868767813)]","[(8, 0.055*food + 0.047*gmo)]"
809,stand with pizza express,"[(anim, 8), (pizza, 7), (muslim, 6), (express, 6), (halal, 4), (racism, 4), (outrag, 4), (fear, ...","[(0, 0.497366492554)]","[(0, 0.024*anim + 0.010*park)]"
