# Textual analysis of activist campaign information

### MySQL to extract the page name, manually selected tags, and html content of the top mailing for each page

```
SET @rank_pages:=0; SET @rank:=0;
select ranked.page_id, ranked.page_name, tag.name as tag_name, replace(replace(mail.html,',',''),'"','') as html
from
  (select page_id, page_name, mailing_id, count, 
          IF(@rank_pages=page_id,@rank:=@rank+1,@rank:=1) as rank, @rank_pages:=page_id
  from
	(select p.id as page_id, p.name as page_name, a.mailing_id, count(*) as count
	from core_action as a
	join core_page as p on a.page_id = p.id
	where a.mailing_id is not null
	and p.id not in (5,25,28,46,130,525,561,566,761,935,1304,1394,1862,2678,3712,8559,10668)
	and left(p.name,12) <> "controlshift"
	and p.created_at >= "2013-01-01"
	and p.lang_id = 100
	group by p.id, a.mailing_id
	order by p.id, count(*) desc
	) as unranked
  ) as ranked
join core_mailing as mail on mail.id = ranked.mailing_id
join core_page_tags as cpt on cpt.page_id = ranked.page_id
join core_tag as tag on tag.id = cpt.tag_id and tag.id IN (2,8,10,11,13,15,22,23,24,25,29,30,32,33,34,35,36,39,41,43,45,47,48,49,54,59,60,64,67,72,73,75,80,81,82,84,88,89,91,92,93,94,95,96,98,101,104,105,106,107,109,112,114,115,116,117,120,122,123,125,127,130,133,139,141,142,146,148,151,157,160,161,175,177,178,181,183,185,190,193,201,202,206,207,211,213,222,224,226,227,231,234,239,240,242,243,244,246,248,254,258,260,261,265,267,270,273,280,287,288,289,291,297,303,315,316,322,323,325,327,328,334,345,346,347,348,369,383,389,393,394,402,407,410,412,415,443,445,451,452,463,467,468,471,480,481,485,486,488,489,493,508,518,521,549,550,551,564,567,572,573,574,581,583,587,619,621,624,634,641,659,696,804,820,826,898,900,933,934,937,938,940,941,942,943,944,945,946,947,954,966,967,968,969,972,973,974,975,976,977,1000,1012,1036,1046,1071,1078,1128,1130,1132,1140,1208,1248,1282,1739,1746) 
where rank = 1
order by 1,4
```

### import modules

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import *
from nltk.book import *
from gensim import corpora, models

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


### read the csv into a DataFrame

In [2]:
camp_txt = pd.read_csv('../capstone/text_fields.csv')
pd.options.display.max_colwidth = 500
camp_txt.head(4)

Unnamed: 0,page_id,page_name,tag_name,html
0,400,time-warner-al-jazeera,us corporation,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>In a blatantly prejudiced move Time Warner Cable dropped CurrentTV the moment it was sold to Al Jazeera.</p>\r\n<p><strong>Tell Time Warner Cable to pick CurrentTV back up and give its new owners a fair shake.<br /><...
1,400,time-warner-al-jazeera,discrimination,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>In a blatantly prejudiced move Time Warner Cable dropped CurrentTV the moment it was sold to Al Jazeera.</p>\r\n<p><strong>Tell Time Warner Cable to pick CurrentTV back up and give its new owners a fair shake.<br /><...
2,401,hbo-animal-cruelty,us corporation,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>A new lawsuit claims that animal abuse by HBO led to the death of four horses in one season of shooting <em>Luck</em>.</p>\r\n<p><strong>Tell&nbsp; HBO to investigate claims of animal abuse and enact measures to prev...
3,401,hbo-animal-cruelty,animal abuse,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>A new lawsuit claims that animal abuse by HBO led to the death of four horses in one season of shooting <em>Luck</em>.</p>\r\n<p><strong>Tell&nbsp; HBO to investigate claims of animal abuse and enact measures to prev...


### flatten tags for each campaign into a list, then turn the list into a string; each campaign is now a single row

In [3]:
camp_txt = pd.DataFrame(camp_txt.groupby(by=('page_id','page_name','html'))['tag_name'].apply(list)).reset_index()
camp_txt['tag_name'] = pd.DataFrame(camp_txt['tag_name'].apply(', '.join))
camp_txt = camp_txt[['page_id','page_name','tag_name','html']]
camp_txt.head(4)

Unnamed: 0,page_id,page_name,tag_name,html
0,400,time-warner-al-jazeera,"us corporation, discrimination",<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>In a blatantly prejudiced move Time Warner Cable dropped CurrentTV the moment it was sold to Al Jazeera.</p>\r\n<p><strong>Tell Time Warner Cable to pick CurrentTV back up and give its new owners a fair shake.<br /><...
1,401,hbo-animal-cruelty,"us corporation, animal abuse",<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>A new lawsuit claims that animal abuse by HBO led to the death of four horses in one season of shooting <em>Luck</em>.</p>\r\n<p><strong>Tell&nbsp; HBO to investigate claims of animal abuse and enact measures to prev...
2,402,gm-strike,"us corporation, working conditions, workers",<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<p>This man has stitched his lips together and declared a hunger strike demanding that General Motors compensate its Colombian employees for debilitating life-long injuries.</p>\r\n<p><a href=http://action.sumofus.org/a...
3,403,boeing-dreamliner-fire,"us corporation, consumer safety",<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 cellpadding=0 bgcolor=#ffffff>\r\n<tbody>\r\n<tr>\r\n<td style=padding: 10px;>\r\n<div>Boeing's new 787 Dreamliners keep catching on fire. Something is clearly wrong with the electrical system.</div>\r\n<div>&nbsp;</div>\r\n<div><strong>Tell Boeing to recall the 787s immediately.</strong></div>\r\n<p...


### use BeautifulSoup to clean up the html

In [4]:
pd.options.display.max_colwidth = 300
dirty = camp_txt['html']
clean = pd.Series()
for index, item in dirty.iteritems():    
    soup = BeautifulSoup(item, "lxml")
    scrubbed = (soup.get_text(strip=True))
    clean.loc[index] = scrubbed 
camp_txt['html'] = clean
camp_txt.head(4)

Unnamed: 0,page_id,page_name,tag_name,html
0,400,time-warner-al-jazeera,"us corporation, discrimination",In a blatantly prejudiced move Time Warner Cable dropped CurrentTV the moment it was sold to Al Jazeera.Tell Time Warner Cable to pick CurrentTV back up and give its new owners a fair shake.{{ user.first_name|capfirst|default:Friend }}On Wednesday Current TV announced that it had been sold to Al...
1,401,hbo-animal-cruelty,"us corporation, animal abuse",A new lawsuit claims that animal abuse by HBO led to the death of four horses in one season of shootingLuck.Tell HBO to investigate claims of animal abuse and enact measures to prevent animal cruelty in the future.{{ user.first_name|capfirst|default:Friend }}Information has come to light ofshoc...
2,402,gm-strike,"us corporation, working conditions, workers",This man has stitched his lips together and declared a hunger strike demanding that General Motors compensate its Colombian employees for debilitating life-long injuries.Tell General Motors to meet with its injured workers and negotiate.{{ user.first_name|capfirst|default:Friend }}Jorge Parra st...
3,403,boeing-dreamliner-fire,"us corporation, consumer safety",Boeing's new 787 Dreamliners keep catching on fire. Something is clearly wrong with the electrical system.Tell Boeing to recall the 787s immediately.{{ user.first_name|capfirst|default:Friend }}From the startthe Boeing 787 Dreamliner has been plagued with problemsbut now a clear pattern is emerg...


### a bit more cleaning

In [5]:
camp_txt['page_name'] = camp_txt['page_name'].str.replace('[^\w\s]',' ') #replaces all punctuation in page_name with spaces
camp_txt['tag_name'] = camp_txt['tag_name'].str.replace('[^\w\s]',' ') #replaces all punctuation in tag_name with spaces
camp_txt['tag_name'] = camp_txt['tag_name'].str.replace('[_]',' ') #replaces underscores in tag_name with spaces
camp_txt['html'] = camp_txt['html'].str.replace('[^\w\s]',' ') #replaces all punctuation in page_name with spaces
camp_txt['html'] = camp_txt['html'].str.replace("{(.+)}", ' ') #removes django tags from html

# this is my very unsophisticated way of weighing the tag names x4 to the rest of the words
tag_weight = camp_txt['tag_name']+' '+camp_txt['tag_name']+' '+camp_txt['tag_name']+' '+camp_txt['tag_name']

# put it together and lowercase everything
camp_txt['corpus'] = (camp_txt['page_name']+' '+tag_weight+' '+camp_txt['html']).str.lower()
camp_txt.head(5)

Unnamed: 0,page_id,page_name,tag_name,html,corpus
0,400,time warner al jazeera,us corporation discrimination,In a blatantly prejudiced move Time Warner Cable dropped CurrentTV the moment it was sold to Al Jazeera Tell Time Warner Cable to pick CurrentTV back up and give its new owners a fair shake user first_name capfirst default Friend On Wednesday Current TV announced that it had been sold to Al...,time warner al jazeera us corporation discrimination us corporation discrimination us corporation discrimination us corporation discrimination in a blatantly prejudiced move time warner cable dropped currenttv the moment it was sold to al jazeera tell time warner cable to pick currenttv back...
1,401,hbo animal cruelty,us corporation animal abuse,A new lawsuit claims that animal abuse by HBO led to the death of four horses in one season of shootingLuck Tell HBO to investigate claims of animal abuse and enact measures to prevent animal cruelty in the future user first_name capfirst default Friend Information has come to light ofshoc...,hbo animal cruelty us corporation animal abuse us corporation animal abuse us corporation animal abuse us corporation animal abuse a new lawsuit claims that animal abuse by hbo led to the death of four horses in one season of shootingluck tell hbo to investigate claims of animal abuse and e...
2,402,gm strike,us corporation working conditions workers,This man has stitched his lips together and declared a hunger strike demanding that General Motors compensate its Colombian employees for debilitating life long injuries Tell General Motors to meet with its injured workers and negotiate user first_name capfirst default Friend Jorge Parra st...,gm strike us corporation working conditions workers us corporation working conditions workers us corporation working conditions workers us corporation working conditions workers this man has stitched his lips together and declared a hunger strike demanding that general motors compensate ...
3,403,boeing dreamliner fire,us corporation consumer safety,Boeing s new 787 Dreamliners keep catching on fire Something is clearly wrong with the electrical system Tell Boeing to recall the 787s immediately user first_name capfirst default Friend From the startthe Boeing 787 Dreamliner has been plagued with problemsbut now a clear pattern is emerg...,boeing dreamliner fire us corporation consumer safety us corporation consumer safety us corporation consumer safety us corporation consumer safety boeing s new 787 dreamliners keep catching on fire something is clearly wrong with the electrical system tell boeing to recall the 787s immediat...
4,405,anz mining,bank Australian corporation,ANZ Bank promised when signing the Equator Principles to not loan money to projects that have a negative impact on the environment But it gave a 1 2B loan to a massive new coal mine in NSW Tell ANZ to overturn the loan and keep its promises to protect the environment user first_name capfirs...,anz mining bank australian corporation bank australian corporation bank australian corporation bank australian corporation anz bank promised when signing the equator principles to not loan money to projects that have a negative impact on the environment but it gave a 1 2b loan to a massive...


In [None]:
#tokenize and remove stopwords
pd.options.display.max_colwidth = 100
stop_words = set(stopwords.words('english'))
camp_txt['tokens'] = camp_txt.apply(lambda row: nltk.word_tokenize(row['corpus']), axis=1)
camp_txt['tokens'] = camp_txt['tokens'].apply(lambda x: [item for item in x if item not in stop_words])
camp_txt[['page_id','page_name','corpus','tokens']].head(10)

In [None]:
# create and apply a part of speech filter
token_list = camp_txt['tokens'].sum()
pos_tagged = nltk.pos_tag(token_list)
pos_filter = ['JJ','NN','NNS','RB','VB', 'VBD','VBG','VBN','VBP','VBZ']
filtered_list = [word for word,pos in pos_tagged if pos in pos_filter]

camp_txt['filtered'] =''
for index, row in camp_txt.iterrows():
    row_tokens = camp_txt['tokens'][index]  
    row_filtered = list(set(row_tokens).intersection(filtered_list))
    camp_txt.set_value(index,'filtered', row_filtered)
camp_txt[["page_id","tokens","filtered"]].head(10)        

In [None]:
# stem the filtered tokens
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

snowball = SnowballStemmer('english')
lancaster = LancasterStemmer()
porter = PorterStemmer()

camp_txt['stemmed'] =''
for index, row in camp_txt.iterrows():
    to_stem = camp_txt['tokens'][index]
    for stemmer in (snowball, lancaster, porter):
        stemmed_text = [stemmer.stem(t) for t in to_stem]
        camp_txt.set_value(index,'stemmed', stemmed_text)
camp_txt[["page_id","filtered","stemmed"]].head(10) 

Unnamed: 0,page_id,filtered,stemmed
0,400,"[cable, unnoticed, evening, prejudice, announcement, reverse, middle, owners, hailed, atlantic, ...","[discrimin, discrimin, discrimin, discrimin, blatantli, prejud, cabl, drop, cabl, owner, shake, ..."
1,401,"[cover, hollywood, drugged, allegations, occurs, programs, involve, cruelty, luckwas, stemming, ...","[anim, cruelti, anim, anim, anim, anim, anim, led, hors, season, shootingluck, investig, anim, e..."
2,402,"[hunger, detroit, protects, jobs, confront, pieces, compensation, walking, consider, hands, work...","[strike, stitch, lip, declar, hunger, strike, motor, compens, colombian, debilit, injuri, motor,..."
3,403,"[earlier, pattern, boston, safety, fuel, defects, air, delivered, severity, problemsbut, recall,...","[dreamlin, consum, safeti, consum, safeti, consum, safeti, consum, safeti, 787, dreamlin, catch,..."
4,405,"[vulnerable, leard, mention, emissions, negative, forest, hectares, environment, agricultural, o...","[anz, mine, bank, bank, bank, bank, anz, bank, promis, equat, principl, loan, project, neg, envi..."
5,406,"[ecosystem, earlier, pod, whales, rapidly, air, magnificent, fragile, sheet, increased, appears,...","[drown, whale, pod, whale, drown, frozen, sea, northern, whale, northa, killer, whale, drowningu..."
6,407,"[migrant, joined, firms, class, dying, safety, ashanti, havealways, permanent, filed, incorporat...","[anglogold, africa, mine, africa, mine, africa, mine, africa, mine, miner, die, easili, prevent,..."
7,408,"[horrendous, forces, guatemala, shooting, series, justices, mining, intimidating, shot, things, ...","[mine, mine, mine, mine, mine, disput, compens, absolut, horrend, shot, guatemala, ton, detail, ..."
8,409,"[cover, friendly, weapons, eventbut, driving, dealer, ourfacebook, forces, assault, simon, event...","[newtown, offlin, taker, gun, offlin, taker, gun, offlin, taker, gun, offlin, taker, gun, 110000..."
9,410,"[whitein, senior, traffic, unbelievably, hired, theypainted, racist, woman, critics, babes, fit,...","[racism, racism, racism, racism, racism, accessori, manufactur, hire, model, silent, clad, bikin..."


In [26]:
#this adds a word freq distribution to each campaign, to help me judge how well a topic fits to the text
camp_txt['freq'] =''
for index, row in camp_txt.iterrows():
    camp_freq = camp_txt['stemmed'][index]
    dist = FreqDist(camp_freq)
    camp_txt.set_value(index,'freq', dist.most_common())
camp_txt[["page_id","stemmed","freq"]].head(10)         

Unnamed: 0,page_id,stemmed,freq
0,400,"[discrimin, discrimin, discrimin, discrimin, blatantli, prejud, cabl, drop, cabl, owner, shake, ...","[(cabl, 6), (discrimin, 4), (drop, 2), (prejud, 2), (tv, 2), (suggest, 1), (announc, 1), (wire, ..."
1,401,"[anim, cruelti, anim, anim, anim, anim, anim, led, hors, season, shootingluck, investig, anim, e...","[(anim, 13), (hors, 8), (cruelti, 3), (investig, 3), (alleg, 3), (product, 3), (cover, 2), (meas..."
2,402,"[strike, stitch, lip, declar, hunger, strike, motor, compens, colombian, debilit, injuri, motor,...","[(injur, 6), (auto, 5), (strike, 5), (hunger, 4), (negoti, 4), (colombian, 4), (parra, 3), (inju..."
3,403,"[dreamlin, consum, safeti, consum, safeti, consum, safeti, consum, safeti, 787, dreamlin, catch,...","[(787, 6), (dreamlin, 5), (electr, 5), (safeti, 4), (consum, 4), (plane, 3), (earlier, 2), (clea..."
4,405,"[anz, mine, bank, bank, bank, bank, anz, bank, promis, equat, principl, loan, project, neg, envi...","[(anz, 7), (bank, 7), (loan, 6), (coal, 5), (promis, 3), (environment, 3), (environ, 3), (climat..."
5,406,"[drown, whale, pod, whale, drown, frozen, sea, northern, whale, northa, killer, whale, drowningu...","[(whale, 8), (ice, 4), (sea, 3), (pod, 2), (inukjuak, 2), (ship, 2), (drown, 2), (ecosystem, 1),..."
6,407,"[anglogold, africa, mine, africa, mine, africa, mine, africa, mine, miner, die, easili, prevent,...","[(mine, 8), (africa, 7), (gold, 7), (ashanti, 6), (anglogold, 5), (protect, 4), (prevent, 3), (m..."
7,408,"[mine, mine, mine, mine, mine, disput, compens, absolut, horrend, shot, guatemala, ton, detail, ...","[(mine, 6), (compens, 2), (shoot, 2), (cover, 1), (thing, 1), (coverag, 1), (horrend, 1), (event..."
8,409,"[newtown, offlin, taker, gun, offlin, taker, gun, offlin, taker, gun, offlin, taker, gun, 110000...","[(gun, 4), (taker, 4), (offlin, 4), (weapon, 4), (assault, 3), (survivor, 3), (join, 2), (sandi,..."
9,410,"[racism, racism, racism, racism, racism, accessori, manufactur, hire, model, silent, clad, bikin...","[(racism, 7), (model, 4), (babe, 3), (electron, 3), (accessori, 3), (clad, 3), (consum, 3), (pai..."


In [25]:
#Get most frequent words for all campaigns.  Use this to identify and add additional stop words.
freq = camp_txt['stemmed'].sum()
fdist1 = FreqDist(freq)
fdist1.most_common(50)

[('environ', 2928),
 ('oil', 2891),
 ('food', 2509),
 ('trade', 2016),
 ('bee', 1847),
 ('gmo', 1587),
 ('tpp', 1554),
 ('water', 1419),
 ('privat', 1342),
 ('palm', 1316),
 ('tax', 1309),
 ('anim', 1291),
 ('protect', 1291),
 ('industri', 1190),
 ('climat', 1175),
 ('health', 1037),
 ('pesticid', 1009),
 ('sharehold', 972),
 ('consum', 940),
 ('bank', 909),
 ('econom', 867),
 ('wage', 809),
 ('meddl', 806),
 ('media', 783),
 ('women', 764),
 ('liberti', 717),
 ('frack', 714),
 ('pipelin', 699),
 ('ttip', 665),
 ('farmer', 657),
 ('suppli', 651),
 ('chemic', 637),
 ('environment', 619),
 ('safeti', 604),
 ('secret', 586),
 ('invest', 586),
 ('mine', 585),
 ('energi', 577),
 ('factori', 549),
 ('spill', 529),
 ('toxic', 518),
 ('clean', 515),
 ('lobbi', 508),
 ('organ', 496),
 ('custom', 494),
 ('govern', 486),
 ('neonic', 482),
 ('cancer', 476),
 ('children', 474),
 ('respons', 452)]

In [11]:
#turns the tokens series into a corpora.Dictionary object, then 
#dictionary = corpora.Dictionary(camp_list)

camp_corpus = camp_txt['stemmed'].tolist()
dictionary = corpora.Dictionary(camp_corpus)
corpus = [dictionary.doc2bow(text) for text in camp_corpus]

In [12]:
lda = models.ldamodel.LdaModel(corpus, num_topics=15, id2word = dictionary, passes=50)
all_tops = lda.print_topics(num_topics=15,num_words=2)

In [13]:
all_tops

[(0, '0.043*privat + 0.028*health'),
 (1, '0.040*anim + 0.010*liberti'),
 (2, '0.040*food + 0.038*gmo'),
 (3, '0.025*lgbt + 0.013*gun'),
 (4, '0.019*women + 0.019*sharehold'),
 (5, '0.073*bank + 0.017*econom'),
 (6, '0.007*turnbul + 0.007*samsung'),
 (7, '0.042*trade + 0.041*ttip'),
 (8, '0.023*emiss + 0.022*suppli'),
 (9, '0.039*oil + 0.038*environ'),
 (10, '0.072*water + 0.027*environ'),
 (11, '0.012*miner + 0.010*aid'),
 (12, '0.024*wage + 0.017*factori'),
 (13, '0.043*bee + 0.036*food'),
 (14, '0.049*tax + 0.042*tpp')]

In [14]:
#NOW ASSIGN THE TOPICS TO THE CAMPAIGNS

In [15]:
camp_txt['topics'] = " "
for index, row in camp_txt.iterrows():
    t = camp_txt.iloc[index]['stemmed']
    doc_bow = dictionary.doc2bow(t)
    camp_tops = lda.get_document_topics(doc_bow,0.35)
    camp_txt.set_value(index,'topics', camp_tops)

In [24]:
camp_txt['topic_name'] = ""
for index, row in camp_txt.iterrows():
    topics_per_row = camp_txt['topics'][index]
    camp_topic_num = [item[0] for item in topics_per_row]
    temp_list = []
    for num in camp_topic_num:
        for i in all_tops:
            if num == i[0]:
                temp_list.append(i)
                #print(index, num, temp_list)
    camp_txt.set_value(index,'topic_name', temp_list) 
camp_txt[["page_name","freq","topics","topic_name"]][80:90]

Unnamed: 0,page_name,freq,topics,topic_name
80,daily mail newspaper ad,"[(luci, 10), (dacr, 5), (hate, 5), (meadow, 5), (stori, 5), (media, 5), (billboard, 4), (paper, ...","[(4, 0.3543024273)]","[(4, 0.019*women + 0.019*sharehold)]"
81,suncor spill,"[(spill, 15), (river, 6), (answer, 5), (athabasca, 5), (2011, 4), (question, 4), (environ, 4), (...","[(9, 0.613851754205)]","[(9, 0.039*oil + 0.038*environ)]"
82,fcc lobbyist,"[(fcc, 7), (industri, 6), (lobbyist, 4), (chair, 3), (regul, 3), (broadband, 2), (appoint, 2), (...","[(12, 0.528929183325)]","[(12, 0.024*wage + 0.017*factori)]"
83,mondelez unions,"[(union, 9), (recogn, 6), (egyptian, 4), (trident, 3), (order, 3), (refus, 3), (cadburi, 3), (or...","[(5, 0.531977660978)]","[(5, 0.073*bank + 0.017*econom)]"
84,suncor spill post,"[(spill, 15), (river, 6), (answer, 5), (athabasca, 5), (2011, 4), (question, 4), (environ, 4), (...","[(9, 0.61385337986)]","[(9, 0.039*oil + 0.038*environ)]"
85,turbotax,"[(tax, 19), (turbotax, 11), (file, 7), (lobbi, 4), (consum, 4), (protect, 4), (return, 3), (lobb...","[(3, 0.365748793258), (14, 0.425212082347)]","[(3, 0.025*lgbt + 0.013*gun), (14, 0.049*tax + 0.042*tpp)]"
86,united climate lobbying,"[(airlin, 14), (climat, 12), (lobbi, 9), (pollut, 5), (carbon, 4), (swap, 4), (regul, 4), (emiss...","[(9, 0.359494118896)]","[(9, 0.039*oil + 0.038*environ)]"
87,exxon spill,"[(spill, 12), (oil, 11), (exxonmobil, 10), (pipelin, 10), (crude, 9), (arkansa, 8), (toxic, 5), ...","[(9, 0.741440277073)]","[(9, 0.039*oil + 0.038*environ)]"
88,bangladesh arrival,"[(bangladesh, 9), (factori, 9), (tazreen, 6), (sumi, 5), (garment, 5), (bangladeshi, 4), (kalpon...","[(12, 0.761358762184)]","[(12, 0.024*wage + 0.017*factori)]"
89,rapid response,"[(factori, 9), (bangladesh, 8), (tazreen, 6), (sumi, 5), (garment, 5), (bangladeshi, 4), (kalpon...","[(12, 0.760602498973)]","[(12, 0.024*wage + 0.017*factori)]"


In [22]:
# exploratory data viz
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
all_tops

[(0, '0.043*privat + 0.028*health'),
 (1, '0.040*anim + 0.010*liberti'),
 (2, '0.040*food + 0.038*gmo'),
 (3, '0.025*lgbt + 0.013*gun'),
 (4, '0.019*women + 0.019*sharehold'),
 (5, '0.073*bank + 0.017*econom'),
 (6, '0.007*turnbul + 0.007*samsung'),
 (7, '0.042*trade + 0.041*ttip'),
 (8, '0.023*emiss + 0.022*suppli'),
 (9, '0.039*oil + 0.038*environ'),
 (10, '0.072*water + 0.027*environ'),
 (11, '0.012*miner + 0.010*aid'),
 (12, '0.024*wage + 0.017*factori'),
 (13, '0.043*bee + 0.036*food'),
 (14, '0.049*tax + 0.042*tpp')]

In [23]:
pyLDAvis.display(vis)