In [1]:
#import modules
import pandas as pd
import numpy as np

####  Add something about how the campaigns were picked here

In [2]:
#read in the CSV file generated by SQL
camp_txt = pd.read_csv('../capstone/text_fields.csv')
camp_txt.head(4)

Unnamed: 0,page_id,page_name,tag_name,html
0,400,time-warner-al-jazeera,us corporation,<table cellspacing=0 cellpadding=0 align=right...
1,400,time-warner-al-jazeera,discrimination,<table cellspacing=0 cellpadding=0 align=right...
2,401,hbo-animal-cruelty,us corporation,<table cellspacing=0 cellpadding=0 align=right...
3,401,hbo-animal-cruelty,animal abuse,<table cellspacing=0 cellpadding=0 align=right...


#### Notice that the structure above has one row for each tag. Campaigns with more than one tag are duplicated. In order to analyze the text, we want each campaign on one and only one row. This means flattening those tags.

In [87]:
flat = pd.DataFrame(camp_txt)

def flatten_frame(df,col):  #df= DataFrame, col= the column to be flattened; in our case, 'tag_name'
    headers = list(df.columns.values) #pull in the list of columns
    group_cols = list(set(headers) - set([col])) #get the cols to group on by subtracting the one we are flattening
    df = pd.DataFrame(df.groupby(by=(group_cols))[col].apply(list)).reset_index() #group and reset index
    df[col] = df[col].apply(', '.join) #convert the flattened col of tags into a string
    return df

flat = flatten_frame(flat,'tag_name')
flat.head(4)

Unnamed: 0,page_name,html,page_id,tag_name
0,-NZ-Labour-TPPA,<table style=width: 240px;...,10354,Trans-Pacific Partnership ...
1,13-first-nations-land-defe...,<table cellspacing=0 cellp...,888,"us corporation, canada, la..."
2,136-un-members-recognise-p...,<div style=width: 320px; f...,15964,#Human_Rights_and_Civil_Li...
3,2013-achievements,<table cellspacing=0 cellp...,961,"donation, recurring"


#### Now each campaign is on a single row, but we have to extract plain text from that ugly HTML/Django.  Luckily, a module called 'Beautiful Soup' will come to our rescue. While I am it, I'll take out all the punctuation from both the html and tag_name fields; we will need that done before the next stage of the analysis.


In [91]:
from bs4 import BeautifulSoup
import re
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

def clean_soup(df,old_col,new_col): # df=DataFrame, col=the column with the dirty HTML we want to clean
    for index, item in df[old_col].iteritems(): #go row by row through the column
        soup = BeautifulSoup(item, "lxml") #turn the current item into a BeautfulSoup object
        washed = soup.get_text(" ",strip=True) #get text from the soup object and store the text in the washed variable
        df.set_value(index,new_col,washed) #update the clean data frame with the washed text
    df[new_col] = df[new_col].str.replace('{(.+)}', ' ') #remove the django tags
    return df

def remove_punc(df,old_col,new_col):
    df[new_col] = df[old_col].str.replace('[^\w\s]',' ') #replaces most punctuation with spaces
    df[new_col] = df[new_col].str.replace('[_]',' ') #replaces underscores with spaces    
    return df

clean = pd.DataFrame(flat) #copy the results from the last step into a new frame

clean['text_clean'] ='' #a new column for storing our squeaky-clean text
clean['tags_clean'] = '' #a new column for storing our squeaky-clean tags
 
clean = clean_soup(clean,'html','text_clean')
clean = remove_punc(clean,'text_clean','text_clean')  
clean = remove_punc(clean,'tag_name','tags_clean')

pd.options.display.max_colwidth = 45
clean[['page_id','html','text_clean','tag_name','tags_clean']].head(4)

Unnamed: 0,page_id,html,text_clean,tag_name,tags_clean
0,10354,<table style=width: 240px; margin-left: 1...,New Zealand s Labour Party laid out clear...,"Trans-Pacific Partnership (TPP), trade ag...",Trans Pacific Partnership TPP trade ag...
1,888,<table cellspacing=0 cellpadding=0 align=...,Breaking news SWN Resources is suing 13 ...,"us corporation, canada, lawsuit, fracking",us corporation canada lawsuit fracking
2,15964,<div style=width: 320px; float: right;>\r...,Google is under fire for leaving Palestin...,#Human_Rights_and_Civil_Liberties,Human Rights and Civil Liberties
3,961,<table cellspacing=0 cellpadding=0 align=...,My daughter Taren set up SumOfUs nearly t...,"donation, recurring",donation recurring


#### At this point, we want to start whittling down this text to the words that will be most helpful for our analysis.  The first step is to remove all words that won't have value as predictors of the topic of the campaign.  Some of these are easy judgements- we know that we don't need extremely common words like conjunctions or pronouns.  But what about other types of words?  Common Nouns? Proper Nouns? Verbs? Adjectives?  Here we get to some value judgements that the computer can't make by itself, but I can write some helper functions that will give me the insight I need to make the decision.  This is where the Natural Language Toolkit (NLTK) comes in.

In [5]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import *
#DAN: I would rather not import all these books, but I do need soething from the book module
#Can't find a doc that explains all I am importing so I can be more selective!
from nltk.book import *

def filter_by_pos(df,old_col,new_col,pos_codes,num):
    df[old_col] = df[old_col].str.lower()
    df[new_col] = df.apply(lambda row: nltk.word_tokenize(row[old_col]), axis=1) #tokenize (pre-process) each word    
    pos_tagged = nltk.pos_tag(df[new_col].sum())  #tag each word with a code indicating part of speech
    pos_filtered = [word for word,pos in pos_tagged if pos in pos_codes]  #select all words matching my filter codes
    pos_freq = FreqDist(pos_filtered).most_common(num) #get frequency of filtered words; return the top num
    ##DAN: there must be some cool list comprehension way of doing this, but I couldn't figure it out (and I tried!!)
    for index, row in df.iterrows():  #go through each campaign and remove out all words not in my filer
        camp_filtered = [word for word in df[new_col][index] if word in pos_filtered]
        df.set_value(index, new_col, camp_filtered)
    return df, pos_freq

filtered = pd.DataFrame(clean[['page_id','page_name','text_clean','tags_clean']]) #copy results of last step into a new frame
                                                                                #dropped the old dirty columns!
filtered['text_filtered'] = ''  #new column to store the text filtered by part of speech

    
pos_codes = ['JJ','NN','NNS']  #These codes indicate adjectives, common nouns, and plural common nouns.
                                #I selected these codes after trial and error, trying different filters to see what gave me
                                #the most important words with the least amount of semantic noise

filtered, filtered_freq = filter_by_pos(filtered,'text_clean','text_filtered',pos_codes,10)

filtered_freq  #this sorted frequency distribution helped me choose the pos codes I wanted to keep
               #I'm only showing 10 here, but I reviewed many more while making the decision

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


[('s', 7690),
 ('sumofus', 2716),
 ('t', 2636),
 ('workers', 2342),
 ('information', 2204),
 ('thanks', 2200),
 ('people', 2084),
 ('world', 2018),
 ('oil', 2000),
 ('government', 1884)]

#### Here is what those words look like within each campaigns:

In [95]:
pd.options.display.max_colwidth =90
filtered[['page_id','text_clean','text_filtered']].head(4)

Unnamed: 0,page_id,text_clean,text_filtered
0,10354,new zealand s labour party laid out clear requirements to make sure the tppa deal actu...,"[new, zealand, s, labour, party, out, clear, requirements, make, sure, tppa, deal, new..."
1,888,breaking news swn resources is suing 13 mi kmaq warriors for defending their communit...,"[breaking, news, swn, resources, mi, kmaq, warriors, community, dangerous, practice, f..."
2,15964,google is under fire for leaving palestine off of google maps tell google to recogniz...,"[google, fire, palestine, off, google, maps, tell, google, recognize, palestinian, sel..."
3,961,my daughter taren set up sumofus nearly two years ago i m enourmously proud of what w...,"[daughter, taren, set, up, sumofus, years, i, m, proud, ve, together, sumofus, shoestr..."


#### That is the strongest list of words I could generate with a part of speech filter, but it still isn't great.  It has some random noise (like 's' and 't'),  generic words that are common in any text (like 'other' and 'last), and words that are very common in our specific campaigns, (like 'sumofus', 'petition' and 'profits').  But we want to be sure to keep all the words that could be predictive for us ('oil', 'water, 'palm', climate', 'food').  We can do that by applying a list of specific words, called stopwords, that we want to exclude from our analysis.

#### However, I first need to grab the words from the 'tag_name' column and add them to the mailing text.  I couldn't do that until after I ran the part of speech filter, because NLTK needs to have sentences in context in order to  properly mark the part of speech; adding short phrases would have created problems.  But from here on in, I will be dealing with collections of words, where the order doesn't matter.  Because the tags were specifically chosen to convey topic information, I have chosen to give them 3x more weight than the regular mailing text. 

In [7]:
def concat_cols(df, filtered_col, unfiltered_col, new_col, coef):
    weighted = ((df[unfiltered_col].str.lower()+' ')*coef) #multiply the unfiltered column by the desired coefficient
    df[new_col] = weighted.apply(lambda row: nltk.word_tokenize(row)) #tokenize (pre-process) each word 
    df[new_col] = df[new_col] + df[filtered_col]
    #df[new_col] = merged.str.lower()
    return df

merged = pd.DataFrame(filtered[['page_id','page_name','text_filtered','tags_clean']]) #copy results of last step into a new frame
merged['text_merged'] = '' # for combined result of text and tags                     

merged = concat_cols(merged,'text_filtered','tags_clean','text_merged',3)

pd.options.display.max_colwidth =60
merged[['page_id','text_filtered','tags_clean','text_merged']].head(4)

Unnamed: 0,page_id,text_filtered,tags_clean,text_merged
0,10354,"[new, zealand, s, labour, party, out, clear, requirement...",Trans Pacific Partnership TPP trade agreements trade,"[trans, pacific, partnership, tpp, trade, agreements, tr..."
1,888,"[breaking, news, swn, resources, mi, kmaq, warriors, com...",us corporation canada lawsuit fracking,"[us, corporation, canada, lawsuit, fracking, us, corpora..."
2,15964,"[google, fire, palestine, off, google, maps, tell, googl...",Human Rights and Civil Liberties,"[human, rights, and, civil, liberties, human, rights, an..."
3,961,"[daughter, taren, set, up, sumofus, years, i, m, proud, ...",donation recurring,"[donation, recurring, donation, recurring, donation, rec..."


#### That taken care of, I can focus on the stop words.   I already have a list of generic stopwords to get me started, but others I have to add by hand.  Again, I can write a helper program in NLTK to make it easier.  I will display all the words by frequency again, just like when we were doing the past of speech filtering, but this time I will also subtract my generic list of stopwords.  When I look through the list of what is left, I will undoubtedly find more stop words, so I will do it again, and again ... this is a manual process, but it the end we end up with a list of highly relevant words.

In [30]:
def exclude_stopwords(df,old_col,new_col,num):
    stop_words = set(stopwords.words('english')) #read in my text file of stopwords   
    df[new_col] = df[old_col].apply(lambda x:[word for word in x if word not in stop_words]) #remove the stop words
    ready_freq = FreqDist(df[new_col].sum()).most_common(num) #return the top words that are left by frequency
    return ready, ready_freq

go = pd.DataFrame(merged[['page_id','page_name','text_merged']]) #copy results of last step into a new frame
go['text_go'] = ''
    
go, go_freq = exclude_stopwords(ready,'text_merged','text_go',10)

go_freq #I use this list to find common words that I don't want in my analysis, then manually add them the stoplist

go_freq

[('oil', 2342),
 ('food', 1808),
 ('trade', 1502),
 ('water', 1091),
 ('climate', 1003),
 ('palm', 999),
 ('tpp', 990),
 ('industry', 946),
 ('health', 887),
 ('tax', 867)]

In [31]:
# stem the filtered tokens
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

snowball = SnowballStemmer('english')
lancaster = LancasterStemmer()
porter = PorterStemmer()

#df[new_col] = df[old_col].apply(lambda x:[word for word in x if word not in stop_words])

def stem(df,old_col,new_col):
    df[new_col] = df[old_col].apply(lambda x: [porter.stem(word) for word in x])
    df[new_col] = df[new_col].apply(lambda x: [snowball.stem(word) for word in x])
    df[new_col] = df[new_col].apply(lambda x: [lancaster.stem(word) for word in x])
    return df

stemmed = pd.DataFrame(ready[['page_id','page_name','text_ready']]) #copy results of last step into a new frame
stemmed['text_stemmed'] = ''

stemmed = stem(stemmed,'text_ready','text_stemmed')

pd.options.display.max_colwidth =90
stemmed[['page_id','text_ready','text_stemmed']].head(4)

Unnamed: 0,page_id,text_ready,text_stemmed
0,10354,"[tpp, trade, agreements, trade, tpp, trade, agreements, trade, tpp, trade, agreements,...","[tpp, trad, agr, trad, tpp, trad, agr, trad, tpp, trad, agr, trad, requir, tppa, zeala..."
1,888,"[fracking, fracking, fracking, swn, mi, kmaq, warriors, fracking, swn, mi, kmaq, elsip...","[frack, frack, frack, swn, mi, kmaq, warry, frack, swn, mi, kmaq, elsipogtog, warry, l..."
2,15964,"[liberties, liberties, liberties, palestine, maps, recognize, palestinian, determinati...","[libert, libert, libert, palestin, map, recogn, palestin, determin, palestin, map, pal..."
3,961,"[recurring, recurring, recurring, daughter, shoestring, budget, daughter, harm, inhabi...","[rec, rec, rec, daught, shoest, budget, daught, harm, inhabit, bangladesh, bab, seal, ..."


#### Now, at long last, we are finally able to get to the good stuff and build our model!  The first step is to take our painstakingly cleaned and filtered text and create a giant "bag of words", called a corpus, to feed into a Latent Dirichlet Allocation (LDA) model.  Finally we will see some actual topics!

In [34]:
from gensim import corpora, models

def generate_lda(df,col,topics,words):
    camp_corpus = df[col].tolist()
    dictionary = corpora.Dictionary(camp_corpus)
    corpus = [dictionary.doc2bow(text) for text in camp_corpus]
    lda = models.ldamodel.LdaModel(corpus, num_topics=topics, id2word = dictionary, passes=100)
    topics = lda.print_topics(num_topics=topics,num_words=words)
    return lda, corpus, dictionary, topics

lda, corpus, dictionary, all_topics = generate_lda(stemmed,'text_stemmed',8,2)

all_topics

[(0, '0.048*wat + 0.021*min'),
 (1, '0.049*food + 0.043*gmo'),
 (2, '0.046*tax + 0.027*bank'),
 (3, '0.016*med + 0.014*priv'),
 (4, '0.024*wag + 0.024*food'),
 (5, '0.046*oil + 0.034*palm'),
 (6, '0.039*clim + 0.038*oil'),
 (7, '0.089*trad + 0.058*tpp')]

In [35]:
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis)

topic_cols = {0:'extractives',1:'food & pesticides',2:'economic',3:'other',4:'worker rights',5:'palm oil & animals',6:'fossil fuels',7:'trade'}

#### Now we have our topics, the final step is to assign topics to each campaign

In [133]:
# freq = FreqDist(df[old_col][0]).most_common(5) #return the top words that are left by frequency
assigned = pd.DataFrame(stemmed[['page_id','page_name','text_stemmed']])  #copy results of last step into a new frame
assigned['freq'] = ''
assigned['topics'] = ''

def calc_freq(df,old_col,new_col,num):
    #DAN: must be a more pythonic way of doing this, right?
    for index, row in df.iterrows():
        camp_freq = df['text_stemmed'][index]
        dist = FreqDist(camp_freq)
        df.set_value(index,'freq',dist.most_common(num))
    return df

def assign_topics(df,old_col,new_col,num):
    #DAN: This one too!  I could only figure these out with loops
    for index, row in df.iterrows():
        t = df[old_col][index]
        doc_bow = dictionary.doc2bow(t)
        camp_tops = lda.get_document_topics(doc_bow,num)
        df.set_value(index,new_col, camp_tops)
    return df

assigned = calc_freq(assigned,'text_stemmed','freq',20)
assigned = assign_topics(assigned,'text_stemmed','topics',0)

pd.options.display.max_colwidth =150
assigned.head(5)

Unnamed: 0,page_id,page_name,text_stemmed,freq,topics
0,10354,-NZ-Labour-TPPA,"[tpp, trad, agr, trad, tpp, trad, agr, trad, tpp, trad, agr, trad, requir, tppa, zealand, requir, met, tppa, sum, requir, tppa, benefit, zealand, ...","[(tppa, 8), (trad, 6), (requir, 5), (tpp, 4), (agr, 3), (met, 3), (zealand, 2), (lawmak, 2), (demand, 2), (round, 2), (commit, 2), (benefit, 1), (...","[(0, 0.323723440887), (1, 0.00216012884879), (2, 0.00215818312947), (3, 0.0247933723362), (4, 0.00215920567646), (5, 0.00215943290304), (6, 0.0021..."
1,888,13-first-nations-land-defenders-sued-fracking-swn,"[frack, frack, frack, swn, mi, kmaq, warry, frack, swn, mi, kmaq, elsipogtog, warry, land, unpopul, shal, ga, frack, threaten, reg, wat, suppl, he...","[(swn, 7), (land, 6), (defend, 6), (frack, 6), (ga, 4), (peac, 4), (blockad, 3), (heal, 3), (mi, 3), (kmaq, 3), (tear, 2), (protest, 2), (bullet, ...","[(0, 0.728813208191), (1, 0.0298490164726), (2, 0.00133128342978), (3, 0.106737848262), (4, 0.0013318549415), (5, 0.0013303081418), (6, 0.12927341..."
2,15964,136-un-members-recognise-palestine-google-doesn-t-petition,"[libert, libert, libert, palestin, map, recogn, palestin, determin, palestin, map, palestin, map, crim, resid, determin, map, label, palestin, sup...","[(palestin, 19), (map, 11), (label, 4), (occup, 3), (libert, 3), (determin, 2), (sovereignt, 2), (eras, 2), (territor, 2), (explain, 1), (journ, 1...","[(0, 0.410691187236), (1, 0.00164856918834), (2, 0.00164662681948), (3, 0.0956757000012), (4, 0.0016471589682), (5, 0.00164993574785), (6, 0.00164..."
3,961,2013-achievements,"[rec, rec, rec, daught, shoest, budget, daught, harm, inhabit, bangladesh, bab, seal, namib, wat, drink, bee, die, nobl, impract, tin, behemo, sha...","[(sustain, 8), (daught, 4), (rec, 3), (build, 2), (support, 2), (cor, 2), (compass, 2), (bangladesh, 2), (sum, 2), (haul, 2), (behemo, 1), (thrive...","[(0, 0.0542747692519), (1, 0.105724343756), (2, 0.12209109236), (3, 0.0371955449358), (4, 0.43442655282), (5, 0.0982505670115), (6, 0.146547334257..."
4,2446,2015-monthly,"[formid, stag, journey, sustain, holiday, season, prett, hop, build, build, cre, sustain, bit, hop, act, gen, don, wat, research, jessic, min, imp...","[(trad, 5), (wat, 4), (hop, 4), (bee, 4), (build, 3), (pesticid, 3), (check, 3), (nat, 3), (reef, 3), (destruct, 2), (green, 2), (sydney, 2), (met...","[(0, 0.37032188467), (1, 0.144121953731), (2, 0.000626040621948), (3, 0.0205986223302), (4, 0.0799082357247), (5, 0.0799147716869), (6, 0.12940517..."


In [175]:
to_pivot = pd.DataFrame(assigned[['page_id','page_name','freq','topics']])
to_pivot['stripped'] = ''
to_pivot['stripped'] = [[tuple[1] for tuple in topic] for topic in to_pivot['topics']]
pivoted = pd.DataFrame(to_pivot['stripped'].tolist())
pivoted.head(4)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.323723,0.00216,0.002158,0.024793,0.002159,0.002159,0.002159,0.640687
1,0.728813,0.029849,0.001331,0.106738,0.001332,0.00133,0.129273,0.001333
2,0.410691,0.001649,0.001647,0.095676,0.001647,0.00165,0.001647,0.485394
3,0.054275,0.105724,0.122091,0.037196,0.434427,0.098251,0.146547,0.00149


In [182]:
identifiers = pd.DataFrame(to_pivot[['page_id','freq']])

topics = pd.DataFrame(pivoted)
topic_cols = {0:'extract',1:'pest',2:'econ',3:'other',4:'workers',5:'palm',6:'fossil',7:'trade'}
topics = topics.rename(columns=topic_cols)

merged = pd.concat([identifiers, topics], axis=1)

pd.options.display.max_colwidth =100
merged

Unnamed: 0,page_id,freq,extract,pest,econ,other,workers,palm,fossil,trade
0,10354,"[(tppa, 8), (trad, 6), (requir, 5), (tpp, 4), (agr, 3), (met, 3), (zealand, 2), (lawmak, 2), (de...",0.323723,0.002160,0.002158,0.024793,0.002159,0.002159,0.002159,0.640687
1,888,"[(swn, 7), (land, 6), (defend, 6), (frack, 6), (ga, 4), (peac, 4), (blockad, 3), (heal, 3), (mi,...",0.728813,0.029849,0.001331,0.106738,0.001332,0.001330,0.129273,0.001333
2,15964,"[(palestin, 19), (map, 11), (label, 4), (occup, 3), (libert, 3), (determin, 2), (sovereignt, 2),...",0.410691,0.001649,0.001647,0.095676,0.001647,0.001650,0.001647,0.485394
3,961,"[(sustain, 8), (daught, 4), (rec, 3), (build, 2), (support, 2), (cor, 2), (compass, 2), (banglad...",0.054275,0.105724,0.122091,0.037196,0.434427,0.098251,0.146547,0.001490
4,2446,"[(trad, 5), (wat, 4), (hop, 4), (bee, 4), (build, 3), (pesticid, 3), (check, 3), (nat, 3), (reef...",0.370322,0.144122,0.000626,0.020599,0.079908,0.079915,0.129405,0.175103
5,2447,"[(trad, 5), (wat, 4), (hop, 4), (bee, 4), (build, 3), (pesticid, 3), (check, 3), (nat, 3), (reef...",0.370323,0.144122,0.000626,0.020606,0.079907,0.079908,0.129405,0.175102
6,2513,"[(trad, 8), (ttip, 4), (tpp, 4), (invest, 3), (transatl, 3), (elect, 1), (scar, 1), (offic, 1), ...",0.004167,0.004167,0.004168,0.004167,0.004167,0.004167,0.004167,0.970830
7,850,"[(tar, 12), (tax, 9), (advert, 6), (spend, 6), (conserv, 4), (promot, 4), (extract, 4), (tarsand...",0.077630,0.051255,0.261177,0.001089,0.001088,0.043360,0.563313,0.001088
8,462,"[(tobacco, 13), (reynold, 10), (farmwork, 9), (exploit, 2), (conven, 2), (field, 2), (poison, 2)...",0.035007,0.041472,0.535646,0.001391,0.360764,0.001390,0.022940,0.001389
9,1529,"[(cut, 5), (med, 3), (priv, 3), (account, 3), (meddl, 3), (brok, 3), (bal, 2), (prom, 2), (aust,...",0.001627,0.001628,0.350233,0.438302,0.001626,0.001625,0.105464,0.099495
