In [1]:
#import modules
import pandas as pd
import numpy as np
pd.options.display.max_colwidth = 200

####  Add something about how the campaigns were picked here

In [2]:
#read in the CSV file generated by SQL (the path is relative to the notebook's working directory)
camp_txt = pd.read_csv('../capstone/text_fields.csv')
camp_txt.head(4) #preview the first 4 rows

Unnamed: 0,page_id,page_name,tag_name,html
0,400,time-warner-al-jazeera,us corporation,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 c...
1,400,time-warner-al-jazeera,discrimination,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 c...
2,401,hbo-animal-cruelty,us corporation,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 c...
3,401,hbo-animal-cruelty,animal abuse,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 c...


#### Notice that the structure above has one row for each tag. Campaigns with more than one tag are duplicated. In order to analyze the text, we want each campaign on one and only one row. This means flattening those tags.

In [3]:
#start the flattening stage from the frame loaded above
#NOTE(review): pd.DataFrame(df) does not copy the underlying data by default -- confirm whether camp_txt.copy() was intended
flat = pd.DataFrame(camp_txt)

def flatten_frame(df,col):  #df= DataFrame, col= the column to be flattened; in our case, 'tag_name'
    """Collapse rows that differ only in `col` into one row per group.

    Rows are grouped on every column except `col`; the `col` values of each
    group are joined into a single ', '-separated string, in their original
    row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one row per (record, `col` value) pair. It is not modified.
    col : str
        Name of the column to flatten. Its values must be strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per distinct combination of the other
        columns. The grouping columns keep their original left-to-right order
        and `col` is the last column.
    """
    #keep the grouping columns in their original order; building this list from a set
    #would make the column order of the result depend on string hashing
    group_cols = [name for name in df.columns if name != col]
    #join each group's values straight into one string, then turn the group keys back into columns
    flattened = df.groupby(by=group_cols)[col].apply(', '.join).reset_index()
    return flattened

flat = flatten_frame(flat,'tag_name') #collapse to one row per campaign, joining its tags into one string
flat.head(4) #preview the first 4 flattened rows

Unnamed: 0,page_id,html,page_name,tag_name
0,400,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 c...,time-warner-al-jazeera,"us corporation, discrimination"
1,401,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 c...,hbo-animal-cruelty,"us corporation, animal abuse"
2,402,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 c...,gm-strike,"us corporation, working conditions, workers"
3,403,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<td id=boxholder>\r\n<table style=border: 1px solid grey; margin-left: 10px; margin-bottom: 5px; width: 220px; cellspacing=0 c...,boeing-dreamliner-fire,"us corporation, consumer safety"


#### Now each campaign is on a single row, but we have to extract plain text from that ugly HTML/Django.  Luckily, a module called 'Beautiful Soup' will come to our rescue. While I am at it, I'll take out all the punctuation from both the html and tag_name fields; we will need that done before the next stage of the analysis.


In [4]:
from bs4 import BeautifulSoup
import re

def clean_soup(df,old_col,new_col): # df=DataFrame, old_col=the column with the dirty HTML, new_col=where the clean text goes
    """Extract plain text from an HTML column and strip template tags from it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; `new_col` is added to (or overwritten on) it in place.
    old_col : str
        Name of the column holding raw HTML markup.
    new_col : str
        Name of the column that receives the extracted text.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, for call chaining.
    """
    def _html_to_text(markup):
        #a missing value has no text; passing NaN to BeautifulSoup would raise
        if pd.isna(markup):
            return ''
        #parse the markup and join all text nodes with single spaces
        return BeautifulSoup(markup, "lxml").get_text(" ", strip=True)

    #Series.apply replaces the row-by-row iteritems()/set_value() loop, both of which
    #have been removed from current pandas
    df[new_col] = df[old_col].apply(_html_to_text)
    #remove the django tags such as {{ name }} or {% if x %}.  Each brace-delimited span is
    #matched on its own so the text *between* two tags on the same line is kept (a greedy
    #'{(.+)}' swallows it).  regex=True is explicit because newer pandas treats the
    #pattern as a literal string by default.
    df[new_col] = df[new_col].str.replace(r'\{+[^{}\n]+\}+', ' ', regex=True)
    return df

def remove_punc(df,old_col,new_col):
    """Replace punctuation and underscores in a text column with spaces.

    Every character that is neither a word character nor whitespace, and every
    underscore, is replaced by a single space (so runs of punctuation become
    runs of spaces; nothing is collapsed).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; `new_col` is added to (or overwritten on) it in place.
    old_col : str
        Name of the source text column.
    new_col : str
        Name of the destination column (may be the same as `old_col`).

    Returns
    -------
    pandas.DataFrame
        The same `df` object, for call chaining.
    """
    #one pass covers both the punctuation and the underscore (which counts as a word character).
    #The raw string avoids invalid-escape warnings, and regex=True is explicit because newer
    #pandas treats the pattern as a literal string by default, which would make this a no-op.
    df[new_col] = df[old_col].str.replace(r'[^\w\s]|_', ' ', regex=True)
    return df

clean = flat.copy() #copy the results from the last step into a new frame
                    #(.copy() gives an independent frame; pd.DataFrame(flat) would not copy the data,
                    # so the in-place cleaning below could leak back into flat)

clean['text_clean'] ='' #a new column for storing our squeaky-clean text
clean['tags_clean'] = '' #a new column for storing our squeaky-clean tags
 
clean = clean_soup(clean,'html','text_clean') #HTML -> plain text, template tags removed
clean = remove_punc(clean,'text_clean','text_clean') #strip punctuation from the extracted text, in place
clean = remove_punc(clean,'tag_name','tags_clean') #strip punctuation (the ', ' separators) from the tags

pd.options.display.max_colwidth = 75 #narrow the columns so the before/after comparison fits
clean[['page_id','html','text_clean','tag_name','tags_clean']].head(4)

Unnamed: 0,page_id,html,text_clean,tag_name,tags_clean
0,400,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<...,In a blatantly prejudiced move Time Warner Cable dropped CurrentTV the ...,"us corporation, discrimination",us corporation discrimination
1,401,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<...,A new lawsuit claims that animal abuse by HBO led to the death of four ...,"us corporation, animal abuse",us corporation animal abuse
2,402,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<...,This man has stitched his lips together and declared a hunger strike de...,"us corporation, working conditions, workers",us corporation working conditions workers
3,403,<table cellspacing=0 cellpadding=0 align=right>\r\n<tbody>\r\n<tr>\r\n<...,Boeing s new 787 Dreamliners keep catching on fire Something is clearl...,"us corporation, consumer safety",us corporation consumer safety


#### At this point, we want to start whittling down this text to the words that will be most helpful for our analysis.  The first step is to remove all words that won't have value as predictors of the topic of the campaign.  Some of these are easy judgements- we know that we don't need extremely common words like conjunctions or pronouns.  But what about other types of words?  Common Nouns? Proper Nouns? Verbs? Adjectives?  Here we get to some value judgements that the computer can't make by itself, but I can write some helper functions that will give me the insight I need to make the decision.  This is where the Natural Language Toolkit (NLTK) comes in.

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import *
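#DAN: I would rather not import all these books, but I do need something from the book module
#Can't find a doc that explains all I am importing so I can be more selective!
#NOTE(review): FreqDist appears to be the name needed from nltk.book; it is also exposed as nltk.FreqDist,
#so this wildcard import (and the corpus loading it triggers) may be avoidable -- confirm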
from nltk.book import *

def filter_by_pos(df,old_col,new_col,pos_codes,num):
    """Keep only the words whose part-of-speech tag is in `pos_codes`.

    The text in `old_col` is lower-cased (in place), tokenized, and the tokens
    of all rows are tagged together as one sequence. A word is kept in a row
    if it was tagged with one of `pos_codes` anywhere in the corpus.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update in place: `old_col` is lower-cased and `new_col` is
        added or overwritten.
    old_col : str
        Name of the column holding the cleaned text.
    new_col : str
        Name of the column that receives each row's list of kept tokens.
    pos_codes : iterable of str
        Part-of-speech tags to keep, as produced by nltk.pos_tag.
    num : int
        How many of the most frequent kept words to report.

    Returns
    -------
    (pandas.DataFrame, list of (str, int))
        The same `df` object, and the `num` most common kept words with
        their counts.
    """
    df[old_col] = df[old_col].str.lower()
    tokenized = df[old_col].apply(nltk.word_tokenize) #tokenize (pre-process) each word
    #flatten the per-row token lists in one linear pass; summing a Series of lists
    #re-copies the growing list for every row and returns 0 for an empty frame
    all_tokens = [token for tokens in tokenized for token in tokens]
    pos_tagged = nltk.pos_tag(all_tokens)  #tag each word with a code indicating part of speech
    wanted_codes = set(pos_codes)
    pos_filtered = [word for word,pos in pos_tagged if pos in wanted_codes]  #select all words matching my filter codes
    #nltk.FreqDist is referenced explicitly so this does not rely on a wildcard import
    pos_freq = nltk.FreqDist(pos_filtered).most_common(num) #get frequency of filtered words; return the top num
    #a set makes the per-word membership test constant-time instead of a scan of the whole list
    keep_words = set(pos_filtered)
    #go through each campaign and remove all words not in my filter
    df[new_col] = tokenized.apply(lambda tokens: [token for token in tokens if token in keep_words])
    return df, pos_freq

filtered = pd.DataFrame(clean[['page_id','page_name','text_clean','tags_clean']]) #copy results of last step into a new frame
                                                                                #(the old dirty columns are dropped here)
filtered['text_filtered'] = ''  #new column to store the text filtered by part of speech

    
pos_codes = ['JJ','NN','NNS']  #These codes indicate adjectives, common nouns, and plural common nouns.
                                #I selected these codes after trial and error, trying different filters to see what gave me
                                #the most important words with the least amount of semantic noise

#note: filter_by_pos also lower-cases the 'text_clean' column of the frame it is given
filtered, filtered_freq = filter_by_pos(filtered,'text_clean','text_filtered',pos_codes,10)

filtered_freq  #this sorted frequency distribution helped me choose the pos codes I wanted to keep
               #I'm only showing 10 here, but I reviewed many more while making the decision

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


#### Here is what those words look like within each campaign:

In [None]:
pd.options.display.max_colwidth = 200 #widen the columns again so the text can be compared with its filtered tokens
filtered[['page_id','text_clean','text_filtered']].head(4)

#### That is the strongest list of words I could generate with a part of speech filter, but it still isn't great.  It has some random noise (like 's' and 't'), generic words that are common in any text (like 'other' and 'last'), and words that are very common in our specific campaigns (like 'sumofus', 'petition' and 'profits').  But we want to be sure to keep all the words that could be predictive for us ('oil', 'water', 'palm', 'climate', 'food').  We can do that by applying a list of specific words, called stopwords, that we want to exclude from our analysis.

#### However, I first need to grab the words from the 'tag_name' column and add them to the mailing text.  I couldn't do that until after I ran the part of speech filter, because NLTK needs to have sentences in context in order to  properly mark the part of speech; adding short phrases would have created problems.  But from here on in, I will be dealing with collections of words, where the order doesn't matter.  Because the tags were specifically chosen to convey topic information, I have chosen to give them 3x more weight than the regular mailing text. 

In [None]:
def concat_cols(df, filtered_col, unfiltered_col, new_col, coef):
    """Prepend weighted tag tokens to each row's filtered token list.

    The lower-cased text of `unfiltered_col` (plus a trailing space) is
    repeated `coef` times and tokenized; the resulting tokens are stored in
    `new_col`, followed by the tokens already in `filtered_col`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; `new_col` is added to (or overwritten on) it in place.
    filtered_col : str
        Name of the column holding lists of tokens.
    unfiltered_col : str
        Name of the column holding plain strings to be weighted.
    new_col : str
        Name of the column that receives the combined token lists.
    coef : int
        Number of times the `unfiltered_col` text is repeated.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, for call chaining.
    """
    lowered = df[unfiltered_col].str.lower()
    repeated = (lowered + ' ') * coef  #string repetition: the text appears coef times
    df[new_col] = repeated.apply(nltk.word_tokenize)  #tokenize (pre-process) each word
    combined = df[new_col] + df[filtered_col]  #element-wise list concatenation, weighted tokens first
    df[new_col] = combined
    return df

merged = pd.DataFrame(filtered[['page_id','page_name','text_filtered','tags_clean']]) #copy results of last step into a new frame
merged['text_merged'] = '' # for combined result of text and tags                     

#coef=3: the tag text is repeated 3 times before tokenizing, giving the tags 3x the weight of the mailing text
merged = concat_cols(merged,'text_filtered','tags_clean','text_merged',3)
merged.head(4)

#### That taken care of, I can focus on the stop words.   I already have a list of generic stopwords to get me started, but others I have to add by hand.  Again, I can write a helper program in NLTK to make it easier.  I will display all the words by frequency again, just like when we were doing the part of speech filtering, but this time I will also subtract my generic list of stopwords.  When I look through the list of what is left, I will undoubtedly find more stop words, so I will do it again, and again ... this is a manual process, but in the end we end up with a list of highly relevant words.

In [None]:
def exclude_stopwords(df,old_col,new_col,num):
    """Drop English stopwords from each row's token list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; `new_col` is added to (or overwritten on) it in place.
    old_col : str
        Name of the column holding lists of tokens.
    new_col : str
        Name of the column that receives the token lists without stopwords.
    num : int
        How many of the most frequent remaining words to report.

    Returns
    -------
    (pandas.DataFrame, list of (str, int))
        The same `df` object that was passed in, and the `num` most common
        remaining words with their counts.
    """
    stop_words = set(stopwords.words('english')) #NLTK's English stopword list, as a set for fast lookups
    df[new_col] = df[old_col].apply(lambda tokens: [token for token in tokens if token not in stop_words]) #remove the stop words
    #stream the remaining tokens into the counter; summing a Series of lists
    #re-copies the growing list for every row and returns 0 for an empty frame
    remaining = (token for tokens in df[new_col] for token in tokens)
    ready_freq = nltk.FreqDist(remaining).most_common(num) #return the top words that are left by frequency
    #return the frame that was passed in -- not a module-level variable that happens to share the caller's name
    return df, ready_freq

ready = pd.DataFrame(merged[['page_id','page_name','text_merged']]) #copy results of last step into a new frame
ready['text_ready'] = '' #new column to store the tokens once the stopwords are removed
    
ready, ready_freq = exclude_stopwords(ready,'text_merged','text_ready',10)

ready_freq #I use this list to find common words that I don't want in my analysis, then manually add them to the stoplist

ready_freq

#### Now, at long last, we are finally able to get to the good stuff and build our model!  The first step is to take our painstakingly cleaned and filtered text and create a giant "bag of words", called a corpus, to feed into a Latent Dirichlet Allocation (LDA) model.  Finally we will see some actual topics!

In [None]:
from gensim import corpora, models

def generate_lda(df,col,topics,words,passes=75,random_state=None):
    """Fit a gensim LDA topic model on a column of token lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents.
    col : str
        Name of the column whose values are lists of tokens (one list per document).
    topics : int
        Number of topics to fit.
    words : int
        Number of top words to include in each topic summary.
    passes : int, optional
        Number of training passes over the corpus (default 75, the value
        previously hard-coded).
    random_state : int or None, optional
        Seed forwarded to LdaModel. LDA training is stochastic, so pass an
        integer to get the same topics on every run; None (the default)
        keeps the previous unseeded behaviour.

    Returns
    -------
    (LdaModel, list, Dictionary, list)
        The fitted model, the bag-of-words corpus, the token dictionary and
        the topic summaries returned by print_topics.
    """
    camp_corpus = df[col].tolist()
    dictionary = corpora.Dictionary(camp_corpus) #map every distinct token to an integer id
    corpus = [dictionary.doc2bow(text) for text in camp_corpus] #each document as (token id, count) pairs
    lda = models.ldamodel.LdaModel(corpus, num_topics=topics, id2word=dictionary,
                                   passes=passes, random_state=random_state)
    #stored under its own name so the `topics` argument is not rebound to a different kind of value
    topic_summaries = lda.print_topics(num_topics=topics,num_words=words)
    return lda, corpus, dictionary, topic_summaries

#fit an LDA model with 15 topics and summarise each topic by its top 3 words
lda, corpus, dictionary, all_topics = generate_lda(ready,'text_ready',15,3)

all_topics

In [None]:
#NOTE(review): newer pyLDAvis releases appear to have renamed this module to pyLDAvis.gensim_models -- confirm against the installed version
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary) #build the visualisation data from the fitted model, corpus and dictionary
pyLDAvis.display(vis) #render the visualisation in the notebook