In [1]:
import pandas as pd
import mongo_work  as mon

In [2]:
# For long running queries so I can be notified when it finishes

from IPython.display import Audio
sound_file = 'me-too.mp3'

# Text Preprocessing

I will be analyzing transcripts from the _This American Life podcasts_. All the code for getting the data is in 'scraping.py' for review. I will start from the text preprossing part of the project. 

## Getting the data from Mongo

The data is stored in a local mongo database. You can review the mongo_work.py for the logic to put data into my local database and the logic to pull data from the database into a pandas dataframe as shown in the next cell.

In [3]:
full_data = mon.get_episodes()
full_data.sample(3)

Unnamed: 0,ep_num,ep_title,ep_air_date,ep_summary,speaker,words,timestamp,act
120337,464,Invisible Made Visible,"May 18, 2012","David Sedaris, Tig Notaro, Ryan Knighton, and ...",Ira Glass,This American Life is...,00:56:12_15,Act Four: Turn Around Bright Eyes
25891,53,Valentine’s Day ‘97,"Feb. 7, 1997",Stories about our parents falling in love.,Brett Leveridge,"Soon, Dad felt a tap, tap, tap upon his should...",00:31:02_03,Act Three: It's Not the Heat
45522,172,24 Hours at the Golden Apple,"Nov. 17, 2000",One day in a Chicago diner.,Danielle,[LAUGHS] You can't go anywhere? Get in the car.,00:40:42_20,Act 2


In [4]:
#words = full_data['words']

# For now, group words by podcast episode
words = full_data.groupby(by='ep_num').sum()['words']
words.shape

(688,)

## Data cleaning

As recommended, we first remove all numbers/punctuation and lower all capitals.

In [5]:
import re
import string

alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

words = words.map(alphanumeric).map(punc_lower)
words.sample(5)

ep_num
380    a couple years ago i interviewed this cop abou...
307    ok  this happens to be chicago  but every city...
211    when you name names  when you snitch  when you...
319    well  i hold in my hand a ghost story that com...
25     the story goes like this  recently i heard abo...
Name: words, dtype: object

**Pickling for later**

In [6]:
# import pickle

# with open('post_data_clean_podcast_words.pickle', 'wb') as to_write:
#     pickle.dump(words, to_write)

In [7]:
import pickle

with open('post_data_clean_podcast_words.pickle','rb') as read_file:
    words = pickle.load(read_file)

In [5]:
## TODO: There are words bracketed like [SPEAKING SPANISH]. What do?

## Document term matrix format

Now we remove all stop words and convert to document term matrix

### Count vectorizer (and removing stop words)

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 

In [41]:
# add to the typical list of English stop words
stop_words = text.ENGLISH_STOP_WORDS.union(['just'])

cv1 = CountVectorizer(stop_words=stop_words, max_df=.9, min_df=.3)
transformed_matrix = cv1.fit_transform(words)


### Topic modeling: LDA

In [17]:
from gensim import corpora, models, similarities, matutils

In [42]:
corpus = matutils.Sparse2Corpus(transformed_matrix)
id2word = dict((v, k) for k, v in cv1.vocabulary_.items())

lda = models.LdaModel(corpus=corpus, num_topics=2, id2word=id2word, passes=60)
lda.print_topics()
Audio(sound_file, autoplay=True)

In [44]:
lda.print_topics()

[(0,
  '0.003*"clear" + 0.003*"light" + 0.003*"hell" + 0.003*"close" + 0.002*"led" + 0.002*"movie" + 0.002*"eye" + 0.002*"outside" + 0.002*"including" + 0.002*"post"'),
 (1,
  '0.004*"perfectly" + 0.004*"laughs" + 0.004*"lost" + 0.004*"huh" + 0.004*"opportunity" + 0.004*"closed" + 0.004*"needs" + 0.004*"making" + 0.004*"fear" + 0.004*"game"')]

## Clustering

In [8]:
from sklearn.cluster import KMeans
word_count_array = pd.DataFrame(transformed_matrix.toarray(), columns=cv1.get_feature_names())
subset = word_count_array.head(80000)
km = KMeans(n_clusters=20)
km.fit(subset)