In [1]:
import pandas as pd
import mongo_work  as mon

# Text Preprocessing

I will be analyzing transcripts from the _This American Life_ podcast. All the code for getting the data is in 'scraping.py' for review. I will start from the text preprocessing part of the project.

The data is stored in a local MongoDB database. See mongo_work.py for the logic that puts data into my local database and the logic that pulls data from the database into a pandas DataFrame, as shown in the next cell.

In [2]:
# Pull the episode data out of the local database through the project's
# mongo_work helper (imported as `mon`).
full_data = mon.get_episodes()
# Show three random rows as a sanity check; the sample is unseeded, so the
# rows shown differ from run to run.
full_data.sample(3)

Unnamed: 0,ep_num,ep_title,ep_air_date,ep_summary,speaker,words,timestamp,act
525,667,Wartime Radio,"Feb. 1, 2019",Intimate and personal dispatches from two very...,Artie Lange,Stay?,00:20:02_42,Act One: Two Dope Kings
95926,386,Fine Print,"July 24, 2009",Stories where the fine print changes everythin...,Omid Memarian,A few times I cried. It was unintentional and ...,00:25:20_00,Act One: Side Effects May Include
144445,568,Human Spectacle 2015,"Oct. 2, 2015",A Japanese reality show contestant has to ente...,Charlie Brill,Forget the Dixie.,00:56:08_97,"Act Three: Take My Break, Please"


While I kept a lot of metadata for each row, I will be working on cleaning up the 'words' column, as it contains the actual text of the podcast:

In [3]:
# Isolate the transcript text column; the remaining columns are metadata.
words = full_data['words']

As recommended, we first remove every token that contains a digit, replace punctuation with spaces, and convert the text to lowercase.

In [4]:
import re
import string

def alphanumeric(x):
    """Replace every token that contains a digit with a single space.

    A "token" here is a maximal run of word characters, so both pure numbers
    ("2019") and mixed tokens ("b2b") are removed in full.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        ``x`` with each digit-bearing token replaced by one space.
    """
    # Raw string: \w and \d are regex escapes, not Python string escapes.
    # A non-raw literal triggers an "invalid escape sequence" warning.
    return re.sub(r'\w*\d\w*', ' ', x)
def punc_lower(x):
    """Lowercase ``x`` and replace each ASCII punctuation character with a space."""
    # Character class covering everything in string.punctuation, escaped so
    # that regex metacharacters are matched literally.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    lowered = x.lower()
    return re.sub(punctuation_class, ' ', lowered)

# Clean each transcript entry in one pass: strip digit-bearing tokens first,
# then lowercase and blank out punctuation.
words = words.map(lambda text: punc_lower(alphanumeric(text)))
# Spot-check five random cleaned entries.
words.sample(5)

109432    this is the time of biggest growth for a human...
113929    there was just this chatter that overtook the ...
97171                                     acknowledgements 
25175                                             oh  yeah 
112625    i remember thinking after  as i m decelerating...
Name: words, dtype: object

Now we remove all stop words.

In [5]:
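# TODO: Transcripts contain bracketed stage directions such as [SPEAKING SPANISH].
# Decide whether to strip them. The punctuation step above already removes the
# brackets themselves, so any stripping has to happen before that step.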

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words count matrix from the cleaned text, dropping
# scikit-learn's built-in English stop words.
cv1 = CountVectorizer(stop_words='english')
transformed_matrix = cv1.fit_transform(words)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older versions.
if hasattr(cv1, 'get_feature_names_out'):
    feature_names = cv1.get_feature_names_out()
else:
    feature_names = cv1.get_feature_names()

# NOTE(review): .toarray() materializes a dense (rows x vocabulary) array, which
# can be very large for the full corpus; consider keeping the sparse matrix.
word_count_array = pd.DataFrame(transformed_matrix.toarray(), columns=feature_names)

In [7]:
# Inspect the word-count frame: one row per entry of `words`, one column per
# vocabulary term.
word_count_array

Unnamed: 0,aa,aaa,aaaaaaa,aaaaagh,aaaaah,aaaaahh,aaaah,aaagh,aaah,aaas,...,zulus,zuma,zundel,zuni,zuniga,zus,zygotic,zyprexa,zyuganov,zz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180541,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
180542,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
180543,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
180544,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
from sklearn.cluster import KMeans

In [12]:
# Keep only the first 80,000 rows — presumably to keep clustering tractable; TODO confirm.
# NOTE(review): head() takes rows in stored order, not a random sample.
subset = word_count_array.head(80000)

In [None]:
# Cluster the word-count rows into 20 groups. A fixed random_state makes the
# centroid initialization, and therefore the resulting clusters, reproducible
# across kernel restarts.
km = KMeans(n_clusters=20, random_state=42)
km.fit(subset)

In [4]:
# Open a MongoDB client with no arguments (the driver's defaults are used) and
# select the 'tal_pod' database.
# NOTE(review): MongoClient is not imported in any cell visible here — presumably
# `from pymongo import MongoClient` ran in an earlier session; confirm and add it.
client = MongoClient()
db = client.tal_pod

In [5]:
# List the collections in the selected database to confirm what is stored.
db.list_collection_names()

['episodes']

In [12]:
# Query every document in 'episodes' (empty filter), projecting only the 'ep'
# and 'title' fields and excluding '_id'.
cursor = db.episodes.find({}, {'ep':1, 'title': 1, '_id':0})
# Materialize the cursor into a list so the results are displayed.
list(cursor)

[{'ep': '688', 'title': 'The Out Crowd'},
 {'ep': '667', 'title': 'Wartime Radio'},
 {'ep': '671', 'title': 'Anything Can Be Anything'},
 {'ep': '684', 'title': 'Burn It Down'},
 {'ep': '685', 'title': 'We Come From Small Places'},
 {'ep': '670', 'title': 'Beware the Jabberwock'},
 {'ep': '689', 'title': 'Digging Up the Bones'},
 {'ep': '666', 'title': 'The Theme That Shall Not Be Named'},
 {'ep': '682', 'title': 'Ten Sessions'},
 {'ep': '677', 'title': 'Seeing Yourself In the Wild'},
 {'ep': '676', 'title': 'Here’s Looking at You, Kid'},
 {'ep': '683', 'title': 'Beer Summit'},
 {'ep': '679', 'title': 'Save the Girl'},
 {'ep': '680', 'title': 'The Weight Of Words'},
 {'ep': '675', 'title': 'I’m on TV??'},
 {'ep': '674', 'title': 'Get a Spine!'},
 {'ep': '681', 'title': 'Escape From the Lab'},
 {'ep': '678', 'title': 'The Wannabes'},
 {'ep': '673', 'title': 'Left Behind'},
 {'ep': '665', 'title': 'Before Things Went to Hell'},
 {'ep': '690', 'title': 'Too Close to Home'},
 {'ep': '669',

[]

In [10]:
# WARNING: destructive — drops the 'episodes' collection from the database.
# NOTE(review): a "Run All" executes this cell too; confirm it is meant to stay
# in the notebook.
db['episodes'].drop()