In [3]:
import pandas as pd
import mongo_work  as mon

# Text Preprocessing

I will be analyzing transcripts from the _This American Life_ podcast. All the code for getting the data is in 'scraping.py' for review. This notebook starts from the text preprocessing part of the project.

The data is stored in a local MongoDB database. See mongo_work.py for the logic that puts data into my local database and the logic that pulls data from the database into a pandas DataFrame, as shown in the next cell.

In [4]:
# Load the episode data through the project's mongo_work helper.
full_data = mon.get_episodes()
# Peek at a few random rows to sanity-check the columns.
full_data.sample(n=3)

Unnamed: 0,ep_num,ep_title,ep_air_date,ep_summary,speaker,words,timestamp,act
145691,575,Poetry of Propaganda,"Dec. 18, 2015",Propaganda is complexity in the form of simpli...,Ira Glass,When Damien Cave moved to Mexico and started r...,00:00:07_51,Prologue
98028,395,Middle of the Night,"Nov. 27, 2009",Stories of people who are up while the rest of...,Angela Poricelli,"Yes. I agree with him there. That I do. Plus, ...",00:12:08_08,Act One: Orange You Glad I Didn’t Say Banana?
169238,610,Grand Gesture,"Feb. 17, 2017",People going to very extreme measures to demon...,Caitlin Mitchell,"Yeah. There was snow in my yard, so-- I mean--",00:03:23_00,Prologue


While I kept a lot of metadata for each row, I will be working on cleaning up the 'words' column, as it contains the actual text of the podcast:

In [5]:
# Keep only the transcript text column; the other columns are metadata.
words = full_data.loc[:, 'words']

As recommended, we first remove numbers (any token containing a digit) and punctuation, and convert all text to lowercase.

In [6]:
import re
import string

# Patterns are compiled once. Raw strings keep the regex escapes (\w, \d) from
# being read as invalid Python string escapes, which newer interpreters warn about.
_TOKEN_WITH_DIGIT = re.compile(r'\w*\d\w*')
_ASCII_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))


def alphanumeric(x):
    """Replace every token that contains a digit with a single space.

    A token here is a maximal run of word characters; if any character in the
    run is a digit, the whole run is replaced (so "11am" disappears entirely,
    not just the "11").

    Args:
        x: The text to clean (``str``).

    Returns:
        ``x`` with each digit-bearing token replaced by one space.
    """
    return _TOKEN_WITH_DIGIT.sub(' ', x)


def punc_lower(x):
    """Lowercase ``x`` and replace each punctuation character with a space.

    Only the ASCII characters in ``string.punctuation`` are replaced;
    typographic characters such as the curly apostrophe pass through unchanged.

    Args:
        x: The text to clean (``str``).

    Returns:
        The lowercased text with ASCII punctuation replaced by spaces.
    """
    return _ASCII_PUNCTUATION.sub(' ', x.lower())

# Run both cleaning passes over each transcript line in one traversal:
# digit-bearing tokens are blanked first, then punctuation and case are normalised.
words = words.map(lambda text: punc_lower(alphanumeric(text)))
words.sample(n=5)

125225    it was  like      or     in the morning  i tol...
109532    this girl right here just grabbed onto the bac...
130665    that is a way to estimate how many planets are...
162328    yeah  they would show people where the room   ...
84917     you have reached the answering machine of fred...
Name: words, dtype: object

In [29]:
import nltk

ModuleNotFoundError: No module named 'nltk'

In [None]:
## TODO: Some transcript text is bracketed, like [SPEAKING SPANISH]. Decide how to handle it (e.g. strip it before vectorizing).

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary (minus scikit-learn's built-in English stop words) and
# build the sparse document-term count matrix.
cv1 = CountVectorizer(stop_words='english')
transformed_matrix = cv1.fit_transform(words)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement, but fall back so older installs keep working.
_feature_names = getattr(cv1, 'get_feature_names_out', None) or cv1.get_feature_names

# Only the first five rows are kept for inspection, so slice the sparse matrix
# *before* densifying rather than materialising the full documents x vocabulary
# array just to discard almost all of it.
word_count_array = pd.DataFrame(transformed_matrix[:5].toarray(), columns=_feature_names())

In [8]:
# The frame was built from only the first five rows, so drawing 5 simply shows
# those rows in shuffled order.
word_count_array.sample(n=5)

Unnamed: 0,aa,aaa,aaaaaaa,aaaaagh,aaaaah,aaaaahh,aaaah,aaagh,aaah,aaas,...,zulus,zuma,zundel,zuni,zuniga,zus,zygotic,zyprexa,zyuganov,zz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Spot-check a handful of vocabulary terms (the column labels of the count frame).
pd.DataFrame(data=word_count_array.columns).sample(n=5)

Unnamed: 0,0
5616,borelli
15916,elaborateness
36580,paintball
48968,stealth
48665,sro


In [4]:
# NOTE(review): no visible cell imports MongoClient. Presumably it comes from
# `from pymongo import MongoClient`; confirm, otherwise a fresh kernel raises
# NameError here.
client = MongoClient()
# Dictionary-style access selects the same database as client.tal_pod.
db = client['tal_pod']

In [5]:
# Show the names of the collections in the selected database.
db.list_collection_names()

['episodes']

In [12]:
# Projection: return only the 'ep' and 'title' fields and leave out '_id'.
fields = {'ep':1, 'title': 1, '_id':0}
cursor = db['episodes'].find({}, fields)
# Exhaust the cursor so the matching documents render as the cell output.
list(cursor)

[{'ep': '688', 'title': 'The Out Crowd'},
 {'ep': '667', 'title': 'Wartime Radio'},
 {'ep': '671', 'title': 'Anything Can Be Anything'},
 {'ep': '684', 'title': 'Burn It Down'},
 {'ep': '685', 'title': 'We Come From Small Places'},
 {'ep': '670', 'title': 'Beware the Jabberwock'},
 {'ep': '689', 'title': 'Digging Up the Bones'},
 {'ep': '666', 'title': 'The Theme That Shall Not Be Named'},
 {'ep': '682', 'title': 'Ten Sessions'},
 {'ep': '677', 'title': 'Seeing Yourself In the Wild'},
 {'ep': '676', 'title': 'Here’s Looking at You, Kid'},
 {'ep': '683', 'title': 'Beer Summit'},
 {'ep': '679', 'title': 'Save the Girl'},
 {'ep': '680', 'title': 'The Weight Of Words'},
 {'ep': '675', 'title': 'I’m on TV??'},
 {'ep': '674', 'title': 'Get a Spine!'},
 {'ep': '681', 'title': 'Escape From the Lab'},
 {'ep': '678', 'title': 'The Wannabes'},
 {'ep': '673', 'title': 'Left Behind'},
 {'ep': '665', 'title': 'Before Things Went to Hell'},
 {'ep': '690', 'title': 'Too Close to Home'},
 {'ep': '669',

[]

In [10]:
# WARNING: destructive. Calls drop() on the 'episodes' collection of `db`
# (presumably a pymongo collection, in which case the whole collection is deleted).
# NOTE(review): a top-to-bottom "Run All" reaches this cell; if the earlier
# cells read from this same collection, it wipes their data. Confirm whether
# this scratch cell should stay in the notebook.
db['episodes'].drop()