In [12]:
import pandas as pd
import mongo_work  as mon

# Text Preprocessing

I will be analyzing transcripts from the _This American Life_ podcast. All the code for collecting the data is in `scraping.py` for review. This notebook starts at the text preprocessing stage of the project.

The data is stored in a local MongoDB database. See `mongo_work.py` for the logic that writes the data to my local database and the logic that pulls it back out into a pandas DataFrame, as shown in the next cell.

In [2]:
# Pull every stored episode row via the project's mongo_work helper, then show
# three randomly chosen rows as a quick sanity check of the columns.
full_data = mon.get_episodes()
full_data.sample(n=3)

Unnamed: 0,ep_num,ep_title,ep_air_date,ep_summary,speaker,words,timestamp,act
4299,683,Beer Summit,"Sept. 20, 2019","Two people, sitting down over a beer, hashing ...",Ben Calhoun,They both told me the story of the Democratic ...,01:03:44_65,Act Two: Lagerheads
8549,672,No Fair!,"Apr. 5, 2019",Stories of very small injustices and also one ...,Gabriel,You have to tell the tattle-phone the whole st...,00:08:48_48,Prologue: Prologue
6343,678,The Wannabes,"July 5, 2019","We hang out with the presidential candidates, ...",David Kestenbaum,"I'm still trying to think through just, like i...",01:01:09_75,Wannabes One


While I kept a lot of metadata for each row, I will focus on cleaning up the 'words' column, as it contains the actual text of the podcast:

In [8]:
# Keep only the spoken-text column; everything below operates on this Series.
words = full_data.loc[:, 'words']

As recommended, we first remove every token that contains a digit, replace punctuation with spaces, and convert all text to lowercase.

In [9]:
import re
import string

# Raw-string pattern: '\w' and '\d' are not valid Python string escapes, so the
# un-prefixed literal triggers an invalid-escape warning on recent interpreters.
# Both patterns are compiled once here instead of on every call.
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


def alphanumeric(x):
    """Replace every run of word characters that contains a digit with one space.

    Args:
        x: Text to clean (str).

    Returns:
        str: ``x`` with each digit-bearing token (e.g. ``2019`` or ``b2b``)
        replaced by a single space; all other characters are left untouched.
    """
    return _DIGIT_TOKEN_RE.sub(' ', x)


def punc_lower(x):
    """Lowercase the text and replace each ``string.punctuation`` character with a space.

    Args:
        x: Text to clean (str).

    Returns:
        str: The lowercased text with every punctuation character turned into
        a space (runs of spaces are not collapsed).
    """
    return _PUNCTUATION_RE.sub(' ', x.lower())

# Run both cleaners over each entry in one pass: digit-bearing tokens are
# removed first, then punctuation is blanked out and the text lowercased.
words = words.map(lambda text: punc_lower(alphanumeric(text)))
words.sample(n=5)

916     also  neither deandre joshua nor shawn gray wa...
2062    he was trying to kill me  point blank  what ot...
4205                                                     
4152                  and is that a very common question 
5921                         former governor of colorado 
Name: words, dtype: object

In [None]:
## TODO: Some transcript text is bracketed, like [SPEAKING SPANISH]. Decide how to handle it (the punctuation step strips only the brackets and keeps the words).

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the `words` Series, dropping scikit-learn's built-in
# English stop-word list.
cv1 = CountVectorizer(stop_words='english')
transformed_matrix = cv1.fit_transform(words)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method only on releases that predate it.
if hasattr(cv1, 'get_feature_names_out'):
    feature_names = cv1.get_feature_names_out()
else:
    feature_names = cv1.get_feature_names()

# Only a five-row preview is kept, so slice the sparse matrix *before*
# densifying rather than materialising the whole documents-by-vocabulary array.
word_count_array = pd.DataFrame(transformed_matrix[:5].toarray(), columns=feature_names)

In [16]:
# word_count_array holds only five rows, so this displays all of them in a
# shuffled order.
word_count_array.sample(n=5)


Unnamed: 0,aaron,ab,aback,abandoned,abboud,abbreviated,abc,abdallah,abdul,abdullah,...,zoey,zombies,zone,zones,zoo,zoom,zoomed,zooms,zuckerberg,zukerman
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# NOTE(review): MongoClient is not imported in any cell visible in this notebook —
# presumably it needs `from pymongo import MongoClient`; confirm and add the import,
# otherwise this cell raises NameError on a fresh kernel.
# No arguments are passed, so the client is created with its default connection settings.
client = MongoClient()
# Handle for the `tal_pod` database on that client.
db = client.tal_pod

In [5]:
# Show the names of the collections in `db` (the saved output lists only 'episodes').
db.list_collection_names()

['episodes']

In [12]:
# Query `episodes` with an empty filter and a projection that keeps the `ep` and
# `title` fields and excludes `_id`.
cursor = db.episodes.find({}, {'ep':1, 'title': 1, '_id':0})
# Exhaust the cursor into a list so the matching documents are displayed.
list(cursor)

[{'ep': '688', 'title': 'The Out Crowd'},
 {'ep': '667', 'title': 'Wartime Radio'},
 {'ep': '671', 'title': 'Anything Can Be Anything'},
 {'ep': '684', 'title': 'Burn It Down'},
 {'ep': '685', 'title': 'We Come From Small Places'},
 {'ep': '670', 'title': 'Beware the Jabberwock'},
 {'ep': '689', 'title': 'Digging Up the Bones'},
 {'ep': '666', 'title': 'The Theme That Shall Not Be Named'},
 {'ep': '682', 'title': 'Ten Sessions'},
 {'ep': '677', 'title': 'Seeing Yourself In the Wild'},
 {'ep': '676', 'title': 'Here’s Looking at You, Kid'},
 {'ep': '683', 'title': 'Beer Summit'},
 {'ep': '679', 'title': 'Save the Girl'},
 {'ep': '680', 'title': 'The Weight Of Words'},
 {'ep': '675', 'title': 'I’m on TV??'},
 {'ep': '674', 'title': 'Get a Spine!'},
 {'ep': '681', 'title': 'Escape From the Lab'},
 {'ep': '678', 'title': 'The Wannabes'},
 {'ep': '673', 'title': 'Left Behind'},
 {'ep': '665', 'title': 'Before Things Went to Hell'},
 {'ep': '690', 'title': 'Too Close to Home'},
 {'ep': '669',

[]

In [10]:
# WARNING: destructive — calls drop() on the `episodes` collection, which
# presumably deletes every stored episode document (confirm against the driver docs).
# NOTE(review): a top-to-bottom "Run All" will execute this; consider removing the
# cell or guarding it behind an explicit flag so the scraped data is not lost by accident.
db['episodes'].drop()