## Setup

In [1]:
import pandas as pd
import mongo_work  as mon
from spacy.lang.en import English

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Can run into memory issues if you make this too long. This is for the spaCy code.
nlp.max_length = 3388383

In [3]:
# Run the below command to run spaCy related code
# ```
# python -m spacy download en_core_web_sm
# ```
# The other dependencies for spaCy should already be installed for you if you run the GitHub bash script.

## Project text

I will be analyzing transcripts from the _This American Life podcasts_. All the code for getting the data is in 'scraping.py' for review. I will start from the text preprossing part of the project. 

## Getting the data from Mongo

The data is stored in a local mongo database. You can review the mongo_work.py for the logic to put data into my local database and the logic to pull data from the database into a pandas dataframe as shown in the next cell.

Due to memory issues, I have split up the data by year and saved the data in pandas. Below is the code used to review and save the data as new csvs

In [57]:
# Get full data (skip straight to document cleaning steps)
full_data = mon.get_episodes()

# Data type cleaning required for parsing for years and sorting
full_data['ep_air_date'] =  pd.to_datetime(full_data['ep_air_date'])
full_data['ep_num'] =  pd.to_numeric(full_data['ep_num'])
full_data['timestamp'] =  full_data['timestamp'].str.replace('_', '.')
full_data['timestamp'] =  pd.to_timedelta(full_data['timestamp'])
full_data = full_data.sort_values(by=['ep_num', 'timestamp'])


no_credits = full_data[full_data['act']!='Credits']

no_credits_no_sounds = no_credits[no_credits['words'].str.match("(?!.*\]\.*$).*")]

no_credits_sounds_blanks = no_credits_no_sounds[no_credits_no_sounds['words']!='']

test = no_credits_sounds_blanks.groupby(by='ep_num').sum()['words']

Columns I'll used later for further analysis is here:

In [19]:
episodes = full_data[['ep_num']].groupby(by='ep_num').sum()
eps = episodes.index.values

In [20]:
ep_meta = full_data[['ep_num', 
                      'ep_air_date', 
                      'ep_summary']].groupby(by=['ep_num', 
                                                 'ep_air_date',
                                                 'ep_summary']).sum().reset_index()

## Data cleaning

### Reduce the words down to their stems

This has to be done ahead of removing numbers because the lemm_words process adds digits back in.

In [28]:
lemm_words = []
counter = 1

for act in words:
    print(f'Lemming {counter} of {len(words)} items.')
    l = []
    tok_text = nlp(act)
    for token in tok_text:
        l.append(token.lemma_)
    lemm_words.append(' '.join(l))
    counter +=1

Lemming 1 of 688 items.
Lemming 2 of 688 items.
Lemming 3 of 688 items.
Lemming 4 of 688 items.
Lemming 5 of 688 items.
Lemming 6 of 688 items.
Lemming 7 of 688 items.
Lemming 8 of 688 items.
Lemming 9 of 688 items.
Lemming 10 of 688 items.
Lemming 11 of 688 items.
Lemming 12 of 688 items.
Lemming 13 of 688 items.
Lemming 14 of 688 items.
Lemming 15 of 688 items.
Lemming 16 of 688 items.
Lemming 17 of 688 items.
Lemming 18 of 688 items.
Lemming 19 of 688 items.
Lemming 20 of 688 items.
Lemming 21 of 688 items.
Lemming 22 of 688 items.
Lemming 23 of 688 items.
Lemming 24 of 688 items.
Lemming 25 of 688 items.
Lemming 26 of 688 items.
Lemming 27 of 688 items.
Lemming 28 of 688 items.
Lemming 29 of 688 items.
Lemming 30 of 688 items.
Lemming 31 of 688 items.
Lemming 32 of 688 items.
Lemming 33 of 688 items.
Lemming 34 of 688 items.
Lemming 35 of 688 items.
Lemming 36 of 688 items.
Lemming 37 of 688 items.
Lemming 38 of 688 items.
Lemming 39 of 688 items.
Lemming 40 of 688 items.
Lemming 4

Lemming 321 of 688 items.
Lemming 322 of 688 items.
Lemming 323 of 688 items.
Lemming 324 of 688 items.
Lemming 325 of 688 items.
Lemming 326 of 688 items.
Lemming 327 of 688 items.
Lemming 328 of 688 items.
Lemming 329 of 688 items.
Lemming 330 of 688 items.
Lemming 331 of 688 items.
Lemming 332 of 688 items.
Lemming 333 of 688 items.
Lemming 334 of 688 items.
Lemming 335 of 688 items.
Lemming 336 of 688 items.
Lemming 337 of 688 items.
Lemming 338 of 688 items.
Lemming 339 of 688 items.
Lemming 340 of 688 items.
Lemming 341 of 688 items.
Lemming 342 of 688 items.
Lemming 343 of 688 items.
Lemming 344 of 688 items.
Lemming 345 of 688 items.
Lemming 346 of 688 items.
Lemming 347 of 688 items.
Lemming 348 of 688 items.
Lemming 349 of 688 items.
Lemming 350 of 688 items.
Lemming 351 of 688 items.
Lemming 352 of 688 items.
Lemming 353 of 688 items.
Lemming 354 of 688 items.
Lemming 355 of 688 items.
Lemming 356 of 688 items.
Lemming 357 of 688 items.
Lemming 358 of 688 items.
Lemming 359 

Lemming 637 of 688 items.
Lemming 638 of 688 items.
Lemming 639 of 688 items.
Lemming 640 of 688 items.
Lemming 641 of 688 items.
Lemming 642 of 688 items.
Lemming 643 of 688 items.
Lemming 644 of 688 items.
Lemming 645 of 688 items.
Lemming 646 of 688 items.
Lemming 647 of 688 items.
Lemming 648 of 688 items.
Lemming 649 of 688 items.
Lemming 650 of 688 items.
Lemming 651 of 688 items.
Lemming 652 of 688 items.
Lemming 653 of 688 items.
Lemming 654 of 688 items.
Lemming 655 of 688 items.
Lemming 656 of 688 items.
Lemming 657 of 688 items.
Lemming 658 of 688 items.
Lemming 659 of 688 items.
Lemming 660 of 688 items.
Lemming 661 of 688 items.
Lemming 662 of 688 items.
Lemming 663 of 688 items.
Lemming 664 of 688 items.
Lemming 665 of 688 items.
Lemming 666 of 688 items.
Lemming 667 of 688 items.
Lemming 668 of 688 items.
Lemming 669 of 688 items.
Lemming 670 of 688 items.
Lemming 671 of 688 items.
Lemming 672 of 688 items.
Lemming 673 of 688 items.
Lemming 674 of 688 items.
Lemming 675 

In [29]:
words = pd.Series(lemm_words)
words.sample(3)

582    from wbez chicago , -pron- be this american li...
330    so in 1853 , during the california gold rush ,...
462    oscar be 31 , work two job with three little k...
dtype: object

### Remove all numbers/punctuation and lower all capitals.

In [30]:
import re
import string

alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

words = words.map(alphanumeric).map(punc_lower)
words.sample(5)

663    when dave be turn  pron  life around and try d...
279    ok   here be a joke that  pron  serviceman in ...
569    julia  s     jane and ella be     and when  pr...
433    here be what  pron  be like to hold political ...
387    from wbez chicago    pron  be this american li...
dtype: object

In [45]:
# import pickle

# with open('words.pickle', 'wb') as to_write:
#     pickle.dump(dtm, to_write)

### Remove stop words

Now we remove all stop words and convert to document term matrix

In [31]:
from sklearn.feature_extraction import text 

# add to the typical list of English stop words
# 'PRON' is added by lemma logic for pronouns
stop_words = text.ENGLISH_STOP_WORDS.union([
                                            'just', 'like', 'say', 'pron', 'oh', 'okay', 
                                            'yeah', 'really', 'actually', 'life', 'uh', 
                                            'ira', 'theme', 'ask', 'ok', 'glass', 'paul',
                                            'alex', 'mr', 'chicken', 'david', 'soon',
                                            'john', 'sarah', 'mike', 'mark', 'michael',
                                            'totally'])

### Count vectorizer (and removing stop words)

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

cv1 = CountVectorizer(stop_words=stop_words, max_df=.8, min_df=.2, ngram_range=(1,2))
dtm = cv1.fit_transform(words)

In [34]:
vectorizer = cv1

In [190]:
pd.DataFrame(dtm.toarray(), 
             columns=vectorizer.get_feature_names()).head(10)

Unnamed: 0,abandon,ability,absolutely,accept,accident,accord,account,accuse,acknowledge,act act,...,ye,year ago,year later,year year,yell,yellow,yesterday,york,york city,young man
0,0,0,1,0,0,1,0,0,0,0,...,0,3,1,0,0,0,0,2,0,0
1,0,0,2,5,0,0,7,1,0,0,...,0,0,0,0,1,0,1,4,1,0
2,0,0,0,0,0,1,1,0,0,0,...,9,0,1,0,2,0,1,2,2,0
3,1,0,1,1,0,0,0,0,0,0,...,0,1,1,0,1,0,1,1,0,2
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,2,0,5
5,0,0,0,0,0,1,0,0,0,0,...,1,1,1,0,1,0,0,0,0,0
6,1,0,1,0,0,1,0,0,0,0,...,2,0,0,0,3,2,0,5,1,0
7,0,1,1,1,0,2,0,0,0,0,...,3,1,1,5,0,0,2,2,0,1
8,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,6,2,1,0,0,0
9,0,0,2,3,1,0,1,0,1,1,...,2,0,0,0,1,0,0,1,0,0


## Latent Dirichlet Allocation  (LDA)

In [11]:
num_of_topics = 15

In [12]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [35]:
from sklearn.decomposition import LatentDirichletAllocation

# doc_topic_prior is alpha in our notes (per doc topic distribution):
# high: each doc has lots of topics
# low: each doc has few topics

# topic_word_prior is beta in our notes (per topic word distribution):
# high: each topic has lots of words
# low: each topic has few words

lda = LatentDirichletAllocation(num_of_topics, max_iter=50, 
                                topic_word_prior=.8, doc_topic_prior=.8)
doc_topic = lda.fit_transform(dtm)

In [38]:
# import pickle

# with open('lda.pickle', 'wb') as to_write:
#     pickle.dump(dtm, to_write)

# with open('lda.pickle','rb') as read_file:
#     words = pickle.load(read_file)

In [37]:
display_topics(lda, 
               vectorizer.get_feature_names(), 
               20)


Topic  0
church, christian, group, pray, jesus, faith, lord, wall, religious, cross, hell, member, spirit, town, wife, brother, enemy, war, father, experience

Topic  1
company, buy, bank, sell, business, worker, government, million, market, dollar, plant, store, price, plan, customer, create, manager, office, credit, sale

Topic  2
police, war, kill, gun, officer, shoot, cop, soldier, military, force, fight, report, murder, army, arrest, death, crime, attack, weapon, bomb

Topic  3
girl, mother, bed, light, sleep, wear, boy, hair, voice, floor, baby, window, throw, body, table, arm, white, apartment, drink, heart

Topic  4
letter, sex, gay, marry, marriage, relationship, don, mail, advice, wife, straight, wedding, sexual, kiss, interview, computer, boyfriend, date, girl, feeling

Topic  5
black, white, student, teacher, class, high school, college, parent, girl, neighborhood, teach, south, grade, group, education, race, boy, board, community, middle

Topic  6
president, republican, v

In [39]:
lda_results = pd.DataFrame(eps,
             columns=['episode']).join(pd.DataFrame(doc_topic))

df = ep_meta.merge(lda_results, how='inner', left_on='ep_num', right_on='episode')

In [42]:
df.columns = ['ep_num', 'air_date', 'summary', 'ep', 
              'religion', 'business', 'military', 'home', 'relationships', 'race & education',
              'politics', 'media', 'other', 'sports', 'law & order', 'medical', 'family', 
              'holidays', 'travel']

In [44]:
#df.to_csv('dtm_data.csv')