In [87]:
# Let's read in our document-term matrix
import pandas as pd
import pickle
# Import the necessary modules for LDA with gensim
# Terminal / Anaconda Navigator: conda install -c conda-forge gensim
from gensim import matutils, models, corpora
import scipy.sparse
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize, pos_tag

# Load the pickled document-term matrix, raw corpus and stop-word list in one go.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
dtm, raw_data, my_stop_words = map(
    pd.read_pickle,
    (
        'data_files/comedian_dtm',
        'data_files/comedian_corpus',
        'data_files/stop_words',
    ),
)

In [69]:
# Display the document-term matrix: one row per term, one column per transcript
dtm

Unnamed: 0,Bo Burnham - Words Words Words,Bo Burnham - What,Bo Burnham - Make Happy,John Mulaney - New in Town,John Mulaney - Comeback Kid,John Mulaney - Kid Gorgeous,Ricky Gervais - Politics,Ricky Gervais - Science,Ricky Gervais - Humanity,Kevin Bridges - Story So Far,Kevin Bridges - The Story Continues,Kevin Bridges - Whole Different Story,Aziz Ansari - Intimate Moments Sensual Evening,Aziz Ansari - Madison Square Garden,Aziz Ansari - Right Now
aaaaahhhhhhh,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
aaaaauuugghhhhhh,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
aaaahhhhh,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
aaaand,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
aaand,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zoologist,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
zs,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
zuckerberg,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0
zuki,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [83]:
# Look up the per-transcript counts for a single elongated token
dtm.loc['yeahhhhhhhhhhhhh']

Bo Burnham - Words Words Words                    0
Bo Burnham - What                                 0
Bo Burnham - Make Happy                           0
John Mulaney - New in Town                        1
John Mulaney - Comeback Kid                       0
John Mulaney - Kid Gorgeous                       0
Ricky Gervais - Politics                          0
Ricky Gervais - Science                           0
Ricky Gervais - Humanity                          0
Kevin Bridges - Story So Far                      0
Kevin Bridges - The Story Continues               0
Kevin Bridges - Whole Different Story             0
Aziz Ansari - Intimate Moments Sensual Evening    0
Aziz Ansari - Madison Square Garden               0
Aziz Ansari - Right Now                           0
Name: yeahhhhhhhhhhhhh, dtype: int64

# Attempt 1

In [70]:
# Convert the term-document DataFrame into something gensim can stream:
# DataFrame --> scipy CSR sparse matrix --> gensim corpus.
# Terms are on the rows, so each *column* is a document (gensim's default orientation).
sparse_counts = scipy.sparse.csr_matrix(dtm)
corpus = matutils.Sparse2Corpus(sparse_counts, documents_columns=True)

In [103]:
# Fit a vectorizer on the raw transcripts (kept so `cv` / `data_cv` remain available).
cv = CountVectorizer(stop_words = my_stop_words)
data_cv = cv.fit(raw_data.sketch_text)

# Build the id --> term mapping from the rows of `dtm` itself rather than from the
# freshly fitted vectorizer. The gensim corpus is derived from `dtm`, so term id i
# is row i of `dtm`; a separately fitted vocabulary is not guaranteed to use the same
# indices, and any mismatch makes the topics print the wrong words.
id2word = dict(enumerate(dtm.index))

In [79]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes.
# LDA training is stochastic, so random_state is pinned to keep the topics reproducible on re-run.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=2,
    passes=20,
    random_state=42,
)
lda.print_topics()

[(0,
  '0.006*"ugh" + 0.006*"oj" + 0.005*"wanking" + 0.005*"lounge" + 0.004*"whatll" + 0.004*"study" + 0.004*"funny" + 0.004*"mammals" + 0.004*"wave" + 0.004*"hey"'),
 (1,
  '0.007*"going" + 0.007*"fucking" + 0.006*"guy" + 0.006*"welfare" + 0.005*"theyd" + 0.004*"mammals" + 0.004*"okayyyyy" + 0.004*"come" + 0.004*"bit" + 0.004*"good"')]

In [80]:
# Same model with 4 topics; random_state is pinned so the topics are reproducible on re-run.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    passes=20,
    random_state=42,
)
lda.print_topics()

[(0,
  '0.005*"oj" + 0.005*"theyd" + 0.005*"going" + 0.005*"fucking" + 0.005*"wanking" + 0.004*"god" + 0.004*"good" + 0.004*"lounge" + 0.004*"welfare" + 0.004*"okayyyyy"'),
 (1,
  '0.006*"going" + 0.005*"guy" + 0.005*"horse" + 0.005*"yeahhhhhhhhhhhhh" + 0.005*"widow" + 0.004*"okayyyyy" + 0.004*"oj" + 0.004*"yeahhh" + 0.004*"mon" + 0.003*"theyd"'),
 (2,
  '0.010*"welfare" + 0.005*"going" + 0.005*"wanking" + 0.004*"usain" + 0.004*"come" + 0.004*"thoroughly" + 0.003*"okayyyyy" + 0.003*"fucking" + 0.003*"good" + 0.003*"il"'),
 (3,
  '0.010*"guy" + 0.009*"fucking" + 0.007*"going" + 0.007*"mammals" + 0.005*"fuck" + 0.005*"ugh" + 0.005*"guys" + 0.005*"bit" + 0.005*"shirt" + 0.004*"shirts"')]

In [81]:
# Same model with 6 topics; random_state is pinned so the topics are reproducible on re-run.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=6,
    passes=20,
    random_state=42,
)
lda.print_topics()

[(0,
  '0.005*"going" + 0.005*"okayyyyy" + 0.005*"wanking" + 0.005*"lounge" + 0.005*"yeahhhhhhhhhhhhh" + 0.004*"nevermind" + 0.004*"study" + 0.004*"thighsas" + 0.004*"mammals" + 0.004*"hey"'),
 (1,
  '0.011*"fucking" + 0.010*"guy" + 0.009*"going" + 0.006*"bit" + 0.006*"match" + 0.006*"welfare" + 0.005*"good" + 0.004*"shirts" + 0.004*"come" + 0.004*"fuck"'),
 (2,
  '0.007*"going" + 0.007*"fucking" + 0.007*"welfare" + 0.007*"theyd" + 0.006*"oj" + 0.005*"okayyyyy" + 0.005*"didnt" + 0.005*"god" + 0.005*"meal" + 0.005*"big"'),
 (3,
  '0.012*"ugh" + 0.008*"oj" + 0.007*"wanking" + 0.006*"whistles" + 0.005*"wave" + 0.005*"guy" + 0.005*"shirt" + 0.004*"guys" + 0.004*"woudnt" + 0.004*"majorca"'),
 (4,
  '0.008*"mammals" + 0.008*"hey" + 0.006*"whatll" + 0.006*"texting" + 0.005*"guy" + 0.005*"oj" + 0.005*"funny" + 0.005*"shirt" + 0.005*"god" + 0.004*"il"'),
 (5,
  '0.000*"fucking" + 0.000*"theyd" + 0.000*"going" + 0.000*"guy" + 0.000*"god" + 0.000*"bit" + 0.000*"welfare" + 0.000*"mammals" + 0.000*

# Attempt 2 - Nouns Only

In [84]:
# Helper that reduces a piece of text to just its nouns
def nouns(text):
    '''Return the nouns found in the given text as one space-separated string.

    The text is split with word_tokenize and tagged with pos_tag; a token is
    kept when its part-of-speech tag begins with 'NN'.
    '''
    kept = []
    for token, tag in pos_tag(word_tokenize(text)):
        if tag.startswith('NN'):
            kept.append(token)
    return ' '.join(kept)

In [90]:
# Run every transcript through nouns() so that only the nouns remain,
# keeping the result as a single-column DataFrame
data_nouns = raw_data['sketch_text'].apply(nouns).to_frame()
data_nouns

Unnamed: 0,sketch_text
Bo Burnham - Words Words Words,thank i hey hey ho thats hitler power show bit...
Bo Burnham - What,macdonald farm e i o farm pig e i i snort macd...
Bo Burnham - Make Happy,thing mother person life i mother person uh uh...
John Mulaney - New in Town,town john town hes mustard shirt hes papers he...
John Mulaney - Comeback Kid,charm wit snl writer john mulaney marriage bee...
John Mulaney - Kid Gorgeous,welcome city music time questions walk hi im j...
Ricky Gervais - Politics,houses parliament home politics politics polit...
Ricky Gervais - Science,voice laboratory sort frankensteins doctors bo...
Ricky Gervais - Humanity,hello thank fuck thank im gon youre weve money...
Kevin Bridges - Story So Far,im crossroads lines moon time walking devil i ...


In [123]:
# Vectorize the nouns-only transcripts. A single fit_transform both learns the
# vocabulary and produces the document-term counts (the original fitted twice).
cvn = CountVectorizer(stop_words = my_stop_words)
data_dtm_cvn = cvn.fit_transform(data_nouns.sketch_text)
data_dict_cvn = cvn  # the original bound the fitted vectorizer to this name; kept for later cells

# Column index --> term, as gensim expects for id2word
id2wordn = {col: term for term, col in cvn.vocabulary_.items()}

# Dense document-term frame. The original read from `data_cvn`, which is never
# defined (NameError on a fresh kernel); the counts live in `data_dtm_cvn`.
# Column labels are taken from the vocabulary in column order, which avoids
# depending on get_feature_names(), removed in recent scikit-learn releases.
data_dtmn = pd.DataFrame(
    data_dtm_cvn.toarray(),
    columns=[id2wordn[i] for i in range(len(id2wordn))],
    index=data_nouns.index,
)

# Transpose to term-document orientation (terms on rows) for the gensim conversion
dtmn = data_dtmn.transpose()

In [128]:
# Nouns-only term-document frame --> CSR sparse matrix --> gensim corpus.
# Terms are on the rows, so each column is a document (gensim's default orientation).
sparse_counts = scipy.sparse.csr_matrix(dtmn)
corpusn = matutils.Sparse2Corpus(sparse_counts, documents_columns=True)

In [129]:
# Nouns-only LDA with 2 topics; random_state is pinned so the topics are reproducible on re-run.
ldan = models.LdaModel(
    corpus=corpusn,
    id2word=id2wordn,
    num_topics=2,
    passes=20,
    random_state=42,
)
ldan.print_topics()

[(0,
  '0.013*"guy" + 0.009*"man" + 0.007*"bit" + 0.006*"way" + 0.006*"day" + 0.005*"stuff" + 0.005*"life" + 0.005*"god" + 0.005*"guys" + 0.005*"kind"'),
 (1,
  '0.007*"day" + 0.007*"years" + 0.005*"man" + 0.005*"bit" + 0.005*"things" + 0.005*"way" + 0.005*"night" + 0.005*"guy" + 0.004*"year" + 0.004*"home"')]

In [130]:
# Nouns-only LDA with 4 topics; random_state is pinned so the topics are reproducible on re-run.
ldan = models.LdaModel(
    corpus=corpusn,
    id2word=id2wordn,
    num_topics=4,
    passes=20,
    random_state=42,
)
ldan.print_topics()

[(0,
  '0.013*"guy" + 0.007*"bit" + 0.006*"guys" + 0.005*"man" + 0.005*"day" + 0.005*"way" + 0.005*"club" + 0.005*"song" + 0.005*"life" + 0.004*"da"'),
 (1,
  '0.013*"guy" + 0.011*"man" + 0.007*"day" + 0.007*"bit" + 0.006*"hey" + 0.006*"night" + 0.006*"way" + 0.005*"dad" + 0.005*"kind" + 0.005*"years"'),
 (2,
  '0.009*"years" + 0.007*"day" + 0.007*"way" + 0.007*"things" + 0.007*"bit" + 0.006*"guy" + 0.006*"theyre" + 0.006*"man" + 0.006*"okay" + 0.006*"god"'),
 (3,
  '0.011*"stuff" + 0.011*"bo" + 0.009*"repeat" + 0.008*"eye" + 0.008*"contact" + 0.006*"man" + 0.006*"brain" + 0.005*"story" + 0.005*"cos" + 0.004*"comedy"')]

In [131]:
# Nouns-only LDA with 6 topics; random_state is pinned so the topics are reproducible on re-run.
ldan = models.LdaModel(
    corpus=corpusn,
    id2word=id2wordn,
    num_topics=6,
    passes=20,
    random_state=42,
)
ldan.print_topics()

[(0,
  '0.015*"guy" + 0.007*"man" + 0.007*"day" + 0.007*"kind" + 0.006*"years" + 0.006*"lot" + 0.005*"things" + 0.005*"bit" + 0.005*"school" + 0.005*"night"'),
 (1,
  '0.014*"sort" + 0.011*"okay" + 0.010*"bit" + 0.009*"god" + 0.008*"theyre" + 0.006*"shes" + 0.005*"fuck" + 0.005*"way" + 0.005*"fucking" + 0.004*"man"'),
 (2,
  '0.015*"guy" + 0.009*"stuff" + 0.009*"bit" + 0.008*"guys" + 0.007*"man" + 0.007*"club" + 0.007*"bo" + 0.006*"da" + 0.006*"repeat" + 0.005*"eye"'),
 (3,
  '0.011*"guy" + 0.010*"way" + 0.009*"lets" + 0.009*"man" + 0.008*"hey" + 0.008*"stuff" + 0.007*"kind" + 0.007*"person" + 0.007*"god" + 0.007*"day"'),
 (4,
  '0.012*"man" + 0.010*"guy" + 0.008*"day" + 0.007*"way" + 0.006*"hey" + 0.006*"whats" + 0.006*"life" + 0.006*"wife" + 0.005*"years" + 0.005*"things"'),
 (5,
  '0.010*"bit" + 0.007*"years" + 0.007*"mate" + 0.006*"guy" + 0.006*"day" + 0.006*"night" + 0.005*"man" + 0.005*"year" + 0.005*"home" + 0.005*"joke"')]

Neither of these attempts is currently giving coherent topics. There are two main issues:
- The topics are picking up words that are rarely mentioned, such as 'yeahhhhhhhhhhhhh'
- The majority of the words in each topic are still too generic

To solve these problems, I will:
- Remove any words from my corpus that appear only once, as they contribute very little to the topics but would take a lot of time to clean individually
- Possibly add more words to my stop-word list so that the topics have more distinguishable keywords