In [20]:

#####################################            Topic Modelling on all text             ######################################

# import sys
# !{sys.executable} -m pip install gensim

# import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

## Importing libraries
import pandas as pd
import pickle
from gensim import matutils, models
import scipy.sparse
from nltk import word_tokenize, pos_tag
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer


## Loading cleaned data in data-frame form from a pickle file
data = pd.read_pickle('data_in_dtm_cleaner.pkl')


## Putting the document-term matrix into a new gensim format; from df --> sparse matrix --> gensim corpus
dtm = data.transpose()
sparse_counts = scipy.sparse.csr_matrix(dtm)
corpus = matutils.Sparse2Corpus(sparse_counts)

## Creating dictionary of all the terms and finding their in the document-term matrix
cv = pickle.load(open("cv_with_new_stop-words.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())


# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes
## LDA for num_topics = 2
# lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
# lda.print_topics()

# ##  LDA for num_topics = 3
# lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
# lda.print_topics()

# ##  LDA for num_topics = 4
# lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
# lda.print_topics()

[(0,
  '0.007*"hes" + 0.006*"dad" + 0.005*"mom" + 0.005*"hey" + 0.004*"yeah" + 0.004*"years" + 0.004*"look" + 0.004*"love" + 0.004*"shes" + 0.004*"tell"'),
 (1,
  '0.013*"fucking" + 0.006*"yeah" + 0.005*"ive" + 0.005*"good" + 0.005*"cause" + 0.005*"love" + 0.005*"fuck" + 0.005*"better" + 0.004*"gonna" + 0.004*"didnt"'),
 (2,
  '0.008*"gonna" + 0.007*"uh" + 0.006*"wife" + 0.005*"yeah" + 0.005*"thing" + 0.005*"baby" + 0.005*"hes" + 0.005*"come" + 0.005*"look" + 0.005*"ive"'),
 (3,
  '0.010*"yeah" + 0.010*"shit" + 0.005*"black" + 0.005*"fuck" + 0.004*"come" + 0.004*"white" + 0.004*"look" + 0.004*"guys" + 0.004*"gonna" + 0.004*"world"')]

In [31]:

#####################################            Topic Modelling on nouns only            ######################################

## Loading cleaned data in form of raw transcripts
data_clean = pd.read_pickle('data_clean.pkl')


## Function to tokenize the text and pull out only the nouns
def nouns(text):
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

## Applying the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(data_clean.Transcript.apply(nouns))


## Re-adding the additional stop words since the document-term matrix is being recreated
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

## Recreating the document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.Transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index


## Creating the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

## Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())


# ## Applying LDA model with 2 topics
# ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
# ldan.print_topics()
# 
# ## Applying LDA model with 3 topics
# ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
# ldan.print_topics()
# 
# ## Applying LDA model with 4 topics
# ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
# ldan.print_topics()

[(0,
  '0.009*"shes" + 0.008*"dad" + 0.007*"wife" + 0.007*"man" + 0.006*"hes" + 0.006*"home" + 0.006*"house" + 0.006*"shit" + 0.006*"guy" + 0.006*"life"'),
 (1,
  '0.009*"hes" + 0.008*"thing" + 0.008*"wife" + 0.008*"man" + 0.008*"years" + 0.007*"way" + 0.007*"cause" + 0.006*"guy" + 0.006*"life" + 0.006*"lot"'),
 (2,
  '0.011*"man" + 0.009*"shit" + 0.008*"years" + 0.008*"day" + 0.007*"kids" + 0.007*"thing" + 0.007*"fuck" + 0.007*"baby" + 0.006*"way" + 0.005*"phone"')]

In [36]:

#############################             Topic Modelling on adjectives and nouns only            ##############################

## Function to tokenize the text and pull out only the adjectives and nouns
def nouns_adj(text):
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

## Applying the nouns_adj function to the transcripts to filter only on adjectives and nouns
data_nouns_adj = pd.DataFrame(data_clean.Transcript.apply(nouns_adj))


## Creating a new document-term matrix using only adjectives and nouns, also removing common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.Transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index

## Creating the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

## Creating the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

# ## Applying LDA model with 2 topics
# ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
# ldana.print_topics()
# 
# ## Applying LDA model with 3 topics
# ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
# ldana.print_topics()
# 
# ## Applying LDA model with 4 topics
# ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
# ldana.print_topics()

[(0,
  '0.007*"women" + 0.007*"sex" + 0.005*"jokes" + 0.005*"jen" + 0.005*"bit" + 0.004*"couch" + 0.004*"tension" + 0.004*"opinions" + 0.003*"moment" + 0.003*"joke"'),
 (1,
  '0.010*"dad" + 0.005*"indian" + 0.005*"hasan" + 0.004*"wife" + 0.004*"horse" + 0.003*"brown" + 0.003*"kid" + 0.003*"college" + 0.003*"india" + 0.003*"beef"'),
 (2,
  '0.017*"wife" + 0.008*"uh" + 0.006*"black" + 0.005*"dad" + 0.005*"boy" + 0.005*"trump" + 0.004*"car" + 0.004*"slow" + 0.004*"father" + 0.004*"bar"'),
 (3,
  '0.009*"uh" + 0.005*"snoop" + 0.005*"son" + 0.005*"stuff" + 0.003*"frankie" + 0.003*"car" + 0.003*"person" + 0.003*"special" + 0.003*"eyes" + 0.003*"pussy"')]

In [54]:

#######################################             Final Topic Modelling            ##########################################

## (Changing the num_topics and passes parameters can yield better results)
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=70)
ldana.print_topics()

# Topic 0: (dad, India, movie)
# Topic 1: (wife, family, car)
# Topic 2: (women, sex, jokes)
# Topic 3: (black, trump)

[(0,
  '0.011*"dad" + 0.006*"hasan" + 0.006*"indian" + 0.004*"brown" + 0.004*"beef" + 0.004*"india" + 0.003*"country" + 0.003*"dream" + 0.003*"movie" + 0.003*"bike"'),
 (1,
  '0.014*"wife" + 0.007*"dad" + 0.005*"son" + 0.005*"car" + 0.005*"uh" + 0.004*"snoop" + 0.004*"boy" + 0.004*"kid" + 0.003*"bar" + 0.003*"slow"'),
 (2,
  '0.007*"women" + 0.006*"sex" + 0.005*"jokes" + 0.004*"jen" + 0.004*"bit" + 0.004*"couch" + 0.003*"opinions" + 0.003*"tension" + 0.003*"joke" + 0.003*"moment"'),
 (3,
  '0.017*"uh" + 0.010*"black" + 0.006*"stuff" + 0.004*"eyes" + 0.004*"trump" + 0.004*"vicks" + 0.004*"dan" + 0.004*"grandma" + 0.003*"person" + 0.003*"bit"')]

In [55]:

###############################             Final Topic allocation to the comedians            #################################

## Checking which transcripts cover which topics
corpus_transformed = ldana[corpusna]
list(zip([a for [(a,b)] in corpus_transformed], data_dtmna.index))

# Topic 0: (dad, India, movie)   [Hasan Minhaj, Vir Das]
# Topic 1: (wife, family, car)   [Adam Sandler, Gabriel Iglesias, John Mulaney, Sebastian Maniscalco, Seth Meyers]
# Topic 2: (women, sex, jokes)   [Ali Wong, Daniel Sloss, Hannah Gadsby, Mike Birbiglia]
# Topic 3: (black, trump)        [Aziz Ansari, Wanda Sykes]

[(1, 'Adam Sandler'),
 (2, 'Ali Wong'),
 (3, 'Aziz Ansari'),
 (2, 'Daniel Sloss'),
 (1, 'Gabriel “Fluffy” Iglesias'),
 (2, 'Hannah Gadsby'),
 (0, 'Hasan Minhaj'),
 (1, 'John Mulaney'),
 (2, 'Mike Birbiglia'),
 (1, 'Sebastian Maniscalco'),
 (1, 'Seth Meyers'),
 (0, 'Vir Das'),
 (3, 'Wanda Sykes')]