In this notebook we'll explore topic modeling to discover broad themes in a collection of movie summaries.

In [59]:
import nltk
import re
import gensim
from gensim import corpora
import operator

nltk.download('stopwords')
from nltk.corpus import stopwords

import numpy as np
import random

random.seed(1)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mingyu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [60]:
def read_stopwords(filename):
    stopwords={}
    with open(filename) as file:
        for line in file:
            stopwords[line.rstrip()]=1
    return stopwords

Since we're running topic modeling on texts with lots of names, we'll add the Jockers list of stopwords (which includes character names) to our stoplist.

In [61]:
stop_words = {k:1 for k in stopwords.words('english')}
stop_words.update(read_stopwords("../data/jockers.stopwords"))
stop_words["'s"]=1
stop_words=list(stop_words.keys())

In [62]:
def filter(word, stopwords):
    
    """ Function to exclude words from a text """
    
    # no stopwords
    if word in stopwords:
        return False
    
    # has to contain at least one letter
    if re.search("[A-Za-z]", word) is not None:
        return True
    
    return False

In [63]:
def read_docs(plotFile, metadataFile, stopwords):
    
    names={}
    box={}
    
    with open(metadataFile, encoding="utf-8") as file:
        for line in file:
            cols=line.rstrip().split("\t")
            idd=cols[0]
            name=cols[2]
            boxoffice=cols[4]
            if len(boxoffice) != 0:
                box[idd]=int(boxoffice)
                names[idd]=name
    
    n=5000
    target_movies={}


    sorted_box = sorted(box.items(), key=operator.itemgetter(1), reverse=True)
    for k, v in sorted_box[:n]:
        target_movies[k]=names[k]
    
    docs=[]
    names=[]
   
    with open(plotFile, encoding="utf-8") as file:
        for line in file:
            cols=line.rstrip().split("\t")
            idd=cols[0]
            text=cols[1]
            
            if idd in target_movies:
                tokens=nltk.word_tokenize(text.lower())
                tokens=[x for x in tokens if filter(x, stopwords)]
                docs.append(tokens)
                name=target_movies[idd]
                names.append(name)
    return docs, names

We'll read in summaries of the 5,000 movies with the highest box office revenues.

In [64]:
metadataFile="../data/movie.metadata.tsv"
plotFile="../data/plot_summaries.txt"
data, doc_names=read_docs(plotFile, metadataFile, stop_words)

In [18]:
len(doc_names)

4778

We will convert the movie summaries into a bag-of-words representation using gensim's [corpora.dictionary](https://radimrehurek.com/gensim/corpora/dictionary.html) methods.

In [11]:
# Create vocab from data; restrict vocab to only the top 10K terms that show up in at least 5 documents 
# and no more than 50% of all documents

dictionary = corpora.Dictionary(data)
dictionary.filter_extremes(no_below=5, no_above=.5, keep_n=10000)

In [19]:
# Replace dataset with numeric ids words in vocab (and exclude all other words)
# bag-of-words format: token ids and token counts
corpus = [dictionary.doc2bow(text) for text in data]

In [20]:
len(corpus)

4778

In [24]:
corpus[2][:20]

[(17, 1),
 (32, 1),
 (36, 1),
 (38, 1),
 (42, 2),
 (50, 1),
 (55, 1),
 (57, 1),
 (59, 1),
 (76, 1),
 (80, 2),
 (86, 1),
 (97, 1),
 (98, 1),
 (99, 1),
 (100, 5),
 (121, 9),
 (124, 3),
 (131, 1),
 (143, 1)]

In [54]:
num_topics=5

Now let's run a topic model on this data using gensim's built-in LDA.

In [55]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=num_topics, 
                                           passes=10,
                                           alpha='auto')

We can get a sense of what the topics are by printing the top 10 words with highest $P(word \mid topic)$ for each topic

In [56]:
for i in range(num_topics):
    print("topic %s:\t%s" % (i, ' '.join([term for term, freq in lda_model.show_topic(i, topn=3)])))

topic 0:	will war men
topic 1:	police car killed
topic 2:	house michael tells
topic 3:	find off through
topic 4:	new will father


In [57]:
for i in range(num_topics):
    print("topic %s:\t%s" % (i, ' '.join([term for term, freq in lda_model.show_topic(i, topn=10)])))

topic 0:	will war men ; this army ' father during all
topic 1:	police car killed been man kill money frank had ;
topic 2:	house michael tells home father night goes finds david room
topic 3:	find off through can escape earth its dr. ; will
topic 4:	new will father love film this tells life time mother


Another way of understanding topics is to print out the documents that have the highest topic representation -- i.e., for a given topic $k$, the documents with highest $P(topic=k | document)$.  How much do the documents listed here align with your understanding of the topics?

In [34]:
topic_model.show_topic(0, topn=10)

[('david', 0.023339266),
 ('jimmy', 0.013298975),
 ('band', 0.013000627),
 ('bobby', 0.012748592),
 ('terry', 0.010856751),
 ('amy', 0.007921143),
 ('show', 0.0073583373),
 ('emma', 0.0071322545),
 ('[', 0.0071055167),
 ('club', 0.00558657)]

In [35]:
topic_model.get_document_topics(corpus[0])

[(1, 0.3034238),
 (7, 0.11577231),
 (8, 0.06235964),
 (12, 0.17282276),
 (13, 0.2732274),
 (17, 0.07075198)]

In [58]:
topic_model=lda_model 

topic_docs=[]
for i in range(num_topics):
    topic_docs.append({})
for doc_id in range(len(corpus)):
    doc_topics=topic_model.get_document_topics(corpus[doc_id])
    for topic_num, topic_prob in doc_topics:
        topic_docs[topic_num][doc_id]=topic_prob

for i in range(num_topics):
    print("%s\n" % ' '.join([term for term, freq in topic_model.show_topic(i, topn=10)]))
    sorted_x = sorted(topic_docs[i].items(), key=operator.itemgetter(1), reverse=True)
    for k, v in sorted_x[:5]:
        print("%s\t%.3f\t%s" % (i,v,doc_names[k]))
    print()
    
    

will war men ; this army ' father during all

0	0.998	The Last Legion
0	0.998	Kingdom of Heaven
0	0.998	The Bridge on the River Kwai
0	0.998	Centurion
0	0.998	Welcome to Dongmakgol

police car killed been man kill money frank had ;

1	0.999	The Taking of Pelham 1 2 3
1	0.999	Exit Wounds
1	0.998	Ransom
1	0.998	U.S. Marshals
1	0.998	Street Kings

house michael tells home father night goes finds david room

2	0.999	Deck the Halls
2	0.999	Psycho II
2	0.999	The Rage: Carrie 2
2	0.999	Shutter
2	0.998	The Ring

find off through can escape earth its dr. ; will

3	0.998	Small Soldiers
3	0.998	9
3	0.998	The Thing
3	0.998	Alien
3	0.998	The Iron Giant

new will father love film this tells life time mother

4	0.999	Nine
4	0.999	Diary of a Wimpy Kid: Dog Days
4	0.999	Bridesmaids
4	0.999	The Help
4	0.999	The Way Home



In [53]:
topic_model=lda_model 

topic_docs=[]
for i in range(num_topics):
    topic_docs.append({})
for doc_id in range(len(corpus)):
    doc_topics=topic_model.get_document_topics(corpus[doc_id])
    for topic_num, topic_prob in doc_topics:
        topic_docs[topic_num][doc_id]=topic_prob

for i in range(num_topics):
    print("%s\n" % ' '.join([term for term, freq in topic_model.show_topic(i, topn=10)]))
    sorted_x = sorted(topic_docs[i].items(), key=operator.itemgetter(1), reverse=True)
    for k, v in sorted_x[:5]:
        print("%s\t%.3f\t%s" % (i,v,doc_names[k]))
    print()
    
    

steve elizabeth linda simon graham buddy vampire richie jonathan sophie

0	0.809	Bats
0	0.625	Zathura
0	0.620	Cirque du Freak: The Vampire's Assistant
0	0.595	Deck the Halls
0	0.472	Hollow Man

michael sarah matt adam kyle beth roger april jesus sara

1	0.629	The Passion of the Christ
1	0.601	He Got Game
1	0.563	Click
1	0.541	Jesus Christ Superstar
1	0.517	A Good Day to Have an Affair

ray chris johnny team alien gordon through nuclear dr. earth

2	0.894	Alien: Resurrection
2	0.781	Beneath the Planet of the Apes
2	0.778	Queen of Blood
2	0.686	Firefox
2	0.668	The Core

school tells new will go does love get day this

3	0.958	A Chorus Line
3	0.955	Booty Call
3	0.949	My Wife Got Married
3	0.934	Easy A
3	0.927	Fresh Horses

money james will father don santa christmas time lee jerry

4	0.837	Rollover
4	0.757	Wall Street: Money Never Sleeps
4	0.735	Two If by Sea
4	0.716	Eye for an Eye
4	0.695	Santa Claus: The Movie

mary peter jane larry brian ash william jean catherine zack

5	0.729	Night a