In [73]:
# Import Libraries.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation

In [2]:
# Define Functions.
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [3]:
# Import Data.
filename = 'wiki_movie_plots_deduped.csv'
data = pd.read_csv(filename, sep=',',header=0,index_col=False)

In [60]:
# Parameters.
n_features = 2000 # amount of distinct words / vocabulary
n_components = 10 # amount of topics
n_top_words = 10 # highest frequency words in topics

In [67]:
# Term Frequency-Inverse Document Frequency (Matrix Calculation).
samples = data.Plot # text samples

# Convert a collection of raw documents to a matrix of TF-IDF features (samples x features).
# tf-idf: relative frequency of a word in a document weighted by the discriminancy of a word between document classes
tfidf_vectorizer = TfidfVectorizer(max_df=0.30, min_df=2,max_features=n_features,stop_words='english')

# Convert a collection of text documents to a matrix of token counts.
#tf_vectorizer = CountVectorizer(max_df=0.95, min_df=4,max_features=n_features,stop_words='english')

# Fit-Transform.
tfidf = tfidf_vectorizer.fit_transform(samples) # samples x features 
# tf = tf_vectorizer.fit_transform(samples) # samples x features 

In [69]:
# Non-negative Matrix Factorization.
nmf =  NMF(n_components=n_components, random_state=1,beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.4,l1_ratio=.5)

# Fit NMF.
nmf.fit(tfidf)

# Print Top Words for Topics.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topic #0: years way tells young finds soon home night away takes
Topic #1: son father family young years story wife mother daughter village
Topic #2: work year york new job works home relationship old wants
Topic #3: film story world young set people movie new years plot
Topic #4: killed kill escape kills wounded men help killing using officer
Topic #5: police murder working crime gang money man officer killed story
Topic #6: war town small men new world joe young american man
Topic #7: wife woman young man husband wealthy son john jack murder
Topic #8: school year team friends student way high win teacher students
Topic #9: wants turns wins king meets takes win wrong falls wedding



In [74]:
# Singular Value Decomposition.
svd = TruncatedSVD(n_components=n_components)

# Fit PCA.
svd.fit(tfidf)

# Print Top Words for Topics.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(svd, tfidf_feature_names, n_top_words)

Topic #0: father family man mother film police son house wife home
Topic #1: family father village son mother marriage daughter story married raja
Topic #2: tom jerry cat joe mary house mother home family father
Topic #3: film tom story village war jerry king world japanese british
Topic #4: film school new jack joe relationship mary young year family
Topic #5: film police story gang murder money inspector crime case movie
Topic #6: joe village harry war villagers family town max jimmy son
Topic #7: jack joe wife king film mary son family husband man
Topic #8: jack raja joe school village team friends students college group
Topic #9: family father war son brother police officer mother japanese gang



In [68]:
# Latent Dirichlet Allocation.
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,learning_method='online',learning_offset=50,random_state=1)

# Fit LDA.
lda.fit(tfidf)

# Print Top Words for Topics.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(lda, tfidf_feature_names, n_top_words)

Topic #0: charlie hong kong ramesh chan tiger sandy wong emotional following
Topic #1: hari julie annie lily ellen andrew mafia prasad jennifer sandy
Topic #2: father family son man mother film police story daughter wife
Topic #3: linda christine diane craig bugs fbi karl wakes frank betty
Topic #4: shankar film devi ashok karthik surya jai prem mafia pooja
Topic #5: raju anna chris vicky kate wong jesse duke prem falls
Topic #6: jane lisa kitty ken jonathan tribe lieutenant ho tracks disappears
Topic #7: raja arjun ajay jenny mark khan rick emma wolf carl
Topic #8: maggie jeff luke marcus harold dorothy witch mafia harry karan
Topic #9: prakash arun madhu gopal jo amy laura bugs josh prem

