In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.pipeline import Pipeline




In [2]:
df = pd.read_csv('movie_train.csv', index_col=0)

In [5]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(text):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = nlp(text)
    
#     mytokens = [word for word in mytokens if word.pos_ != "PROPN"]
    
#     mytokens = [ word if word.pos_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [6]:
tfidf = TfidfVectorizer(tokenizer=spacy_tokenizer,sublinear_tf=True, min_df=20, ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.Plot).toarray()
labels = df.Genre
features.shape

(10682, 9211)

In [23]:
from sklearn.decomposition import LatentDirichletAllocation
# Build LDA Model
lda_model = LatentDirichletAllocation(n_topics=9,               # Number of topics
                                      max_iter=40,               # Max learning iterations
                                      random_state=100,          # Random state
                                      batch_size=250,            # n docs in each learning iter
                                      n_jobs = -1,               # Use all available CPUs
                                     )

print(lda_model)  # Model attributes

LatentDirichletAllocation(batch_size=250, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=40, mean_change_tol=0.001,
             n_components=10, n_jobs=-1, n_topics=9, perp_tol=0.1,
             random_state=100, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)


In [24]:
lda_output = lda_model.fit_transform(features)




In [11]:
lda_output

array([[0.01060562, 0.01060562, 0.01060562, ..., 0.01060562, 0.01060562,
        0.01060562],
       [0.00573136, 0.00573136, 0.00573136, ..., 0.00573136, 0.00573136,
        0.00573136],
       [0.00653453, 0.00653453, 0.00653453, ..., 0.00653453, 0.00653453,
        0.00653453],
       ...,
       [0.00465427, 0.00465427, 0.00465427, ..., 0.00465427, 0.00465427,
        0.00465427],
       [0.00594592, 0.00594592, 0.00594592, ..., 0.00594592, 0.00594592,
        0.00594592],
       [0.00665468, 0.00665468, 0.00665468, ..., 0.00665468, 0.00665468,
        0.00665468]])

In [25]:
for index,topic in enumerate(lda_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['rookie', 'film base', 'virtue', 'play lead', '11-year old', 'lead role', '11-year', 'film', 'present', 'minute', 'following', '30', 'length', 'segment', 'average']


THE TOP 15 WORDS FOR TOPIC #1
['film base', 'virtue', 'film', 'play lead', '11-year old', 'lead role', '11-year', 'black', 'blackmail', 'cousin', 'wind', 'surgeon', 'adventure', 'musician', 'cheat']


THE TOP 15 WORDS FOR TOPIC #2
['film base', 'virtue', 'film', 'play lead', '11-year old', 'lead role', '11-year', 'fall', 'rescue', 'gangster', 'outlaw', 'alcoholic', 'instant', 'heiress', 'spiritual']


THE TOP 15 WORDS FOR TOPIC #3
['time', 'meet', 'return', 'film', 'come', 'family', 'try', 'father', 'friend', 'tell', 'life', 'leave', 'kill', 'love', 'man']


THE TOP 15 WORDS FOR TOPIC #4
['break', 'story', 'people', 'graveyard', 'protector', 'dual', 'cattle', 'play', 'sidekick', 'loving', 'pair', 'play role', 'role', '11-year old', '11-year']


THE TOP 15 WORDS FOR TOPIC #5
['fictional', 'he

In [18]:
# Show top n keywords for each topic
def show_topics(vectorizer, lda_model, n_words=15):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords



In [20]:
import numpy as np

In [26]:
topic_keywords = show_topics(tfidf, lda_model)        


In [27]:

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,average,segment,length,30,following,minute,present,film,11-year,lead role,11-year old,play lead,virtue,film base,rookie
Topic 1,cheat,musician,adventure,surgeon,wind,cousin,blackmail,black,11-year,lead role,11-year old,play lead,film,virtue,film base
Topic 2,spiritual,heiress,instant,alcoholic,outlaw,gangster,rescue,fall,11-year,lead role,11-year old,play lead,film,virtue,film base
Topic 3,man,love,kill,leave,life,tell,friend,father,try,family,come,film,return,meet,time
Topic 4,11-year,11-year old,role,play role,pair,loving,sidekick,play,cattle,dual,protector,graveyard,people,story,break
Topic 5,british,backdrop,caravan,controversial,undertake,curtain,fugitive,charm,set,architect,film set,working,iron,height,fictional
Topic 6,baron,element,primary,gunman,corrupt,romance,music,land,film,11-year,lead role,11-year old,play lead,virtue,film base
Topic 7,remake,novel,event lead,ballet,chronicle,inventor,killing,era,noble,princess,spanish,tow,dancer,escort,event
Topic 8,trader,slave,fight,11-year,lead role,11-year old,play lead,film,virtue,film base,rookie,film story,affair married,son play,story follow


In [None]:
import numpy as np

# Create Document - Topic Matrix
# lda_output = lda_model.transform(dtm)

# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_topics)]

# index names
docnames = ["Doc" + str(i) for i in range(len(npr))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'red' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.tail(15).style.applymap(color_green).applymap(make_bold)
df_document_topics