In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import re

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet
from nltk.tag import pos_tag
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources *before* anything reads them: on a machine where the
# stopwords corpus has never been downloaded, stopwords.words() raises LookupError.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# English stop-word list consumed by remove_stopwords() further down.
stop_words = stopwords.words('english')

from itertools import groupby

from gensim.utils import tokenize
from gensim.models.phrases import Phrases, Phraser
from gensim.models import TfidfModel
from gensim import corpora
import gensim
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Set Display setting for columns
pd.options.display.max_rows = 300
pd.options.display.max_columns = 300
# None disables column-width truncation. The former -1 sentinel was deprecated in
# pandas 1.0 and is rejected by newer releases.
pd.options.display.max_colwidth = None
pd.options.display.width = 800
pd.options.display.max_seq_items = 2000
pd.options.display.float_format = "{:.2f}".format

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

from pprint import pprint

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liawany/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/liawany/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/liawany/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Load the movie abstracts (the cells below show columns `ids` and `abstract`).
# NOTE(review): absolute path into one user's home directory -- the notebook cannot
# run on another machine without editing this line; consider a configurable data dir.
input_movies = pd.read_csv('/Users/liawany/Documents/git/NLP_competition/input_movies.csv')

In [5]:
input_movies.shape

(6503, 2)

In [79]:
input_movies.tail(3)

Unnamed: 0,ids,abstract
6500,doc_6500,"Venu (Venu Thottempudi) is a receptionist at a lowly beachside hotel in Vizag. He has a beautiful neighbor Nirmala (Ashima Bhalla). Her father's (Mouli) financial status is in bad shape and Venu helps them monetarily and morally. In the process Nirmala falls in love with Venu and Nirmala's family approves their relationship. Later on a well-off guy, Ramji, starts helping Nirmala and family out with luxurious items and better living conditions. Nirmala's father falling for this windfall, forces Nirmala to accept Ramji's marriage proposal by dumping Venu.\r\nLater, Venu meets another girl Radha (Abhirami) who starts having feelings for him and she also monetarily helps him out. Venu is also aware of this. By then, Nirmala gets cheated and dumped by Ramji. Venu inspires her again to achieve her goals and sells off all his property to pay as a donation in a private college for her medical seat. When Nirmala becomes a doctor, she proposes to Venu. The climax of the film is that Venu rejects Nirmala and chooses Radha as his life partner, as Radha's Love had purity, which lacked in Nirmala's Love in the Past."
6501,doc_6501,"After finding out her father and his estate is in danger, Princess Emanuella saves his life by marrying Duke Cathos de Alvia, a grotesque hunchback. She actually is in love with Leonardo, his attractive younger brother. They already had an affair before the marriage, but continue secretly meeting each other. In the end, Cathos finds out about his wife's unfaithfulness and stabs both his wife and brother to death.[1]"
6502,doc_6502,"Pilgrimage to Shiv-sagar is a pious dream for almost the entire country. People from all over India make this pilgrimage. On one such voyage, a steamer was carrying diverse people, people from all walks of life...with various motives and desires, as if they were a cross- section of our whole people. A writer was making the trip to gather materials for his novel. A Sadhuji Maharaj was escorting widows of his ashram to the Holy Confluence. A rich businessman was traveling with his wife and young son. The son had recovered from a mortal disease, and the parents had gratefully promised to offer the Lord of Shiv - sagar their most precious belonging they were carrying with them a very valuable diamond for this offering. There was a dancing girl Roopa, an accomplished beautiful dancer whom her foster mother was taking to the Shiv-sagar to perform her inaugural dance, before entering the career of a dancing girl. When young, she had an attack of polio and her foster mother had vowed that if the girl recovered completely and learned dancing, her inaugural dance would be performed before Lord Shiva of Shiv-sagar. Surrounded by his yes-men a pleasure seeking Zamindar was carrying a cask of wine to throw it in the sea at Shiv-sagar. His wife had died tormented by his infatuation for liquor and by her dead body he had promised an atonement...his decision to throw the chief culprit, the cask of wine, into the sea. To complete the picture of diversity, there were two robbers following the merchant for the diamond. One was the lame informer, who according to their arrangement used to gather information and pass it on to Chhagan, a reckless, faithless, master of his trade. They used to share the booty. But on this particular, mission Chhagan refused, saying that whoever grabs the diamond shall have it...thus they turn into rivals. Faith and superstition, love and hate. 
jealousy and cowardice, compassion and godliness traveled on board the steamer...affecting, reforming and transforming each other with each splash of wave on the drama of this pilgrimage."


In [8]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def tokenize_text(text):
    """Tokenise every document in ``text`` with gensim's ``tokenize`` (lower-cased).

    Returns one list of tokens per input document, in input order.
    """
    token_lists = []
    for document in text:
        # tokenize() is lazy; materialise it so each document becomes a plain list.
        token_lists.append(list(tokenize(document, lower=True)))
    return token_lists

def remove_stopwords(texts, stopword_list=None):
    """Drop stop words from every tokenised document.

    Parameters
    ----------
    texts : iterable of iterables of str
        Tokenised documents.
    stopword_list : iterable of str, optional
        Words to remove. When omitted, the notebook-level ``stop_words`` list is used,
        which is the original behaviour.

    Returns
    -------
    list of list of str
        One filtered token list per input document, in input order.
    """
    # Build the lookup set once: membership tests against the raw list are O(len(list))
    # per token, which adds up over thousands of abstracts.
    blocked = set(stop_words if stopword_list is None else stopword_list)
    return [[word for word in doc if word not in blocked] for doc in texts]

def make_bigrams(texts):
    """Run every tokenised document through the notebook-level ``bigram_mod`` phraser.

    ``bigram_mod`` is looked up at call time, so it must be defined before this is called.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply the bigram phraser, then the trigram phraser, to every tokenised document.

    Both ``bigram_mod`` and ``trigram_mod`` are notebook-level names resolved at call
    time; they must exist before this function is called.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise each tokenised document, keeping only tokens with an allowed POS tag.

    Each document is re-joined with single spaces, passed through the notebook-level
    ``nlp`` pipeline, and reduced to the ``lemma_`` of every token whose ``pos_`` is in
    ``allowed_postags``. POS labels: https://spacy.io/api/annotation
    """
    def _lemmas_for(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas_for(sent) for sent in texts]

# Tokenize text
tokenized_text = tokenize_text(input_movies['abstract'])

# Remove Stop Words
data_words_nostops = remove_stopwords(tokenized_text)

# Learn the phrase models on the full (stop words included) token streams
bigram = gensim.models.Phrases(tokenized_text, min_count=5, threshold=100) # higher threshold fewer phrases.
# The trigram model is learned on the bigram-merged stream. It was missing before, so
# `trigram_mod` (used just below and by make_trigrams) was undefined on a fresh kernel.
trigram = gensim.models.Phrases(bigram[tokenized_text], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[tokenized_text[0]]])

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
# NOTE(review): the 'en' shortcut name is tied to older spaCy releases -- confirm the
# installed version still accepts it (newer ones expect e.g. 'en_core_web_sm').
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

['sergeant', 'nico', 'toscani', 'a', 'native', 'of', 'palermo', 'is', 'a', 'detective', 'in', 'the', 'chicago', 'police', 'department', 's', 'vice', 'squad', 'at', 'an', 'early_age', 'he', 'became', 'interested', 'in', 'martial_arts', 'and', 'moved', 'to', 'japan', 'to', 'study', 'in', 'nico', 'was', 'recruited', 'into', 'the', 'cia', 'by', 'special_agent', 'nelson', 'fox', 'and', 'was', 'involved', 'in', 'covert', 'operations', 'on', 'the', 'vietnamese', 'cambodian', 'border', 'during', 'the', 'vietnam_war', 'there', 'he', 'became', 'disgusted', 'with', 'station', 'chief', 'kurt', 'zagon', 'who', 'tortured', 'prisoners', 'a', 'stand', 'off', 'occurred', 'when', 'nico', 'tried', 'to', 'stop', 'a', 'torture', 'session', 'and', 'he', 'left', 'the', 'cia', 'nico', 'returned', 'to', 'chicago', 'joined', 'the', 'cpd', 'and', 'got', 'married', 'nico', 'and', 'his', 'new', 'partner', 'detective', 'delores', 'jacks', 'jackson', 'are', 'now', 'investigating', 'a', 'drug', 'ring', 'and', 'after'

In [12]:
# Build the token <-> integer-id dictionary from the lemmatised documents
id2word = corpora.Dictionary(data_lemmatized)

# Documents that make up the corpus
texts = data_lemmatized

# Bag-of-words form: one list of (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

# View
#print(corpus[:1])
# Human readable format of corpus (term-frequency)
print([
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[:1]
])

[[('accuse', 1), ('american', 1), ('arrest', 1), ('ask', 2), ('become', 2), ('border', 1), ('break', 1), ('burst', 1), ('bust', 2), ('cambodian', 1), ('capture', 1), ('central', 1), ('chief', 1), ('confront', 1), ('covert', 3), ('dealer', 2), ('delore', 1), ('disgusted', 1), ('drug', 1), ('ensue', 1), ('eventually', 1), ('experience', 1), ('explosive', 1), ('family', 1), ('fed', 1), ('federal', 1), ('find', 3), ('get', 1), ('go', 1), ('group', 1), ('gun', 1), ('hold', 1), ('include', 1), ('inform', 1), ('interested', 1), ('interrupt', 1), ('investigate', 2), ('involve', 1), ('jack', 2), ('join', 1), ('kill', 5), ('later', 1), ('leave', 1), ('link', 1), ('man', 3), ('marry', 1), ('meet', 1), ('move', 2), ('new', 1), ('nico', 1), ('occur', 1), ('official', 1), ('operation', 3), ('parish', 1), ('partner', 1), ('planning', 1), ('pressure', 1), ('priest', 5), ('prisoner', 1), ('promise', 1), ('recruit', 1), ('release', 1), ('remain', 1), ('request', 1), ('return', 1), ('reveal', 1), ('ring'

### Gensim LDA model

In [25]:
# Metric classes instantiated by create_call_backs_helper().
from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric
import logging
# NOTE(review): logging.basicConfig() was already called in the first cell; a repeated
# call does nothing once the root logger has handlers, so confirm INFO is really active.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Keys must match the dictionary returned by create_call_backs_helper().
call_backs = ['perplexity', 'coherence', 'topic_diff', 'convergence']
# train_gensim_LDA is defined in the *following* cell, so on a fresh kernel that cell
# has to be executed before this one.
# NOTE(review): call_logger='visdom' presumably needs a reachable visdom server -- confirm.
lda = train_gensim_LDA(corpus, dictionary=id2word, num_topics=5, num_passes=10, multicore=False, 
                                   call_backs=call_backs, call_logger='visdom')

In [26]:
def train_gensim_LDA(corpus, dictionary, num_topics=10, num_passes=10,
                     num_iterations=50, alpha='auto', eta='auto',
                     chunksize=2000, multicore=True, num_cores=2,
                     call_backs=None, call_logger='shell', random_state=None):
    """Train a gensim LDA model, either multi-core or single-core with callbacks.

    Parameters
    ----------
    corpus : iterable
        Bag-of-words corpus passed straight to the gensim model.
    dictionary : mapping
        Passed as ``id2word``.
    num_topics, num_passes, num_iterations, eta, chunksize
        Forwarded as ``num_topics``, ``passes``, ``iterations``, ``eta``, ``chunksize``.
    alpha : str or array-like
        Forwarded as ``alpha``. On the multi-core path ``'auto'`` is replaced by
        ``'symmetric'`` (gensim's LdaMulticore does not implement alpha auto-tuning).
    multicore : bool
        True -> ``LdaMulticore`` with ``workers=num_cores`` (callbacks are ignored);
        False -> ``LdaModel`` with the requested callbacks.
    call_backs : sequence of str, optional
        Names of metrics to attach on the single-core path; each must be a key of the
        dictionary returned by ``create_call_backs_helper``. ``None`` means no callbacks.
    call_logger : str
        Logger name handed to ``create_call_backs_helper``.
    random_state : int or None
        Forwarded as ``random_state`` so that runs can be made reproducible.

    Returns
    -------
    The trained gensim model.

    Raises
    ------
    KeyError
        If ``call_backs`` contains a name that is not an available metric.
    """
    # Arguments shared by both trainers -- kept in one place so the two paths cannot drift.
    shared_kwargs = dict(corpus=corpus,
                         id2word=dictionary,
                         num_topics=num_topics,
                         passes=num_passes,
                         iterations=num_iterations,
                         eta=eta,
                         chunksize=chunksize,
                         random_state=random_state)

    if multicore:
        # isinstance guard: comparing an array-valued alpha with == would be ambiguous.
        if isinstance(alpha, str) and alpha == 'auto':
            alpha = 'symmetric'
        return gensim.models.ldamulticore.LdaMulticore(alpha=alpha,
                                                       workers=num_cores,
                                                       **shared_kwargs)

    requested = list(call_backs) if call_backs is not None else []
    mets = create_call_backs_helper(corpus, call_logger)
    unknown = [name for name in requested if name not in mets]
    if unknown:
        raise KeyError("unknown callback name(s) %r; available: %r" % (unknown, sorted(mets)))
    c_backs = [mets[name] for name in requested]
    return gensim.models.ldamodel.LdaModel(alpha=alpha,
                                           callbacks=c_backs,
                                           **shared_kwargs)
    

def create_call_backs_helper(corpus, logger):
    '''
    Build one gensim metric callback of each kind and return them keyed by short name.

    The perplexity and coherence metrics are bound to ``corpus``; every metric is given
    ``logger``. Keys: 'perplexity', 'coherence', 'topic_diff', 'convergence'.
    '''
    kl = "kullback_leibler"
    return {
        'perplexity': PerplexityMetric(corpus=corpus, logger=logger, title="Perplexity"),
        'coherence': CoherenceMetric(corpus=corpus, coherence="u_mass", logger=logger,
                                     title="Coherence (u_mass)"),
        'topic_diff': DiffMetric(distance=kl, logger=logger,
                                 title="Topic Difference (kullback_leibler)"),
        'convergence': ConvergenceMetric(distance=kl, logger=logger,
                                         title="Convergence (kullback_leibler)"),
    }

In [92]:
# Stand-alone single-core LDA run with a fixed seed (random_state=100).
# NOTE(review): the result is bound to `lda_model`, but the evaluation cells below read
# `lda` (the model returned by train_gensim_LDA) -- confirm which model is intended.
# per_word_topics=True is set here; format_topics_sentences() is only ever called with
# the Mallet model in this notebook.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           iterations=50,
                                           alpha='auto',
                                           per_word_topics=True)

In [27]:
pprint(lda.print_topics())
# doc_lda = lda_model[corpus]

In [14]:
# Compute Perplexity
print('\nPerplexity: ', lda.log_perplexity(corpus))  
# NOTE(review): gensim documents log_perplexity() as returning the per-word likelihood
# bound, not the perplexity itself (perplexity = 2 ** (-bound)). The negative number
# printed here is therefore a bound, where a higher (closer to 0) value is better.

# Compute Coherence Score. higher the better
# No `coherence=` argument is passed, so CoherenceModel's default measure is used.
coherence_model_lda = CoherenceModel(model=lda, texts=data_lemmatized, dictionary=id2word)#, coherence='c_v'
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.906688788242887

Coherence Score:  0.3197923522141132


In [None]:
# Action, comedy, drama, romance, and musical

In [28]:
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda, corpus, id2word)
# vis

### Mallet model

In [18]:
mallet_path = '/Users/liawany/Documents/git/mallet-2.0.8/bin/mallet'

In [30]:
# Train a 5-topic LDA model through the external Mallet binary at `mallet_path`,
# with a fixed seed (random_seed=123).
# NOTE(review): gensim.models.wrappers is not shipped with every gensim release --
# confirm the installed version still provides it.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=5, id2word=id2word, random_seed=123)

# Convert the Mallet wrapper into a gensim LdaModel; the pyLDAvis cell below uses `model`.
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)

In [51]:
pprint(ldamallet.show_topics(formatted=False))

[(0,
  [('leave', 0.020624623996233424),
   ('find', 0.015425963223562031),
   ('give', 0.01190133661165023),
   ('return', 0.011136250686615574),
   ('back', 0.009939577829510084),
   ('night', 0.009272579843582433),
   ('money', 0.009043707985666083),
   ('run', 0.008867149695273469),
   ('job', 0.008677513012999922),
   ('call', 0.008507493918547776)]),
 (1,
  [('kill', 0.009509670170933312),
   ('man', 0.009061605542626328),
   ('order', 0.007376347537650804),
   ('force', 0.0073495974105877),
   ('escape', 0.006680844234010112),
   ('discover', 0.0063932803680817486),
   ('return', 0.006246154669234679),
   ('team', 0.006172591819811144),
   ('attack', 0.00606559131155873),
   ('fight', 0.005323275285557607)]),
 (2,
  [('love', 0.037454612339993594),
   ('family', 0.023891110490425162),
   ('fall', 0.015634473503143925),
   ('mother', 0.015586786655857648),
   ('marry', 0.015409664080222901),
   ('friend', 0.014524051202049172),
   ('meet', 0.013692937577916902),
   ('life', 0.013

In [32]:
# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)


Coherence Score:  0.37565674127449195


In [52]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, id2word, sort_topics=False)
# vis = pyLDAvis.prepared_data_to_html(vis)
# with open("LDA_output.html", "w") as file:
#     file.write(vis)
vis
# Action, comedy, drama, romance, and musical


### Find the most representative document for each topic

In [59]:
# Group top 5 sentences under each topic
# NOTE: `df_dominant_topic` is built in the "Finding the dominant topic in each movie"
# section further down; on a fresh kernel that section has to be run before this cell.
sent_topics_outdf_grpd = df_dominant_topic.groupby('Dominant_Topic')

# For every topic, take the five documents with the highest contribution. The slices are
# collected first and concatenated once, instead of growing a DataFrame inside the loop
# (which copies all accumulated rows on every iteration).
top_docs_per_topic = [
    topic_rows.sort_values('Topic_Perc_Contrib', ascending=False).head(5)
    for _, topic_rows in sent_topics_outdf_grpd
]

# pd.concat() rejects an empty list, so fall back to an empty frame when there are no groups.
if top_docs_per_topic:
    # reset_index(drop=True) replaces the per-document index with 0..n-1
    sent_topics_sorteddf_mallet = pd.concat(top_docs_per_topic, axis=0).reset_index(drop=True)
else:
    sent_topics_sorteddf_mallet = pd.DataFrame()

# Format
#sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text", "abstract"]

# Show
#sent_topics_sorteddf_mallet[sent_topics_sorteddf_mallet['Dominant_Topic'] == 4]

In [68]:
sent_topics_sorteddf_mallet.head(1)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,abstract
0,2897,0.0,0.61,"leave, find, give, return, back, night, money, run, job, call","[find, hit, player, strike, season, still, need, finish, sentinel, call, former, coach, tell, mcginty, rest, team, go, finish, final, game, season, replacement, player, ask, mcginty, coach, sentinel, rest, season, pressure, win, last, game, make, playoff, mcginty, accept, condition, also, give, freedom, sign, player, want, neil, allow, interfere, neil, accept, request, mcginty, build, team, different, varying, player, believe, make, win, team, choose, former, career, go, piece, horrendous, game, live, houseboat, initially, refuse, mcginty, convince, believe, still, player, mean, replacement, player, greet, first, practice, hostilely, strike, player, call, replacement, throw, arrive, late, get, truck, turn, find, new, cheerleader, original, apparently, go, strike, reluctantly, hire, stripper, tryout, go, terribly, bad, practice, drive, surprise, vast, football, knowledge, replacement, game, team, initially, struggle, get, cause, sentinel, fall, early, try, team, back, last, play, panic, see, pende, blitz, call, audible, fall, berate, tell, winner, always, want, ball, game, line, local, bar, several, replacement, lament, loss, several, strike, player, lead, quarterback, taunt, replacement, stand, follow, lead, replacement, arrest, build, bond, process, dance, together, cell, mcginity, meet, hear, happen, tell, first, quarterback, see, long, time, selfless, connection, start, grow, sentinel, next, game, fall, able, come, together, time, win, player, name, meet, run, use, admit, raise, football, share, short, conversation, beer, together, consummate, feeling, share, deep, kiss, sentinel, nearly, lose, next, game, road, win, couple, improbable, play, sentinel, tell, cross, picket, line, point, entire, team, league, defend, champion, sentinel, next, opponent, dalla, cross, well, neil, show, confidence, able, beat, hint, mcginity, could, fire, mcginty, refuse, give, reluctantly, tell, tell, 
teammate, thing, demoralizing, team, toast, teammate, unable, face, happen, leave, stand, plan, date, first, final, crucial, game, deliberately, throw, game, replacement, player, also, smugly, ignore, play, call, mcginty, make, cause, sentinel, fall, hometown, fan, initially, despise, replacement, accept, favorite, way, tell, tv, reporter, team, need, heart, come, back, win, earlier, say, watch, television, return, mcginty, promptly, bench, angrily, tell, never, know, replacement, player, say, live, rest, team, throw, field, find, apologize, give, deep, kiss, front, crowd, cheerleader, tell, replacement, strike, officially, end, next_day, give, player, incentive, give, left, sentinel, rally, back, score, call, game, field, goal, late, game, however, owe, realize, want, throw, game, take, pub, compensation, hint, pull, ball, away, cause, fall, momentum, initially, score, apparent, call, sentinel, penalty, unable, continue, tell, mcginty, want, ball, affirm, mcginty, tell, call, deep, pass, replacement, deaf, tight, end, pass, time, expire, earn, sentinel, playoff, celebrate, mcginty, narrate, replacement, player, leave, field, satisfaction, personal, glory, accomplish, live, athlete, dream, second, chance, watch, replacement]","A fictional pro football league finds themselves hit with a players' strike with the season still needing to be finished. Washington Sentinels[1][2] owner Edward O'Neil calls a former coach of his, Jimmy McGinty, telling McGinty that he and the rest of the teams are going to finish the final four games of the season with replacement players. O'Neil asks McGinty to coach the Sentinels the rest of the season, along with the pressure of winning three of the last four games to make the playoffs. 
McGinty accepts, on the condition that he will also be given the freedom to sign the players he wants with O'Neil not allowed to interfere.\r\nWith O'Neil accepting his requests, McGinty builds his team of different varying players that he believes can make a winning team. As his quarterback, McGinty chooses Shane Falco, a former All-American from Ohio State whose career went to pieces after a horrendous Sugar Bowl game, and now lives in a houseboat near the Sentinels' stadium. Falco initially refuses, but McGinty convinces him, believing that Falco can still be the player he was meant to be. The replacement players are greeted to their first practice hostilely by the striking players, calling the replacements ""scabs"", and throwing eggs at them, and Falco, who arrives late, gets his truck turned over. Head cheerleader Annabelle Ferrell, who has to find new cheerleaders since the originals apparently went on strike as well, reluctantly hires strippers when the other tryouts go terribly bad. After practice, Annabelle drives Falco home and surprises him with her vast football knowledge.\r\nThe replacements' first game is against Detroit, and the team initially struggles to get along, causing the Sentinels to fall behind early. Falco tries to rally the team back, but on the last play, he panics when he sees a pending blitz and calls an audible, which falls short of the winning touchdown. McGinty berates Falco for what he did, telling him that ""winners always want the ball when the game's on the line."" At a local bar, several of the replacements lament over their loss, when several of the striking players, led by their prima donna quarterback Eddie Martel, arrive and taunt the replacements. When Falco stands up to Martel, a brawl follows, leading to the replacements being arrested, but they build a bond in the process, dancing together in their cell before McGinity bails them out. 
Annabelle meets Shane the next day, having heard what happened, and tells him that he's the first quarterback she's seen in a long time be so selfless, and a connection starts to grow with the two of them.\r\nIn the Sentinels' next game against San Diego, they fall behind again but are able to come together once again, and this time win, on a 65-yard field goal by their kicker, a Welsh soccer player named Nigel Gruff. Falco meets Annabelle again, where she runs a bar her father used to own and admits that she was raised with football. After sharing a short conversation and having a beer together, they consummate their feelings for one another, sharing a deep kiss. The Sentinels nearly lose their next game on the road against Phoenix, but win on a couple of improbable plays.\r\nWhen the Sentinels return to DC, O'Neil tells McGinty that Eddie Martel has crossed the picket line, and points out that the entire team of the league's defending champions, and the Sentinels' next opponent, Dallas, have crossed as well. O'Neil shows no confidence in Falco being able to beat Dallas, and hints to McGinity that he could be fired if McGinty refuses to start Martel. McGinty gives in and reluctantly tells Falco, who then tells his teammates the same thing, demoralizing the team. Falco is toasted by his teammates, but unable to face Annabelle after what happened, Falco leaves her stood up for their planned date.\r\nIn the first half of the final crucial game, Martel deliberately throws the game to humiliate the replacement players, and also smugly ignores any play calls McGinty makes, causing the Sentinels fall behind to Dallas 17–0. The hometown fans, who had initially despised the replacements, now boo Martel, having accepted Falco as their favorite. On the way to the locker room for halftime, McGinty tells a TV reporter that the team needs ""heart"" to come back and win, something he had earlier said Falco had. 
Falco, watching this on television, returns to the stadium, and McGinty promptly benches Martel for Falco. Martel angrily tells Falco that he will never be known as anything but a replacement player. Falco says he can live with that and the rest of the team throws Martel out of the stadium. On his way back to the field, Falco finds Annabelle and apologizes to her, giving her another deep kiss in front of the crowd and other cheerleaders.\r\nMcGinty tells the replacements that the strike will officially end the next day, giving the players incentive to give everything they have left. The Sentinels rally back to a 17-14 score, with Gruff being called to kick the game-tying field goal late in the game. However, Gruff spots bookies that he owes money to in the crowd, and realizes that they want him to throw the game or they'll take his pub from him as compensation. He hints this to Falco just before the kick, and Falco pulls the ball away, causing Gruff to fall from the momentum of his kicking motion and break his arm. Falco initially scores the apparent winning touchdown, but it's called back on a Sentinels penalty. With Gruff unable to continue, Falco tells McGinty that he ""wants the ball"", affirming what McGinty had told him before. Falco calls for a deep pass to the replacements' deaf tight end, Brian Murphy, and hits him with the game-winning touchdown pass as time expires, earning the Sentinels a playoff berth. Falco celebrates with Annabelle, while McGinty narrates that the replacement players left the field with nothing but the satisfaction and personal glory of what they've accomplished, which is living the athlete's dream of a ""second chance."" He then watches the replacements dance on the field to the Gloria Gaynor song ""I Will Survive""."


### Finding the dominant topic in each movie

In [81]:
def format_topics_sentences(ldamodel, corpus, texts):
    """Tabulate the dominant topic of every document.

    Parameters
    ----------
    ldamodel
        Topic model supporting ``ldamodel[corpus]`` (one list of ``(topic_id, weight)``
        pairs per document) and ``ldamodel.show_topic(topic_id)`` (``(word, weight)``
        pairs). If the model has a truthy ``per_word_topics`` attribute, each item is
        treated as a tuple whose first element is that list.
    corpus
        Bag-of-words corpus handed to ``ldamodel[...]``.
    texts
        One entry per document; appended as the last column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to 4 decimals),
        ``Topic_Keywords`` (comma-joined topic words) and ``0`` (the texts). A document
        with no topic assignment gets NaN/None in the topic columns, so rows always stay
        aligned with ``texts``.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    unwrap_per_word = getattr(ldamodel, 'per_word_topics', False)
    keywords_by_topic = {}  # topic id -> keyword string, so show_topic() runs once per topic
    records = []

    for row in ldamodel[corpus]:
        if unwrap_per_word:
            row = row[0]
        if len(row) == 0:
            # Keep a placeholder: skipping the row would shift every later document up
            # by one and pair it with the wrong text.
            records.append((float('nan'), float('nan'), None))
            continue

        # Highest-weight topic; on ties max() keeps the first one, exactly like taking
        # element 0 of a stable descending sort.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num))
        records.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Built in a single call: row-by-row DataFrame.append is quadratic and is no longer
    # available in current pandas. Passing `columns=` also makes an empty corpus work.
    sent_topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


# Label every document with its dominant Mallet topic; the lemmatised tokens are passed
# as `texts` and end up as the last column.
df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=data_lemmatized)

# Format
# reset_index() turns the row index into the first column, renamed to Document_No below.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# Column-wise concat pairs rows by index label.
# NOTE(review): assumes input_movies still carries its default 0..n-1 index -- confirm.
df_dominant_topic = pd.concat([df_dominant_topic, input_movies], axis=1)

In [92]:
# df_dominant_topic.tail(3)

In [70]:
# Topic id -> [topic id, genre name, label index]; genres are listed in topic-id order.
my_dic = {
    topic_id: [topic_id, genre, genre_idx]
    for topic_id, (genre, genre_idx) in enumerate(
        [('comedy', 1), ('action', 0), ('romance', 3), ('musical', 4), ('drama', 2)]
    )
}
# One row per topic, indexed by topic id, ready to be merged on 'Dominant_Topic'.
mapping_label = pd.DataFrame.from_dict(
    my_dic,
    orient='index',
    columns=['Dominant_Topic', 'label_name', 'label_idx'],
)

In [85]:
df = df_dominant_topic.merge(mapping_label, on='Dominant_Topic', how='left')[['ids','label_name', 'label_idx']]

In [91]:
df.head(3)

Unnamed: 0,ids,label_name,label_idx
0,doc_0,action,0
1,doc_1,musical,4
2,doc_2,drama,2


In [87]:
df['label_name'].value_counts()

romance    1744
musical    1482
comedy     1118
drama      1101
action     1058
Name: label_name, dtype: int64

In [90]:
df.shape

(6503, 3)

In [89]:
df.to_csv('liawany_vanessa_liao_results.csv', index=False)