## Preparing the data

In [2]:
import pandas as pd
df = pd.read_csv('../Common/rly_final_movies.tsv', sep='\t')

In [3]:
df["keywords"] = df["keywords"].fillna('')
df["overview"] = df["overview"].fillna('')
df["synopsis"] = df["synopsis"].fillna('')
df["text"] = df["overview"] + " " + df["synopsis"] + " " + df["keywords"]

In [4]:
documents = df["text"].to_list()

In [5]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    text = re.sub(r'[^\w\s]', '', text)
    words = [word for word in text.lower().split() if word not in stop_words]
    #words = [stemmer.stem(word) for word in words]
    return words

texts = [preprocess(doc) for doc in documents]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Using the LDA model

In [6]:
from gensim import corpora
from gensim.models import LdaModel

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = LdaModel(corpus=corpus, num_topics=30, id2word=dictionary, passes=10)

In [9]:
import pandas as pd

topics = []
for idx, topic in lda.print_topics(-1):
    words = [word.split("*")[1].strip().strip('"') for word in topic.split(" + ")]
    topics.append([idx, ", ".join(words[:5])])

df = pd.DataFrame(topics, columns=["Topic", "Top Keywords"])
print(df)


    Topic                                      Top Keywords
0       0                    story, based, team, world, new
1       1                     road, trip, one, embark, find
2       2              hollywood, actor, frank, oil, beings
3       3                    town, small, rock, band, jesus
4       4               alien, must, earth, fight, invasion
5       5               france, paris, century, one, dragon
6       6                          show, joe, cat, drug, tv
7       7                  drug, police, bank, murder, gang
8       8              life, wife, woman, husband, marriage
9       9               christmas, dog, holiday, santa, eve
10     10          agent, cia, secret, agency, intelligence
11     11              school, high, college, life, friends
12     12               angeles, los, game, two, california
13     13                    war, world, mission, space, us
14     14      africa, martial, australia, australian, arts
15     15               based, witch, no

In [None]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
import os
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(LDAvis_prepared, 'lda_visualization.html')

In [16]:
topic_distributions = [lda.get_document_topics(bow, minimum_probability=0) for bow in corpus]
pdf = pd.DataFrame([[prob for _, prob in doc] for doc in topic_distributions], columns=[f"Topic {i}" for i in range(30)])

pdf['tconst'] = df['tconst'].values  # Add tconst column
pdf = pdf[['tconst'] + [f"Topic {i}" for i in range(30)]]  # Reorder columns

print(pdf.head())

      tconst   Topic 0   Topic 1   Topic 2   Topic 3   Topic 4   Topic 5  \
0  tt0156812  0.000654  0.375615  0.000654  0.119675  0.000654  0.000654   
1  tt0195945  0.000607  0.000607  0.224562  0.000607  0.000607  0.000607   
2  tt0134983  0.000539  0.218176  0.000539  0.000539  0.000539  0.000539   
3  tt0186975  0.035840  0.000375  0.185359  0.609671  0.000375  0.000375   
4  tt0195234  0.000557  0.000557  0.000557  0.239240  0.022716  0.000557   

    Topic 6   Topic 7   Topic 8  ...  Topic 20  Topic 21  Topic 22  Topic 23  \
0  0.000654  0.000654  0.000654  ...  0.000654  0.000654  0.000654  0.000654   
1  0.000607  0.000607  0.000607  ...  0.000607  0.000607  0.000607  0.000607   
2  0.000539  0.000539  0.000539  ...  0.000539  0.000539  0.000539  0.000539   
3  0.000375  0.000375  0.000375  ...  0.000375  0.000375  0.000375  0.000375   
4  0.106732  0.000557  0.000557  ...  0.000557  0.000557  0.000557  0.000557   

   Topic 24  Topic 25  Topic 26  Topic 27  Topic 28  Topic 29 

In [17]:
pdf.to_csv("lda_topics.tsv", sep='\t', index=False)