# LDA topic modelling
The notebook on LDA topic modelling for IMDB dataset using PySpark is a comprehensive guide that demonstrates how to perform topic modelling using Latent Dirichlet Allocation (LDA) in PySpark.

In [7]:
import findspark
findspark.init()
from pyspark.sql import functions as F
from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext("local[*]")
spark = SparkSession.builder.getOrCreate()

In [94]:
# There are some warrings, so we supress them
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load and preprocess data

In [47]:
import re
import numpy as np
tFile="data\IMDB Dataset.csv.bz2"
df = spark.read.csv(tFile,header=True)
df.show(3)

+--------------------+---------+
|                text|sentiment|
+--------------------+---------+
|One of the other ...| positive|
|A wonderful littl...| positive|
|I thought this wa...| positive|
+--------------------+---------+
only showing top 3 rows



In [48]:
df = df.sample(.2, seed=100)
#df= df.where(F.col("sentiment")=="positive")

In [49]:
df.groupBy("sentiment").count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
| positive| 5067|
| negative| 4993|
+---------+-----+



In [108]:
# Remove html tags from text
df = df.withColumn("text_c", F.regexp_replace(F.col("text"), r'<[^>]+>', ""));
# Remove non-letters
df = df.withColumn("text_c", F.regexp_replace("text_c", r"[\.\!\,\-\']", " "))
# Remove non-letters
df = df.withColumn("text_c", F.regexp_replace("text_c", r"[^a-zA-Z\ ]", ""))
# Remove words 1, 2 char
df = df.withColumn("text_c", F.regexp_replace("text_c", r"\b\w{1,2}\b", " "))
df.toPandas().head(5)

Unnamed: 0,text,sentiment,text_c
0,A wonderful little production. <br /><br />The...,positive,wonderful little production The filming tec...
1,"Probably my all-time favorite movie, a story o...",positive,Probably all time favorite movie story ...
2,I sure would like to see a resurrection of a u...,positive,sure would like see resurrection d...
3,"This show was an amazing, fresh & innovative i...",negative,This show was amazing fresh innovative ide...
4,The cast played Shakespeare.<br /><br />Shakes...,negative,The cast played Shakespeare Shakespeare lost ...


In [109]:
import spacy
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Load the spaCy model
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Define a function to apply the lemmatizer to a text
def lemmatize_text(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc]
    return " ".join(lemmas)

# Define a UDF to apply the lemmatizer to a column
lemmatize_udf = udf(lemmatize_text, StringType())

# Apply the UDF to a DataFrame column
df = df.withColumn("text_c", lemmatize_udf(df["text_c"]))

# Caching must be used !!!!!!
df = df.cache()
df.toPandas().head(5)

Unnamed: 0,text,sentiment,text_c
0,A wonderful little production. <br /><br />The...,positive,wonderful little production the film tech...
1,"Probably my all-time favorite movie, a story o...",positive,probably all time favorite movie story ...
2,I sure would like to see a resurrection of a u...,positive,sure would like see resurrection ...
3,"This show was an amazing, fresh & innovative i...",negative,this show be amazing fresh innovative i...
4,The cast played Shakespeare.<br /><br />Shakes...,negative,the cast play Shakespeare Shakespeare lose ...


In [64]:
from pyspark.ml.feature import Tokenizer, CountVectorizer,IDF
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml import Pipeline
from pyspark.ml.clustering import LDA

# Data preprocessing

In [110]:
# Text preprocessin pipeline
tokenizer = Tokenizer(inputCol="text_c", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
# Run 1 Use 500 words
countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features_c", vocabSize=500)
# Run 2 Use 1000 words
#countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features_c", vocabSize=1000)
# Run 3 Use Filter most frequent words
#countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features_c", vocabSize=1000,minDF=10, maxDF=5000)

idf = IDF(inputCol=countVectorizer.getOutputCol(), outputCol="features")
pipeline = Pipeline(stages=[tokenizer,remover, countVectorizer,idf])
data_model = pipeline.fit(df)

In [111]:
vocabulary = data_model.stages[2].vocabulary
print(vocabulary[:100])

['', 'movie', 'film', 'one', 'see', 'make', 'like', 'good', 'get', 'well', 'time', 'character', 'watch', 'bad', 'even', 'story', 'really', 'think', 'show', 'scene', 'great', 'look', 'much', 'say', 'know', 'people', 'go', 'also', 'take', 'give', 'first', 'way', 'end', 'love', 'thing', 'play', 'come', 'find', 'man', 'life', 'seem', 'work', 'actor', 'plot', 'two', 'year', 'many', 'want', 'never', 'little', 'try', 'ever', 'act', 'still', 'feel', 'back', 'part', 'use', 'something', 'old', 'real', 'funny', 'lot', 'director', 'didn', 'guy', 'woman', 'performance', 'leave', 'star', 'another', 'big', 'doesn', 'role', 'though', 'young', 'nothing', 'actually', 'start', 'tell', 'point', 'new', 'long', 'day', 'every', 'cast', 'world', 'become', 'girl', 'fact', 'turn', 'comedy', 'pretty', 'horror', 'set', 'action', 'kill', 'however', 'enough', 'around']


In [112]:
dataset = data_model.transform(df)
dataset.toPandas().tail(5)

Unnamed: 0,text,sentiment,text_c,words,filtered,features_c,features
10055,"As someone who loves baseball history, especia...",negative,someone who love baseball history especia...,"[, , , someone, who, love, baseball, history, ...","[, , , someone, love, baseball, history, , , e...","(237.0, 1.0, 2.0, 1.0, 2.0, 3.0, 1.0, 0.0, 0.0...","(0.023557477281972106, 0.4314942162337284, 1.0..."
10056,<br /><br />Headlines warn us of the current c...,positive,headline warn the current campaign dem...,"[headline, warn, , , , , , the, current, campa...","[headline, warn, , , , , , current, campaign, ...","(167.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0...","(0.016599572599533086, 0.0, 0.0, 0.0, 1.244164..."
10057,"Dog Bite Dog isn't going to be for everyone, b...",positive,Dog Bite Dog isn go for everyone but...,"[dog, bite, dog, isn, , , , go, , , , , , for,...","[dog, bite, dog, isn, , , , go, , , , , , ever...","(285.0, 0.0, 2.0, 2.0, 1.0, 0.0, 2.0, 1.0, 0.0...","(0.02832861192135886, 0.0, 1.0698027149082745,..."
10058,This is your typical junk comedy.<br /><br />T...,negative,this your typical junk comedy there be almo...,"[this, , , , your, typical, junk, comedy, ther...","[, , , typical, junk, comedy, almost, , , , la...","(132.0, 4.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0...","(0.013120620258313578, 1.7259768649349136, 0.5..."
10059,I thought this movie did a down right good job...,positive,think this movie do down right good job ...,"[, , , think, this, movie, do, , , , down, rig...","[, , , think, movie, , , , right, good, job, ,...","(204.0, 6.0, 0.0, 1.0, 3.0, 0.0, 2.0, 1.0, 2.0...","(0.02027732221739371, 2.5889652974023702, 0.0,..."


# LDA
LDA stands for Latent Dirichlet Allocation, which is a generative statistical model used for topic modelling. Topic modelling is a technique used to discover latent topics or themes in a collection of documents or texts.

The key idea behind LDA is to consider each document as a bag of words, where the order of the words does not matter. LDA uses a Bayesian approach to infer the topic distribution for each document and the word distribution for each topic.

The LDA algorithm works by randomly assigning each word in a document to a topic and then iteratively updating the topic assignments based on the probabilities of each word belonging to each topic. The algorithm continues to iterate until it converges to a stable state, where the topic assignments are optimal.

In [113]:
# Find two topics
lda = LDA(k=2, maxIter=20)
model = lda.fit(dataset)

In [114]:
# Print the LDA transformation matrix
model.topicsMatrix()

DenseMatrix(500, 2, [42.6187, 1802.1074, 2255.5155, 1309.7856, 1054.2052, 1289.4113, 1158.5253, 1240.7307, ..., 376.7221, 295.0634, 373.723, 414.4773, 190.8515, 351.6969, 322.8845, 432.6327], 0)

In [115]:
# Describe topics
topics = model.describeTopics(5)
print("The topics described by their top-weighted terms:")
topics.toPandas().head(5)

The topics described by their top-weighted terms:


Unnamed: 0,topic,termIndices,termWeights
0,0,"[2, 1, 13, 93, 8]","[0.008222578498306203, 0.006569659759980604, 0..."
1,1,"[2, 1, 18, 33, 12]","[0.00836036856801242, 0.00785771055899993, 0.0..."


In [116]:
# Print most important words per topic
topics = model.describeTopics(15)
for r in topics.select("termIndices").collect():
    rez = []
    for l in r:
        for i in l:
            rez.append(vocabulary[i])
    print(rez[:15])

['film', 'movie', 'bad', 'horror', 'get', 'book', 'story', 'character', 'one', 'woman', 'make', 'man', 'look', 'director', 'good']
['film', 'movie', 'show', 'love', 'watch', 'see', 'like', 'good', 'funny', 'one', 'really', 'time', 'character', 'make', 'think']


# Create LDA model wiht ten topics

In [117]:
# Text preprocessin pipeline
tokenizer = Tokenizer(inputCol="text_c", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
# Run 1: Use all the words
countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features_c", vocabSize=1000)
# Run 2: Discard the very frequent words
# countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features_c", vocabSize=1000,minDF=10, maxDF=3000)

idf = IDF(inputCol=countVectorizer.getOutputCol(), outputCol="features")
pipeline = Pipeline(stages=[tokenizer,remover, countVectorizer,idf])
data_model = pipeline.fit(df)

In [118]:
vocabulary = data_model.stages[2].vocabulary
print(vocabulary[:100])

['', 'movie', 'film', 'one', 'see', 'make', 'like', 'good', 'get', 'well', 'time', 'character', 'watch', 'bad', 'even', 'story', 'really', 'think', 'show', 'scene', 'great', 'look', 'much', 'say', 'know', 'people', 'go', 'also', 'take', 'give', 'first', 'way', 'end', 'love', 'thing', 'play', 'come', 'find', 'man', 'life', 'seem', 'work', 'actor', 'plot', 'two', 'year', 'many', 'want', 'never', 'little', 'try', 'ever', 'act', 'still', 'feel', 'back', 'part', 'use', 'something', 'old', 'real', 'funny', 'lot', 'director', 'didn', 'guy', 'woman', 'performance', 'leave', 'star', 'another', 'big', 'doesn', 'role', 'though', 'young', 'nothing', 'actually', 'start', 'tell', 'point', 'new', 'long', 'day', 'every', 'cast', 'world', 'become', 'girl', 'fact', 'turn', 'comedy', 'pretty', 'horror', 'set', 'action', 'kill', 'however', 'enough', 'minute']


In [119]:
dataset = data_model.transform(df)
dataset.toPandas().head(5)

Unnamed: 0,text,sentiment,text_c,words,filtered,features_c,features
0,A wonderful little production. <br /><br />The...,positive,wonderful little production the film tech...,"[, , , wonderful, little, production, , , the,...","[, , , wonderful, little, production, , , film...","(92.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0,...","(0.009144674725491282, 0.0, 0.5349013574541372..."
1,"Probably my all-time favorite movie, a story o...",positive,probably all time favorite movie story ...,"[probably, , , , all, time, favorite, movie, ,...","[probably, , , , time, favorite, movie, , , , ...","(124.0, 2.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0...","(0.012325431151749118, 0.8629884324674568, 0.0..."
2,I sure would like to see a resurrection of a u...,positive,sure would like see resurrection ...,"[, , , sure, would, like, , , , see, , , , res...","[, , , sure, like, , , , see, , , , resurrecti...","(151.0, 1.0, 0.0, 0.0, 1.0, 0.0, 2.0, 0.0, 1.0...","(0.015009194386404169, 0.4314942162337284, 0.0..."
3,"This show was an amazing, fresh & innovative i...",negative,this show be amazing fresh innovative i...,"[this, show, be, , , , amazing, , , fresh, , ,...","[show, , , , amazing, , , fresh, , , innovativ...","(148.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0...","(0.014710998471442496, 0.0, 0.0, 0.55807637111..."
4,The cast played Shakespeare.<br /><br />Shakes...,negative,the cast play Shakespeare Shakespeare lose ...,"[the, cast, play, shakespeare, shakespeare, lo...","[cast, play, shakespeare, shakespeare, lose, ,...","(99.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,...","(0.009840465193735184, 0.4314942162337284, 0.0..."


In [120]:
# Find two topics
lda = LDA(k=10, maxIter=20)
model = lda.fit(dataset)

In [121]:
# Describe topics
topics = model.describeTopics(5)
print("The topics described by their top-weighted terms:")
topics.toPandas()

The topics described by their top-weighted terms:


Unnamed: 0,topic,termIndices,termWeights
0,0,"[135, 423, 158, 223, 2]","[0.01669425554836518, 0.006652984717492162, 0...."
1,1,"[38, 75, 215, 463, 216]","[0.006959913058347865, 0.006590408892522312, 0..."
2,2,"[430, 351, 1, 942, 384]","[0.009792843428805197, 0.008521535201288306, 0..."
3,3,"[2, 18, 156, 1, 12]","[0.006819183943775015, 0.005001930988926309, 0..."
4,4,"[566, 156, 504, 828, 707]","[0.01846624414614904, 0.012270222027559513, 0...."
5,5,"[115, 2, 665, 20, 33]","[0.007011647844973934, 0.006653196063556585, 0..."
6,6,"[88, 18, 452, 108, 187]","[0.009013184363177103, 0.007708901834386116, 0..."
7,7,"[290, 112, 518, 264, 2]","[0.010307983879474753, 0.00827130795104214, 0...."
8,8,"[2, 413, 850, 203, 809]","[0.008496489269192841, 0.008432247014783758, 0..."
9,9,"[13, 1, 61, 2, 300]","[0.009881758450568306, 0.007700914733336307, 0..."


In [122]:
# Print most important words per topic
topics = model.describeTopics(10)
for r in topics.select("termIndices").collect():
    rez = []
    for l in r:
        for i in l:
            rez.append(vocabulary[i])
    print(rez[:15])

['book', 'novel', 'read', 'house', 'film', 'character', 'movie', 'animation', 'story', 'adaptation']
['man', 'young', 'father', 'cop', 'wife', 'woman', 'town', 'play', 'small', 'family']
['sequel', 'evil', 'movie', 'opera', 'michael', 'production', 'hell', 'horror', 'gore', 'let']
['film', 'show', 'episode', 'movie', 'watch', 'know', 'woman', 'comedy', 'feel', 'character']
['zombie', 'episode', 'season', 'gay', 'party', 'bad', 'list', 'lady', 'hurt', 'annoying']
['series', 'film', 'vampire', 'great', 'love', 'human', 'oscar', 'story', 'jack', 'character']
['girl', 'show', 'dance', 'friend', 'short', 'cartoon', 'brother', 'boy', 'tom', 'dog']
['game', 'music', 'match', 'song', 'film', 'movie', 'musical', 'good', 'western', 'show']
['film', 'documentary', 'south', 'view', 'political', 'life', 'real', 'van', 'show', 'war']
['bad', 'movie', 'funny', 'film', 'flick', 'like', 'even', 'scene', 'see', 'watch']


# Topic classification

In [123]:
# Shows the result
transformed = model.transform(dataset)
transformed.select("text_c","topicDistribution").toPandas().head(5)

Unnamed: 0,text_c,topicDistribution
0,wonderful little production the film tech...,"[0.000795663265817789, 0.0008133383484829792, ..."
1,probably all time favorite movie story ...,"[0.4654967687392227, 0.0012263953106929593, 0...."
2,sure would like see resurrection ...,"[0.0009018104685262996, 0.16456403755126245, 0..."
3,this show be amazing fresh innovative i...,"[0.06097570247306475, 0.0006560164287984262, 0..."
4,the cast play Shakespeare Shakespeare lose ...,"[0.0012813788022620492, 0.0013097957148366583,..."


In [124]:
from pyspark.sql.functions import udf
@udf
def vect_argmax(row):
    row_arr = row.toArray()
    max_pos = np.argmax(row_arr)
    return(int(max_pos))
transformed1 = transformed.withColumn("argmax",vect_argmax(F.col('topicDistribution')))

In [125]:
transformed1.select("text_c","argmax").toPandas().head(5)

Unnamed: 0,text_c,argmax
0,wonderful little production the film tech...,8
1,probably all time favorite movie story ...,0
2,sure would like see resurrection ...,8
3,this show be amazing fresh innovative i...,3
4,the cast play Shakespeare Shakespeare lose ...,9
