# 0. Libraries and spark session

In [1]:
pip install spark-nlp==3.3.4

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install numpy pandas nltk

Note: you may need to restart the kernel to use updated packages.


In [3]:
spark

In [4]:
from sparknlp.annotator import LemmatizerModel
import pyspark.sql.functions as f
from pyspark.sql import Window
import pyspark.sql.types as t
from pyspark.ml.feature import Tokenizer as pysparkTokenizer, HashingTF, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer, Normalizer, LemmatizerModel, StopWordsCleaner, PerceptronModel, Chunker
from pyspark.ml.clustering import LDA
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# 1. Loading the data

In [5]:
df = spark.read.option("tableName", "Tweets").format("dynamodb").load().select(f.col("text"))\
.union(
    spark.read.option("tableName", "RedditPosts").format("dynamodb").load().select(f.col("submission_id").alias("text"))
)
df.show()
f"Lenght: {df.count()}"

+--------------------+
|                text|
+--------------------+
|RT @votetimscott:...|
|RT @HouseGOP: htt...|
|RT @HouseGOP: Thi...|
|RT @POTUS: When I...|
|RT @POTUS: Here’s...|
|RT @POTUS: My Adm...|
|@JoeBiden and wea...|
|@RandPaul https:/...|
|@RandPaul Also, m...|
|RT @SenWarren: Th...|
|@amyklobuchar SER...|
|@VP I am old enou...|
|RT @VP: There is ...|
|RT @POTUS: Folks,...|
|RT @JoeBiden: As ...|
|RT @JoeBiden: The...|
|@POTUS Just becau...|
|RT @POTUS: Here’s...|
|RT @SenSchumer: W...|
|RT @POTUS: My Adm...|
+--------------------+
only showing top 20 rows



'Lenght: 7379'

Removing special chacaters and changing to lowercase

In [6]:
df = df.withColumn("text", f.lower(f.regexp_replace(f.col("text"), "[^A-Za-z0-9@ ]", "")))
df.show()

+--------------------+
|                text|
+--------------------+
|rt @votetimscott ...|
|rt @housegop http...|
|rt @housegop this...|
|rt @potus when i ...|
|rt @potus heres w...|
|rt @potus my admi...|
|@joebiden and wea...|
|@randpaul httpstc...|
|@randpaul also ma...|
|rt @senwarren the...|
|@amyklobuchar ser...|
|@vp i am old enou...|
|rt @vp there is a...|
|rt @potus folks i...|
|rt @joebiden as w...|
|rt @joebiden the ...|
|@potus just becau...|
|rt @potus heres w...|
|rt @senschumer wi...|
|rt @potus my admi...|
+--------------------+
only showing top 20 rows



# 2. LDA - topics analysis - Pyspark only

In [7]:
tokenizer = pysparkTokenizer(inputCol="text", outputCol="tokens")
stopwords_cleaner = StopWordsRemover(inputCol="tokens", outputCol="no stop words")
nlp_pipeline = Pipeline(
    stages=[tokenizer,
            stopwords_cleaner])
nlp_model = nlp_pipeline.fit(df)
processed_df  = nlp_model.transform(df)
processed_df.show(5)
processed_df.limit(3).toPandas().to_dict("records")

+--------------------+--------------------+--------------------+
|                text|              tokens|       no stop words|
+--------------------+--------------------+--------------------+
|rt @votetimscott ...|[rt, @votetimscot...|[rt, @votetimscot...|
|rt @housegop http...|[rt, @housegop, h...|[rt, @housegop, h...|
|rt @housegop this...|[rt, @housegop, t...|[rt, @housegop, j...|
|rt @potus when i ...|[rt, @potus, when...|[rt, @potus, took...|
|rt @potus heres w...|[rt, @potus, here...|[rt, @potus, here...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



[{'text': 'rt @votetimscott one of the highlights of my year was giving the official @gop response to joe bidens first address to congress i said i',
  'tokens': ['rt',
   '@votetimscott',
   'one',
   'of',
   'the',
   'highlights',
   'of',
   'my',
   'year',
   'was',
   'giving',
   'the',
   'official',
   '@gop',
   'response',
   'to',
   'joe',
   'bidens',
   'first',
   'address',
   'to',
   'congress',
   'i',
   'said',
   'i'],
  'no stop words': ['rt',
   '@votetimscott',
   'one',
   'highlights',
   'year',
   'giving',
   'official',
   '@gop',
   'response',
   'joe',
   'bidens',
   'first',
   'address',
   'congress',
   'said']},
 {'text': 'rt @housegop httpstcosaozqqvrmn',
  'tokens': ['rt', '@housegop', 'httpstcosaozqqvrmn'],
  'no stop words': ['rt', '@housegop', 'httpstcosaozqqvrmn']},
 {'text': 'rt @housegop this is joe bidens america httpstco9yhn34psbr',
  'tokens': ['rt',
   '@housegop',
   'this',
   'is',
   'joe',
   'bidens',
   'america',
   'httpst

In [8]:
cv = CountVectorizer(inputCol="no stop words", outputCol="features", vocabSize=500, minDF=3.0)
cv_model = cv.fit(processed_df)
lda = LDA(k=5, maxIter=100)
model = lda.fit(cv_model.transform(processed_df))
for indices in model.describeTopics(15).select("termIndices").rdd.flatMap(lambda x: x).collect():
    print([cv_model.vocabulary[i] for i in indices], '\n')

['', 'new', '@potus', 'one', 'jobs', 'president', '6', 'million', 'year', 'rt', 'record', 'stand', 'unemployment', 'heres', 'nearly'] 

['rt', '@potus', 'covid19', '', '@randpaul', 'high', 'booster', 'increase', 'shots', 'protection', 'antibody', 'degree', 'severe', 'providing', 'levels'] 

['@potus', '@vp', 'rt', 'get', 'people', '@randpaul', 'covid', 'dont', 'biden', 'joe', 'voting', 'amp', 'know', 'us', '@joebiden'] 

['rt', '@senwarren', 'care', 'americans', 'want', 'law', 'court', '@randpaul', '70', 'supreme', 'ready', 'radical', 'land', 'roevwade', 'ove'] 

['@joebiden', 'rt', 'way', 'best', 'continue', 'protect', 'year', 'remember', 'health', 'head', 'holiday', 'new', 'celebrations', 'kee', '@tedcruz'] 



# 3. LDA - topics analysis - SparkNLP

In [9]:
documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol('document')
tokenizer = Tokenizer().setInputCols(['document']).setOutputCol('tokenized')
normalizer = Normalizer().setInputCols(['tokenized']).setOutputCol('normalized')
lemmatizer = LemmatizerModel.pretrained().setInputCols(['normalized']).setOutputCol('lemmatized')
stopwords_cleaner = StopWordsCleaner().setInputCols(['lemmatized'])\
.setOutputCol('unigrams').setStopWords(stopwords.words('english'))
pos_tagger = PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'unigrams']).setOutputCol('pos')
chunker = Chunker().setInputCols(['document', 'pos']).setOutputCol('ngrams').setRegexParsers(['<JJ>+<NN>', '<NN>+<NN>'])
finisher = Finisher().setInputCols(['unigrams', 'ngrams'])
pipeline = Pipeline() \
     .setStages([documentAssembler,
                 tokenizer,
                 normalizer,
                 lemmatizer,
                 stopwords_cleaner,
                 pos_tagger,
                 chunker,
                 finisher])
processed_df = pipeline.fit(df).transform(df).withColumn("final", f.concat("finished_unigrams", "finished_ngrams"))
processed_df.show()

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
pos_anc download started this may take some time.
Approximate size to download 3.9 MB
[OK!]
+--------------------+--------------------+--------------------+--------------------+
|                text|   finished_unigrams|     finished_ngrams|               final|
+--------------------+--------------------+--------------------+--------------------+
|rt @votetimscott ...|[rt, votetimscott...|[official @go, hi...|[rt, votetimscott...|
|rt @housegop http...|[rt, housegop, ht...|[@housegop httpst...|[rt, housegop, ht...|
|rt @housegop this...|[rt, housegop, jo...|                  []|[rt, housegop, jo...|
|rt @potus when i ...|[rt, potus, take,...|[americans have m...|[rt, potus, take,...|
|rt @potus heres w...|[rt, potus, stand...|[new jobs, new pr...|[rt, potus, stand...|
|rt @potus my admi...|[rt, potus, admin...|[covid19 in their...|[rt, potus, admin...|
|@joebiden and wea...|[joebiden, we

In [10]:
cv = CountVectorizer(inputCol="final", outputCol="features", vocabSize=500, minDF=3.0)
cv_model = cv.fit(processed_df)
lda = LDA(k=5, maxIter=100)
model = lda.fit(cv_model.transform(processed_df))
for indices in model.describeTopics(15).select("termIndices").rdd.flatMap(lambda x: x).collect():
    print([cv_model.vocabulary[i] for i in indices], '\n')

['potus', 'joebiden', 'vp', 'get', 'rt', 'people', 'make', 'go', 'dont', 'covid', 'economy', 'create', 'need', 'say', 'know'] 

['new', 'rt', 'year', 'potus', 'one', 'president', 'job', 'million', 'stand', 'record', 'nearly', 'new jobs', 'unemployment', 'new president', 'jobs  a record'] 

['senwarren', 'american', 'rt', 'want', 'law', 'court', 'supreme', 'ready', 'radical', 'supreme court', 'remain', 'land', 'ove', 'roevwade', 'ready to pitch'] 

['rt', 'randpaul', 'tedcruz', 'vaccine', 'vote', 'care', 'senronjohnson', 'would', 'covid', 'mandate', 'solution', 'right', 'amyklobuchar', 'health', 'treatment'] 

['rt', 'help', 'pandemic', 'something', 'use', 'end', 'vp', 'power', 'lot', 'last', 'individual', 'federal', 'frustrate', 'every', 'individual power'] 

