# 0. Libraries and spark session

In [1]:
pip install spark-nlp==3.3.4

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install numpy pandas nltk

Note: you may need to restart the kernel to use updated packages.


In [3]:
spark

In [4]:
from sparknlp.annotator import LemmatizerModel
import pyspark.sql.functions as f
from pyspark.sql import Window
import pyspark.sql.types as t
from pyspark.ml.feature import Tokenizer as pysparkTokenizer, HashingTF, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer, Normalizer, LemmatizerModel, StopWordsCleaner, PerceptronModel, Chunker
from pyspark.ml.clustering import LDA
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# 1. Loading the data

In [6]:
df = spark.read.option("tableName", "Tweets").format("dynamodb").load().select(f.col("text"))\
.union(
    spark.read.option("tableName", "RedditPosts").format("dynamodb").load().select(f.col("submission_id").alias("text"))
)
df.show()
f"Lenght: {df.count()}"

+--------------------+
|                text|
+--------------------+
|RT @votetimscott:...|
|@RandPaul https:/...|
|@RandPaul Also, m...|
|RT @VP: There is ...|
|RT @POTUS: Folks,...|
|RT @JoeBiden: The...|
|RT @POTUS: Here’s...|
|RT @SenSchumer: W...|
|@JoeBiden Oh pleA...|
|RT @amyklobuchar:...|
|      @JoeBiden Nope|
|@SenWarren 70% ? ...|
|              @POTUS|
|@POTUS Now do gas...|
|RT @RepDavidKusto...|
|@POTUS We'd have ...|
|@RandPaul The law...|
|RT @POTUS: Here’s...|
|@POTUS I think yo...|
|@POTUS I think yo...|
+--------------------+
only showing top 20 rows



'Lenght: 3727'

Removing special chacaters and changing to lowercase

In [7]:
df = df.withColumn("text", f.lower(f.regexp_replace(f.col("text"), "[^A-Za-z0-9@ ]", "")))
df.show()

+--------------------+
|                text|
+--------------------+
|rt @votetimscott ...|
|@randpaul httpstc...|
|@randpaul also ma...|
|rt @vp there is a...|
|rt @potus folks i...|
|rt @joebiden the ...|
|rt @potus heres w...|
|rt @senschumer wi...|
|@joebiden oh plea...|
|rt @amyklobuchar ...|
|      @joebiden nope|
|@senwarren 70    ...|
|              @potus|
|@potus now do gas...|
|rt @repdavidkusto...|
|@potus wed have m...|
|@randpaul the law...|
|rt @potus heres w...|
|@potus i think yo...|
|@potus i think yo...|
+--------------------+
only showing top 20 rows



# 2. LDA - topics analysis - Pyspark only

In [8]:
tokenizer = pysparkTokenizer(inputCol="text", outputCol="tokens")
stopwords_cleaner = StopWordsRemover(inputCol="tokens", outputCol="no stop words")
nlp_pipeline = Pipeline(
    stages=[tokenizer,
            stopwords_cleaner])
nlp_model = nlp_pipeline.fit(df)
processed_df  = nlp_model.transform(df)
processed_df.show(5)
processed_df.limit(3).toPandas().to_dict("records")

+--------------------+--------------------+--------------------+
|                text|              tokens|       no stop words|
+--------------------+--------------------+--------------------+
|rt @votetimscott ...|[rt, @votetimscot...|[rt, @votetimscot...|
|@randpaul httpstc...|[@randpaul, https...|[@randpaul, https...|
|@randpaul also ma...|[@randpaul, also,...|[@randpaul, also,...|
|rt @vp there is a...|[rt, @vp, there, ...|[rt, @vp, lot, pa...|
|rt @potus folks i...|[rt, @potus, folk...|[rt, @potus, folk...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



[{'text': 'rt @votetimscott one of the highlights of my year was giving the official @gop response to joe bidens first address to congress i said i',
  'tokens': ['rt',
   '@votetimscott',
   'one',
   'of',
   'the',
   'highlights',
   'of',
   'my',
   'year',
   'was',
   'giving',
   'the',
   'official',
   '@gop',
   'response',
   'to',
   'joe',
   'bidens',
   'first',
   'address',
   'to',
   'congress',
   'i',
   'said',
   'i'],
  'no stop words': ['rt',
   '@votetimscott',
   'one',
   'highlights',
   'year',
   'giving',
   'official',
   '@gop',
   'response',
   'joe',
   'bidens',
   'first',
   'address',
   'congress',
   'said']},
 {'text': '@randpaul httpstco3leg5o0bp0',
  'tokens': ['@randpaul', 'httpstco3leg5o0bp0'],
  'no stop words': ['@randpaul', 'httpstco3leg5o0bp0']},
 {'text': '@randpaul also make voting as hard as possible  reduce poll station number early voting hours and staff at regul httpstcoitmsnlaytr',
  'tokens': ['@randpaul',
   'also',
   'mak

In [9]:
cv = CountVectorizer(inputCol="no stop words", outputCol="features", vocabSize=500, minDF=3.0)
cv_model = cv.fit(processed_df)
lda = LDA(k=5, maxIter=100)
model = lda.fit(cv_model.transform(processed_df))
for indices in model.describeTopics(15).select("termIndices").rdd.flatMap(lambda x: x).collect():
    print([cv_model.vocabulary[i] for i in indices], '\n')

['', 'new', '6', '@potus', 'rt', 'unemployment', 'president', 'jobs', 'one', 'year', 'million', 'stand', 'heres', 'record', 'nearly'] 

['@potus', 'rt', 'back', 'federal', 'state', 'plan', 'administration', 'every', 'covid19', 'last', 'week', 'people', 'governor', 'fighting', 'covid'] 

['@randpaul', '@vp', '@joebiden', 'rt', 'vaccine', 'mandates', 'vaccines', 'dont', 'get', 'obvious', 'ulterior', 'motive', 'seem', 'opposing', 'opposition'] 

['rt', '', '@potus', 'care', 'booster', 'high', 'increase', 'protection', 'covid19', 'shots', 'illness', 'severe', 'degree', 'antibody', 'levels'] 

['rt', '@joebiden', 'way', 'best', 'year', 'remember', 'protect', 'continue', 'new', 'health', 'head', 'holiday', 'celebrations', 'kee', '@senwarren'] 



# 3. LDA - topics analysis - SparkNLP

In [10]:
documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol('document')
tokenizer = Tokenizer().setInputCols(['document']).setOutputCol('tokenized')
normalizer = Normalizer().setInputCols(['tokenized']).setOutputCol('normalized')
lemmatizer = LemmatizerModel.pretrained().setInputCols(['normalized']).setOutputCol('lemmatized')
stopwords_cleaner = StopWordsCleaner().setInputCols(['lemmatized'])\
.setOutputCol('unigrams').setStopWords(stopwords.words('english'))
pos_tagger = PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'unigrams']).setOutputCol('pos')
chunker = Chunker().setInputCols(['document', 'pos']).setOutputCol('ngrams').setRegexParsers(['<JJ>+<NN>', '<NN>+<NN>'])
finisher = Finisher().setInputCols(['unigrams', 'ngrams'])
pipeline = Pipeline() \
     .setStages([documentAssembler,
                 tokenizer,
                 normalizer,
                 lemmatizer,
                 stopwords_cleaner,
                 pos_tagger,
                 chunker,
                 finisher])
processed_df = pipeline.fit(df).transform(df).withColumn("final", f.concat("finished_unigrams", "finished_ngrams"))
processed_df.show()

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
pos_anc download started this may take some time.
Approximate size to download 3.9 MB
[OK!]
+--------------------+--------------------+--------------------+--------------------+
|                text|   finished_unigrams|     finished_ngrams|               final|
+--------------------+--------------------+--------------------+--------------------+
|rt @votetimscott ...|[rt, votetimscott...|[official @go, hi...|[rt, votetimscott...|
|@randpaul httpstc...|[randpaul, httpst...|[@randpaul httpst...|[randpaul, httpst...|
|@randpaul also ma...|[randpaul, also, ...|[hard as possible...|[randpaul, also, ...|
|rt @vp there is a...|[rt, vp, lot, pan...|[pandemic that is...|[rt, vp, lot, pan...|
|rt @potus folks i...|[rt, potus, folk,...|[crumbling roads,...|[rt, potus, folk,...|
|rt @joebiden the ...|[rt, joebiden, va...|[valuable tool, l...|[rt, joebiden, va...|
|rt @potus heres w...|[rt, potus, s

In [11]:
cv = CountVectorizer(inputCol="final", outputCol="features", vocabSize=500, minDF=3.0)
cv_model = cv.fit(processed_df)
lda = LDA(k=5, maxIter=100)
model = lda.fit(cv_model.transform(processed_df))
for indices in model.describeTopics(15).select("termIndices").rdd.flatMap(lambda x: x).collect():
    print([cv_model.vocabulary[i] for i in indices], '\n')

['joebiden', 'good', 'rt', 'way', 'year', 'continue', 'protect', 'remember', 'new', 'best way', 'head', 'health', 'holiday', 'new year', 'celebration'] 

['new', 'rt', 'potus', 'president', 'job', 'one', 'year', 'million', 'stand', 'record', 'nearly', 'unemployment', 'new jobs', 'new president', '@potus heres where we stand'] 

['potus', 'vp', 'get', 'rt', 'tedcruz', 'people', 'joebiden', 'need', 'think', 'go', 'help', 'biden', 'everyone', 'amp', 'dont'] 

['rt', 'randpaul', 'senwarren', 'vote', 'vaccine', 'want', 'would', 'american', 'court', 'care', 'radical', 'law', 'ready', 'joebiden', 'supreme'] 

['potus', 'covid', 'rt', 'back', 'federal', 'state', 'plan', 'administration', 'every', 'last', 'week', 'fight', 'governor', 'federal plan', 'see'] 

