# 0. Libraries and spark session

In [1]:
pip install spark-nlp==3.3.4

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install numpy pandas nltk

Note: you may need to restart the kernel to use updated packages.


In [3]:
spark.stop()
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[*]")\
    .config("spark.driver.memory","4G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.4")\
    .getOrCreate()
spark

In [4]:
from sparknlp.annotator import LemmatizerModel
import pyspark.sql.functions as f
from pyspark.ml.feature import Tokenizer as pysparkTokenizer, HashingTF, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer, Normalizer, LemmatizerModel, StopWordsCleaner, PerceptronModel, Chunker
from pyspark.ml.clustering import LDA
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# 1. Loading the data

In [6]:
df = spark.read.json("/home/ubuntu/temp-data").drop_duplicates(["text"])
df.repartition(50).write.mode("overwrite").parquet("/home/ubuntu/temp-data2")
df = spark.read.parquet("/home/ubuntu/temp-data2")
# df = spark.read.option("tableName", "tweets").format("dynamodb").load()
df.show()
f"Lenght: {df.count()}"

+--------------+----+-------------------+----+--------------------+---------------+
|extractionTime| geo|                 id|lang|                text|           user|
+--------------+----+-------------------+----+--------------------+---------------+
|    1639928596|null|1472593384531689478|  en|@LindseyGrahamSC ...|    GreatLenny1|
|    1639928987|null|1472595022021337097|  en|@Sen_JoeManchin B...|      AltickRob|
|    1639927784|null|1472589977263513604|  en|@Sen_JoeManchin @...|realnewsrealnfo|
|    1639929352|null|1472596551369469964|  en|@SenSchumer @POTU...|        THE_ECB|
|    1639926510|null|1472584634005344266|  en|@SenBlumenthal Fo...|GrinningSinner9|
|    1639929167|null|1472595778342375428|  en|@Sen_JoeManchin T...|     Hoganknows|
|    1639927804|null|1472590058683306006|  en|@POTUS Lol as of ...|  Dylan19135270|
|    1639926726|null|1472585539224227848|  en|@Sen_JoeManchin  ...|      lynrouse1|
|    1640627214|null|1475523600253952000|  en|@KamalaHarris Let...| HTX_Grou

'Lenght: 16653'

Removing special chacaters and changing to lowercase

In [7]:
df = df.withColumn("text", f.lower(f.regexp_replace(f.col("text"), "[^A-Za-z0-9@ ]", "")))
df.show()

+--------------+----+-------------------+----+--------------------+---------------+
|extractionTime| geo|                 id|lang|                text|           user|
+--------------+----+-------------------+----+--------------------+---------------+
|    1639928596|null|1472593384531689478|  en|@lindseygrahamsc ...|    GreatLenny1|
|    1639928987|null|1472595022021337097|  en|@senjoemanchin bu...|      AltickRob|
|    1639927784|null|1472589977263513604|  en|@senjoemanchin @s...|realnewsrealnfo|
|    1639929352|null|1472596551369469964|  en|@senschumer @potu...|        THE_ECB|
|    1639926510|null|1472584634005344266|  en|@senblumenthal fo...|GrinningSinner9|
|    1639929167|null|1472595778342375428|  en|@senjoemanchin th...|     Hoganknows|
|    1639927804|null|1472590058683306006|  en|@potus lol as of ...|  Dylan19135270|
|    1639926726|null|1472585539224227848|  en|@senjoemanchin  u...|      lynrouse1|
|    1640627214|null|1475523600253952000|  en|@kamalaharris let...| HTX_Grou

# 2. LDA - topics analysis - Pyspark only

In [8]:
tokenizer = pysparkTokenizer(inputCol="text", outputCol="tokens")
stopwords_cleaner = StopWordsRemover(inputCol="tokens", outputCol="no stop words")
nlp_pipeline = Pipeline(
    stages=[tokenizer,
            stopwords_cleaner])
nlp_model = nlp_pipeline.fit(df)
processed_df  = nlp_model.transform(df)
processed_df.show(5)
processed_df.limit(3).toPandas().to_dict("records")

+--------------+----+-------------------+----+--------------------+---------------+--------------------+--------------------+
|extractionTime| geo|                 id|lang|                text|           user|              tokens|       no stop words|
+--------------+----+-------------------+----+--------------------+---------------+--------------------+--------------------+
|    1639928596|null|1472593384531689478|  en|@lindseygrahamsc ...|    GreatLenny1|[@lindseygrahamsc...|[@lindseygrahamsc...|
|    1639928987|null|1472595022021337097|  en|@senjoemanchin bu...|      AltickRob|[@senjoemanchin, ...|[@senjoemanchin, ...|
|    1639927784|null|1472589977263513604|  en|@senjoemanchin @s...|realnewsrealnfo|[@senjoemanchin, ...|[@senjoemanchin, ...|
|    1639929352|null|1472596551369469964|  en|@senschumer @potu...|        THE_ECB|[@senschumer, @po...|[@senschumer, @po...|
|    1639926510|null|1472584634005344266|  en|@senblumenthal fo...|GrinningSinner9|[@senblumenthal, ...|[@senblumentha

[{'extractionTime': 1639928596,
  'geo': None,
  'id': 1472593384531689478,
  'lang': 'en',
  'text': '@lindseygrahamsc so glad to have a sane competent responsible responsive potus and administration in power now aft httpstcomwtejuxqsi',
  'user': 'GreatLenny1',
  'tokens': ['@lindseygrahamsc',
   'so',
   'glad',
   'to',
   'have',
   'a',
   'sane',
   'competent',
   'responsible',
   'responsive',
   'potus',
   'and',
   'administration',
   'in',
   'power',
   'now',
   'aft',
   'httpstcomwtejuxqsi'],
  'no stop words': ['@lindseygrahamsc',
   'glad',
   'sane',
   'competent',
   'responsible',
   'responsive',
   'potus',
   'administration',
   'power',
   'aft',
   'httpstcomwtejuxqsi']},
 {'extractionTime': 1639928987,
  'geo': None,
  'id': 1472595022021337097,
  'lang': 'en',
  'text': '@senjoemanchin but you cant help the people of wv by voting for the build back better deal why because it hurts httpstcoorbi5zgwdh',
  'user': 'AltickRob',
  'tokens': ['@senjoemanchin'

In [9]:
cv = CountVectorizer(inputCol="no stop words", outputCol="features", vocabSize=500, minDF=3.0)
cv_model = cv.fit(processed_df)
lda = LDA(k=5, maxIter=100)
model = lda.fit(cv_model.transform(processed_df))
for indices in model.describeTopics(15).select("termIndices").rdd.flatMap(lambda x: x).collect():
    print([cv_model.vocabulary[i] for i in indices], '\n')

['@tedcruz', 'rt', '@barackobama', '@randpaul', 'ted', 'people', '@chuckgrassley', '@chrismurphyct', 'christmas', 'dont', 'thats', 'one', 'cruz', 'states', 'covid'] 

['@potus', '', 'money', 'dont', 'go', 'inflation', 'need', '@joebiden', 'time', 'know', 'americans', 'economy', '@vp', 'biden', 'people'] 

['back', '@lindseygrahamsc', '@sentedcruz', '@amyklobuchar', '@foxnews', 'voting', 'better', 'rights', 'build', 'need', 'every', '@sensherrodbrown', '@senronjohnson', 'bill', 'get'] 

['@senjoemanchin', '', 'youre', 'people', 'manchin', 'joe', 'bbb', 'vote', 'amp', '@senwarren', 'party', 'thank', 'american', 'republican', 'west'] 

['@joebiden', '@senschumer', '@kamalaharris', '@vp', 'president', '@thedemocrats', 'biden', 'hes', 'still', 'senate', '@johncornyn', 'going', 'anything', 'get', '@speakerpelosi'] 



# 3. LDA - topics analysis - SparkNLP

In [10]:
documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol('document')
tokenizer = Tokenizer().setInputCols(['document']).setOutputCol('tokenized')
normalizer = Normalizer().setInputCols(['tokenized']).setOutputCol('normalized')
lemmatizer = LemmatizerModel.pretrained().setInputCols(['normalized']).setOutputCol('lemmatized')
stopwords_cleaner = StopWordsCleaner().setInputCols(['lemmatized'])\
.setOutputCol('unigrams').setStopWords(stopwords.words('english'))
pos_tagger = PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'unigrams']).setOutputCol('pos')
chunker = Chunker().setInputCols(['document', 'pos']).setOutputCol('ngrams').setRegexParsers(['<JJ>+<NN>', '<NN>+<NN>'])
finisher = Finisher().setInputCols(['unigrams', 'ngrams'])
pipeline = Pipeline() \
     .setStages([documentAssembler,
                 tokenizer,
                 normalizer,
                 lemmatizer,
                 stopwords_cleaner,
                 pos_tagger,
                 chunker,
                 finisher])
processed_df = pipeline.fit(df).transform(df).withColumn("final", f.concat("finished_unigrams", "finished_ngrams"))
processed_df.show()

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
pos_anc download started this may take some time.
Approximate size to download 3.9 MB
[OK!]
+--------------+----+-------------------+----+--------------------+---------------+--------------------+--------------------+--------------------+
|extractionTime| geo|                 id|lang|                text|           user|   finished_unigrams|     finished_ngrams|               final|
+--------------+----+-------------------+----+--------------------+---------------+--------------------+--------------------+--------------------+
|    1639928596|null|1472593384531689478|  en|@lindseygrahamsc ...|    GreatLenny1|[lindseygrahamsc,...|[glad to have a s...|[lindseygrahamsc,...|
|    1639928987|null|1472595022021337097|  en|@senjoemanchin bu...|      AltickRob|[senjoemanchin, c...|[wv by voting, @s...|[senjoemanchin, c...|
|    1639927784|null|1472589977263513604|  en|@senjoemanchin @s...|realnew

In [11]:
cv = CountVectorizer(inputCol="final", outputCol="features", vocabSize=500, minDF=3.0)
cv_model = cv.fit(processed_df)
lda = LDA(k=5, maxIter=100)
model = lda.fit(cv_model.transform(processed_df))
for indices in model.describeTopics(15).select("termIndices").rdd.flatMap(lambda x: x).collect():
    print([cv_model.vocabulary[i] for i in indices], '\n')

['vote', 'right', 'people', 'potus', 'amyklobuchar', 'get', 'make', 'tax', 'senwarren', 'dont', 'want', 'one', 'rt', 'student', 'pay'] 

['potus', 'joebiden', 'vp', 'lie', 'president', 'go', 'biden', 'lindseygrahamsc', 'senwarren', 'stop', 'trump', 'inflation', 'liar', 'kamalaharris', 'please'] 

['tedcruz', 'senschumer', 'sentedcruz', 'barackobama', 'party', 'foxnews', 'ted', 'joemanchinwv', 'cruz', 'mask', 'gop', 'republican', 'democratic', 'marcorubio', 'senate'] 

['senjoemanchin', 'manchin', 'joe', 'people', 'youre', 'vote', 'care', 'thank', 'bbb', 'republican', 'money', 'state', 'american', 'dont', 'west'] 

['get', 'back', 'senjoemanchin', 'rt', 'well', 'need', 'kamalaharris', 'bill', 'build', 'thedemocrats', 'shit', 'go', 'joebiden', 'fox', 'randpaul'] 

