# 0. Libraries and spark session

In [1]:
pip install spark-nlp==3.3.4

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install numpy pandas nltk

Note: you may need to restart the kernel to use updated packages.


In [3]:
spark.stop()
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[*]")\
    .config("spark.driver.memory","4G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.4")\
    .getOrCreate()
spark

In [4]:
from sparknlp.annotator import LemmatizerModel
import pyspark.sql.functions as f
from pyspark.ml.feature import Tokenizer as pysparkTokenizer, HashingTF, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer, Normalizer, LemmatizerModel, StopWordsCleaner, PerceptronModel, Chunker
from pyspark.ml.clustering import LDA
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# 1. Loading the data

In [5]:
df = spark.read.json("/home/ubuntu/temp-data")
df.repartition(50).write.mode("overwrite").parquet("/home/ubuntu/temp-data2")
df = spark.read.parquet("/home/ubuntu/temp-data2")
# df = spark.read.option("tableName", "tweets").format("dynamodb").load()
df.show()
f"Lenght: {df.count()}"

+--------------+----+-------------------+----+--------------------+---------------+
|extractionTime| geo|                 id|lang|                text|           user|
+--------------+----+-------------------+----+--------------------+---------------+
|    1639926916|null|1472586335487672330|  en|@Sen_JoeManchin ?...|    MikeyOZPK15|
|    1639929571|null|1472597472530681860|  en|@Sen_JoeManchin ?...| mindfulliving1|
|    1639927432|null|1472588501753503751|  en|RT @TheDemocrats:...|Dshe51arerDiana|
|    1639925273|null|1472579445177389059|  en|RT @SenSchumer: I...|MaryPer46923539|
|    1639929482|null|1472597099984429056|  en|RT @TheDemocrats:...|  Angelstrenght|
|    1639924524|null|1472576304566263808|  en|@SenWarren or Poc...| marinecorp7375|
|    1639926337|null|1472583908558573571|  en|RT @TheDemocrats:...|      Diditrek1|
|    1639927868|null|1472590328863641606|  en|RT @ChuckGrassley...| MysteryMind117|
|    1639927508|null|1472588821019738114|  en|RT @ChuckGrassley...|  DeezInD

'Lenght: 12051'

Removing special chacaters and changing to lowercase

In [6]:
df = df.withColumn("text", f.lower(f.regexp_replace(f.col("text"), "[^A-Za-z0-9@ ]", "")))
df.show()

+--------------+----+-------------------+----+--------------------+---------------+
|extractionTime| geo|                 id|lang|                text|           user|
+--------------+----+-------------------+----+--------------------+---------------+
|    1639926916|null|1472586335487672330|  en|@senjoemanchin on...|    MikeyOZPK15|
|    1639929571|null|1472597472530681860|  en|@senjoemanchin  w...| mindfulliving1|
|    1639927432|null|1472588501753503751|  en|rt @thedemocrats ...|Dshe51arerDiana|
|    1639925273|null|1472579445177389059|  en|rt @senschumer im...|MaryPer46923539|
|    1639929482|null|1472597099984429056|  en|rt @thedemocrats ...|  Angelstrenght|
|    1639924524|null|1472576304566263808|  en|@senwarren or poc...| marinecorp7375|
|    1639926337|null|1472583908558573571|  en|rt @thedemocrats ...|      Diditrek1|
|    1639927868|null|1472590328863641606|  en|rt @chuckgrassley...| MysteryMind117|
|    1639927508|null|1472588821019738114|  en|rt @chuckgrassley...|  DeezInD

# 2. LDA - topics analysis - Pyspark only

In [7]:
tokenizer = pysparkTokenizer(inputCol="text", outputCol="tokens")
stopwords_cleaner = StopWordsRemover(inputCol="tokens", outputCol="no stop words")
nlp_pipeline = Pipeline(
    stages=[tokenizer,
            stopwords_cleaner])
nlp_model = nlp_pipeline.fit(df)
processed_df  = nlp_model.transform(df)
processed_df.show(5)
processed_df.limit(3).toPandas().to_dict("records")

+--------------+----+-------------------+----+--------------------+---------------+--------------------+--------------------+
|extractionTime| geo|                 id|lang|                text|           user|              tokens|       no stop words|
+--------------+----+-------------------+----+--------------------+---------------+--------------------+--------------------+
|    1639926916|null|1472586335487672330|  en|@senjoemanchin on...|    MikeyOZPK15|[@senjoemanchin, ...|[@senjoemanchin, ...|
|    1639929571|null|1472597472530681860|  en|@senjoemanchin  w...| mindfulliving1|[@senjoemanchin, ...|[@senjoemanchin, ...|
|    1639927432|null|1472588501753503751|  en|rt @thedemocrats ...|Dshe51arerDiana|[rt, @thedemocrat...|[rt, @thedemocrat...|
|    1639925273|null|1472579445177389059|  en|rt @senschumer im...|MaryPer46923539|[rt, @senschumer,...|[rt, @senschumer,...|
|    1639929482|null|1472597099984429056|  en|rt @thedemocrats ...|  Angelstrenght|[rt, @thedemocrat...|[rt, @thedemoc

[{'extractionTime': 1639926916,
  'geo': None,
  'id': 1472586335487672330,
  'lang': 'en',
  'text': '@senjoemanchin only democrat with a set of balls',
  'user': 'MikeyOZPK15',
  'tokens': ['@senjoemanchin',
   'only',
   'democrat',
   'with',
   'a',
   'set',
   'of',
   'balls'],
  'no stop words': ['@senjoemanchin', 'democrat', 'set', 'balls']},
 {'extractionTime': 1639929571,
  'geo': None,
  'id': 1472597472530681860,
  'lang': 'en',
  'text': '@senjoemanchin  we thank you  we need you  we will never forget  manofthehour americanhero',
  'user': 'mindfulliving1',
  'tokens': ['@senjoemanchin',
   '',
   'we',
   'thank',
   'you',
   '',
   'we',
   'need',
   'you',
   '',
   'we',
   'will',
   'never',
   'forget',
   '',
   'manofthehour',
   'americanhero'],
  'no stop words': ['@senjoemanchin',
   '',
   'thank',
   '',
   'need',
   '',
   'never',
   'forget',
   '',
   'manofthehour',
   'americanhero']},
 {'extractionTime': 1639927432,
  'geo': None,
  'id': 14725885

In [8]:
cv = CountVectorizer(inputCol="no stop words", outputCol="features", vocabSize=500, minDF=3.0)
cv_model = cv.fit(processed_df)
lda = LDA(k=5, maxIter=100)
model = lda.fit(cv_model.transform(processed_df))
for indices in model.describeTopics(15).select("termIndices").rdd.flatMap(lambda x: x).collect():
    print([cv_model.vocabulary[i] for i in indices], '\n')

['@potus', '@senschumer', 'rt', 'president', 'act', 'must', '@thedemocrats', 'voting', 'rights', 'vote', 'pass', '@vp', '@amyklobuchar', 'freedom', 'john'] 

['', '@tedcruz', 'rt', 'mask', 'party', 'democrat', 'wear', 'must', 'official', 'forever', 'positioneverybody', '@kamalaharris', '@chrismurphyct', 'time', 'last'] 

['rt', 'back', 'better', 'build', '@kamalaharris', 'senate', 'republicans', 'want', 'going', '@chrismurphyct', 'congress', 'cruz', 'lets', 'lower', 'costs'] 

['@senjoemanchin', '', 'manchin', '@lindseygrahamsc', 'joe', 'rt', 'bbb', 'people', 'vote', 'senator', 'youre', 'dont', 'republican', 'bill', 'amp'] 

['@joebiden', 'get', '@senjoemanchin', 'people', 'rt', '@sentedcruz', '@senwarren', 'covid', 'way', 'biden', 'american', 'fox', 'youre', 'think', 'yet'] 



# 3. LDA - topics analysis - SparkNLP

In [9]:
documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol('document')
tokenizer = Tokenizer().setInputCols(['document']).setOutputCol('tokenized')
normalizer = Normalizer().setInputCols(['tokenized']).setOutputCol('normalized')
lemmatizer = LemmatizerModel.pretrained().setInputCols(['normalized']).setOutputCol('lemmatized')
stopwords_cleaner = StopWordsCleaner().setInputCols(['lemmatized'])\
.setOutputCol('unigrams').setStopWords(stopwords.words('english'))
pos_tagger = PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'unigrams']).setOutputCol('pos')
chunker = Chunker().setInputCols(['document', 'pos']).setOutputCol('ngrams').setRegexParsers(['<JJ>+<NN>', '<NN>+<NN>'])
finisher = Finisher().setInputCols(['unigrams', 'ngrams'])
pipeline = Pipeline() \
     .setStages([documentAssembler,
                 tokenizer,
                 normalizer,
                 lemmatizer,
                 stopwords_cleaner,
                 pos_tagger,
                 chunker,
                 finisher])
processed_df = pipeline.fit(df).transform(df).withColumn("final", f.concat("finished_unigrams", "finished_ngrams"))
processed_df.show()

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
pos_anc download started this may take some time.
Approximate size to download 3.9 MB
[OK!]
+--------------+----+-------------------+----+--------------------+---------------+--------------------+--------------------+--------------------+
|extractionTime| geo|                 id|lang|                text|           user|   finished_unigrams|     finished_ngrams|               final|
+--------------+----+-------------------+----+--------------------+---------------+--------------------+--------------------+--------------------+
|    1639926916|null|1472586335487672330|  en|@senjoemanchin on...|    MikeyOZPK15|[senjoemanchin, d...|[@senjoemanchin o...|[senjoemanchin, d...|
|    1639929571|null|1472597472530681860|  en|@senjoemanchin  w...| mindfulliving1|[senjoemanchin, t...|[manofthehour ame...|[senjoemanchin, t...|
|    1639927432|null|1472588501753503751|  en|rt @thedemocrats ...|Dshe51a

In [10]:
cv = CountVectorizer(inputCol="final", outputCol="features", vocabSize=500, minDF=3.0)
cv_model = cv.fit(processed_df)
lda = LDA(k=5, maxIter=100)
model = lda.fit(cv_model.transform(processed_df))
for indices in model.describeTopics(15).select("termIndices").rdd.flatMap(lambda x: x).collect():
    print([cv_model.vocabulary[i] for i in indices], '\n')

['tedcruz', 'rt', 'mask', 'democrat', 'party', 'wear', 'official', 'must', 'forever', 'positioneverybody', '@tedcruz  this is now the official democrat party positioneverybody', 'kamalaharris', 'well', 'one', 'cost'] 

['rt', 'potus', 'senschumer', 'covid', 'time', 'amyklobuchar', 'th', 'federal', 'month', 'word', 'good', 'vaccine', 'last', 'school', 'chrismurphyct'] 

['lindseygrahamsc', 'let', 'senate', 'tell', 'go', 'chrismurphyct', 'rt', 'cruz', 'hold', 'agree', 'stay', 'clear', 'schumer', 'lift', 'session'] 

['senjoemanchin', 'potus', 'people', 'joebiden', 'youre', 'rt', 'dont', 'one', 'vp', 'get', 'thank', 'manchin', 'american', 'like', 'care'] 

['senjoemanchin', 'vote', 'right', 'bbb', 'back', 'act', 'well', 'president', 'must', 'rt', 'get', 'joe', 'thedemocrats', 'manchin', 'say'] 

