In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Window
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import functions as F
from pyspark.sql import types as T
from nltk.corpus import stopwords
# English stop-word list from NLTK; handed to StopWordsCleaner.setStopWords below.
eng_stopwords = stopwords.words(fileids='english')

In [2]:
# `sparknlp.start()` provides the session object bound to `spark`; the rest of
# the notebook reads the tweet CSV through it (`spark.read`).
import sparknlp
spark = sparknlp.start()

In [3]:
# conf = SparkConf().set("spark.jars", "./spark-nlp_2.11-2.4.5.jar")

In [4]:
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [5]:
# sc = SparkContext(conf=conf)

In [6]:
# spark = SparkSession.builder\
#     .appName('NLP model')\
#     .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")\
#     .getOrCreate()

In [7]:
# Display the session object as a quick sanity check that it started.
spark

In [8]:
# Record the Spark NLP version in use; the locally stored models loaded
# further down were obtained for this version line.
sparknlp.version()

'2.4.5'

In [9]:
# Reader settings for the raw tweet dump. Tweets may contain embedded
# newlines and doubled quotes, hence multiLine and the '"' escape character.
tweets_csv_options = {
    "encoding": "UTF-8",
    "delimiter": ",",
    "parserLib": "univocity",
    "multiLine": "true",
    "escape": "\"",
}

# Load every CSV file under "tweets", using the first row as the header.
df = spark.read.options(**tweets_csv_options).csv("tweets", header=True)

In [10]:
# df.limit(10).toPandas()

Unnamed: 0,status_id,user_id,created_at,text,lang
0,1238253442063310848,532343475,2020-03-13T00:00:00Z,The UFC is about to be the most popular sport ...,en
1,1238253441778098177,165879150,2020-03-13T00:00:00Z,The great toilet paper depression of 2020 #Toi...,en
2,1238253440486313988,569242704,2020-03-13T00:00:00Z,The 'Spotlight Show' with @janeyleegrace on @u...,en
3,1238253439051870208,16368021,2020-03-13T00:00:00Z,Because we all the time in the world right? @s...,en
4,1238253440821649408,1057148786189824000,2020-03-13T00:00:00Z,French pastry chef shows off Easter eggs model...,en
5,1238253442034020354,1093544067219292161,2020-03-13T00:00:00Z,ICYMI - Hour 2 of #TheGamePlan with @DaveWNSP ...,en
6,1238253441564266496,39743812,2020-03-13T00:00:00Z,"With rising #Coronavirus cases in India, which...",en
7,1238253441517928448,17852186,2020-03-13T00:00:00Z,#ICYMI: #Ontario #MPPs may temporarily suspend...,en
8,1238253440603541504,1454687180,2020-03-13T00:00:00Z,Despite having only 3 confirmed #coronavirus c...,en
9,1238253440461135873,19047089,2020-03-13T00:00:00Z,Autonomous #Robots Are Helping Kill #Coronavir...,en


In [11]:
# Entry stage of the NLP pipeline: wraps the raw "text" column into a
# document annotation. The output column is named explicitly because every
# downstream stage refers to it as "document"; this matches the library
# default, so behaviour is unchanged.
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# sentence_detector = SentenceDetector() \
#     .setInputCols(["document"]) \
#     .setOutputCol("sentence")

# Splits each document annotation into tokens, written to the "token" column.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Token clean-up: lowercases tokens and removes text matching the patterns
# below -- any non-letter character, and everything from "http" onwards
# (intended to drop URLs).
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normal")
    .setCleanupPatterns(['[^A-Za-z]', 'http.*'])
    .setLowercase(True)
)

# Drops the NLTK English stop words (case-insensitively) from the
# normalized tokens.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["normal"])
    .setOutputCol("clean")
    .setCaseSensitive(False)
    .setStopWords(eng_stopwords)
)

# lemmatizer = LemmatizerModel.pretrained() \
#     .setInputCols(["clean"]) \
#     .setOutputCol("lemma")

# Lemmatizer model loaded from a local directory (path is relative to the
# notebook's working directory); consumes the stop-word-filtered tokens.
lemmatizer = (
    LemmatizerModel.load('lemma_antbnc_en_2.0.2_2.4_1556480454569/')
    .setInputCols(["clean"])
    .setOutputCol("lemma")
)

# stemmer = Stemmer() \
#     .setInputCols(["clean_lemma"]) \
#     .setOutputCol("stem")

# word_embeddings=ElmoEmbeddings.pretrained()\
#     .setInputCols(["document", "lemma"])\
#     .setOutputCol("embedding")

# ELMo embeddings model loaded from a local directory; takes the document
# together with its lemmas and writes per-token vectors to "embedding".
word_embeddings = (
    ElmoEmbeddings.load('elmo_en_2.4.0_2.4_1580488815299/')
    .setInputCols(["document", "lemma"])
    .setOutputCol("embedding")
)


# Pools the token embeddings of each document with the "AVERAGE" strategy
# and writes the result to "sentence_embedding".
sentence_embeddings = (
    SentenceEmbeddings()
    .setInputCols(["document", "embedding"])
    .setOutputCol("sentence_embedding")
    .setPoolingStrategy("AVERAGE")
)


# finisher = Finisher() \
#     .setInputCols(["sentence_embedding"]) \
#     .setOutputCols(['finished'])\
#     .setIncludeMetadata(True)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


In [541]:
# Stages in execution order; each one reads the output column of an earlier
# stage. The sentence detector and finisher (commented out where the stages
# are defined) are intentionally not part of the pipeline.
nlp_stages = [
    document_assembler,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    lemmatizer,
    word_embeddings,
    sentence_embeddings,
]

pipeline = Pipeline(stages=nlp_stages)

In [None]:
# Fit the pipeline on the tweets, then annotate the same DataFrame with the
# fitted model.
pipeline_model = pipeline.fit(df)
result = pipeline_model.transform(df)

In [542]:
# Flatten the pipeline output into one row per sentence-embedding annotation
# and keep only the columns needed downstream.
#
# NOTE: this rebinds `result`, so the cell is not idempotent -- re-running it
# requires re-running the fit/transform cell first.
result = (
    result
    .select(
        'status_id',
        'created_at',
        'text',
        # "sentence_embedding" is an array of annotations: one row per entry.
        F.explode("sentence_embedding").alias("sentence_embedding"),
    )
    .select(
        'status_id',
        'created_at',
        # `created_at` looks like "2020-03-13T00:00:00Z". Parse only the
        # leading "yyyy-MM-dd" part explicitly instead of relying on the
        # parser tolerating the trailing time text; the result is a
        # timestamp at midnight of the tweet's day, used as the grouping key.
        F.to_timestamp(F.substring('created_at', 1, 10), 'yyyy-MM-dd').alias('date'),
        'text',
        # Extract the numeric vector from the annotation struct.
        F.col("sentence_embedding.embeddings").alias('embedding'),
    )
)

In [None]:
# Persist the per-tweet embeddings as Parquet, replacing any previous output.
# The CSV-specific options (encoding, delimiter, parserLib, multiLine, escape)
# that used to be set here do not apply to the Parquet format and were
# dropped.
result.write\
    .mode("overwrite")\
    .parquet("embeddings.parquet")

In [None]:
# Number of components averaged per embedding vector.
# NOTE(review): presumably the width of the ELMo output -- confirm it matches
# the model's configured output layer.
embedding_dim = 512

# One AVG aggregate per vector component; reassembled into a single array.
mean_components = [
    F.avg(F.col("embedding")[idx]) for idx in range(embedding_dim)
]

# Daily features: component-wise mean embedding plus the tweet count.
df_by_date = result.groupBy('date').agg(
    F.array(*mean_components).alias("embedding"),
    F.count('*').alias('number_of_tweets'),
)

In [543]:
# Persist the per-day aggregated features as Parquet, replacing any previous
# output. The CSV-specific options (encoding, delimiter, parserLib, multiLine,
# escape) that used to be set here do not apply to the Parquet format and
# were dropped.
df_by_date.write\
    .mode("overwrite")\
    .parquet("nlp_features.parquet")