In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Window
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import functions as F
from pyspark.sql import types as T
from nltk.corpus import stopwords
eng_stopwords = stopwords.words('english')

In [2]:
import sparknlp
spark = sparknlp.start()

In [3]:
# conf = SparkConf().set("spark.jars", "./spark-nlp_2.11-2.4.5.jar")

In [4]:
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [7]:
spark

In [8]:
sparknlp.version()

'2.4.5'

In [9]:
df = spark.read\
    .option("encoding", "UTF-8")\
    .option("delimiter", ",")\
    .option("parserLib", "univocity")\
    .option("multiLine", "true")\
    .option("escape", "\"")\
    .csv("tweets", header=True)

In [10]:
df.limit(10).toPandas()

Unnamed: 0,status_id,user_id,created_at,text,lang
0,1238253442063310848,532343475,2020-03-13T00:00:00Z,The UFC is about to be the most popular sport ...,en
1,1238253441778098177,165879150,2020-03-13T00:00:00Z,The great toilet paper depression of 2020 #Toi...,en
2,1238253440486313988,569242704,2020-03-13T00:00:00Z,The 'Spotlight Show' with @janeyleegrace on @u...,en
3,1238253439051870208,16368021,2020-03-13T00:00:00Z,Because we all the time in the world right? @s...,en
4,1238253440821649408,1057148786189824000,2020-03-13T00:00:00Z,French pastry chef shows off Easter eggs model...,en
5,1238253442034020354,1093544067219292161,2020-03-13T00:00:00Z,ICYMI - Hour 2 of #TheGamePlan with @DaveWNSP ...,en
6,1238253441564266496,39743812,2020-03-13T00:00:00Z,"With rising #Coronavirus cases in India, which...",en
7,1238253441517928448,17852186,2020-03-13T00:00:00Z,#ICYMI: #Ontario #MPPs may temporarily suspend...,en
8,1238253440603541504,1454687180,2020-03-13T00:00:00Z,Despite having only 3 confirmed #coronavirus c...,en
9,1238253440461135873,19047089,2020-03-13T00:00:00Z,Autonomous #Robots Are Helping Kill #Coronavir...,en


In [37]:
document_assembler = DocumentAssembler() \
    .setInputCol("text")

# sentence_detector = SentenceDetector() \
#     .setInputCols(["document"]) \
#     .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normal")\
    .setCleanupPatterns(['[^A-Za-z]', 'http.*'])\
    .setLowercase(True)

stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(["normal"]) \
    .setOutputCol("clean") \
    .setCaseSensitive(False) \
    .setStopWords(eng_stopwords)

# lemmatizer = LemmatizerModel.pretrained() \
#     .setInputCols(["clean"]) \
#     .setOutputCol("lemma")

lemmatizer=LemmatizerModel.load('lemma_antbnc_en_2.0.2_2.4_1556480454569/')\
    .setInputCols(["clean"]) \
    .setOutputCol("lemma")

stemmer = Stemmer() \
    .setInputCols(["lemma"]) \
    .setOutputCol("stem")

# word_embeddings=ElmoEmbeddings.pretrained()\
#     .setInputCols(["document", "lemma"])\
#     .setOutputCol("embedding")

# word_embeddings=ElmoEmbeddings.load('elmo_en_2.4.0_2.4_1580488815299/')\
#     .setInputCols(["document", "lemma"])\
#     .setOutputCol("embedding")


# sentence_embeddings=SentenceEmbeddings()\
#     .setInputCols(["document", "embedding"])\
#     .setOutputCol("sentence_embedding")\
#     .setPoolingStrategy("AVERAGE")


ngrams_cum = NGramGenerator() \
    .setInputCols(["stem"]) \
    .setOutputCol("ngram") \
    .setN(2) \
    .setEnableCumulative(True)\
    .setDelimiter("_") # Default is space


finisher = Finisher() \
    .setInputCols(["ngram"]) \
    .setOutputCols(['tokens'])\

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [38]:
pipeline = Pipeline(stages=[
    document_assembler, 
#     sentence_detector, 
    tokenizer, 
    normalizer, 
    stopwords_cleaner,
    lemmatizer,
    stemmer,
#     word_embeddings,
#     sentence_embeddings,
    ngrams_cum,
    finisher
])

In [39]:
result = pipeline.fit(df).transform(df)

In [40]:
result = result.select(
        'status_id',
        'created_at',
        F.from_unixtime(F.unix_timestamp('created_at', 'yyyy-MM-dd')).cast('timestamp').alias('date'),
        F.explode("tokens").alias("word")
        )

In [41]:
result = result.groupBy(['word', 'date'])\
    .agg(F.countDistinct("status_id").alias("count"))

In [42]:
result_df = result.toPandas()
result_df

Unnamed: 0,word,date,count
0,ufc,2020-03-13,1
1,popular,2020-03-13,1
2,sport,2020-03-13,15
3,entir,2020-03-13,1
4,world,2020-03-13,17
...,...,...,...
6636,univers_amp,2020-03-13,1
6637,self,2020-03-13,2
6638,perhap,2020-03-13,1
6639,face_disciplin,2020-03-13,1


In [110]:
result.write\
    .option("encoding", "UTF-8")\
    .option("delimiter", ",")\
    .option("parserLib", "univocity")\
    .option("multiLine", "true")\
    .option("escape", "\"")\
    .mode("overwrite")\
    .parquet("word_count.parquet")

In [None]:
result = spark.read\
    .option("encoding", "UTF-8")\
    .option("delimiter", ",")\
    .option("parserLib", "univocity")\
    .option("multiLine", "true")\
    .option("escape", "\"")\
    .parquet("word_count.parquet")

In [119]:
result2 = result.groupby('word')\
    .agg(F.sum('count').alias('count'))\
    .orderBy(F.desc('count'))

In [120]:
words = [row[0] for row in result2.limit(1000).select('word').collect()]
if 'date' in words:
    words.remove('date')

In [121]:
result3 = result.groupBy('date').pivot('word', words).max('count').orderBy('date').fillna(0)

In [122]:
result3.toPandas()

Unnamed: 0,date,coronaviru,covid,coronaviruspandem,peopl,get,go,ne,time,spread,...,french,egg,pastri_chef,easteregg_coronaviru,ncaatourna,hour_thegameplan,nfl_ncaa,railwai,rise_coronaviru,effort_bid
0,2020-03-13,239,143,77,32,29,27,24,24,23,...,1,1,1,1,1,1,1,1,1,1


In [123]:
df_cases = spark.read.csv('covid19.csv', header=True)

In [135]:
df_cases = df_cases.filter(F.col('prname') == 'Canada')\
    .select(
        F.from_unixtime(F.unix_timestamp('date', 'dd-MM-yyyy')).cast('timestamp').alias('date'),
        F.col('numconf').cast('Long'),
    )

In [136]:
result4 = result3.join(df_cases, on='date', how='inner')

In [137]:
result4.stat.corr("coronaviru", "numconf")

nan

In [140]:
sc = SparkContext.getOrCreate()

In [148]:
result4.select("coronaviru").collect()

[Row(coronaviru=239)]

In [150]:
sc.parallelize(words).map(lambda w: result4.select(w).toPandas()).collect()

Traceback (most recent call last):
  File "/Users/tingwei758/opt/anaconda3/envs/pyspark/lib/python3.7/site-packages/pyspark/serializers.py", line 597, in dumps
    return cloudpickle.dumps(obj, 2)
  File "/Users/tingwei758/opt/anaconda3/envs/pyspark/lib/python3.7/site-packages/pyspark/cloudpickle.py", line 863, in dumps
    cp.dump(obj)
  File "/Users/tingwei758/opt/anaconda3/envs/pyspark/lib/python3.7/site-packages/pyspark/cloudpickle.py", line 260, in dump
    return Pickler.dump(self, obj)
  File "/Users/tingwei758/opt/anaconda3/envs/pyspark/lib/python3.7/pickle.py", line 437, in dump
    self.save(obj)
  File "/Users/tingwei758/opt/anaconda3/envs/pyspark/lib/python3.7/pickle.py", line 504, in save
    f(self, obj) # Call unbound method with explicit self
  File "/Users/tingwei758/opt/anaconda3/envs/pyspark/lib/python3.7/pickle.py", line 789, in save_tuple
    save(element)
  File "/Users/tingwei758/opt/anaconda3/envs/pyspark/lib/python3.7/pickle.py", line 504, in save
    f(self, o

PicklingError: Could not serialize object: Py4JError: An error occurred while calling o49060.__getstate__. Trace:
py4j.Py4JException: Method __getstate__([]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:274)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)

