In [10]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [11]:
# Create (or reuse) the SparkSession for this notebook. The spark-nlp
# artifact is requested through `spark.jars.packages` when the session starts.
spark = (
    SparkSession.builder
    .appName('Tweets Word Count')
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")
    .getOrCreate()
)

In [12]:
# Show the SparkSession object as this cell's output.
spark

In [13]:
# Load the raw tweets from the "tweets" path as CSV with a header row.
# No schema is given, so column types are whatever the CSV reader assigns.
# The legacy spark-csv "parserLib" option was dropped: the built-in CSV
# reader used by `spark.read.csv` does not read it.
df = (
    spark.read
    .option("encoding", "UTF-8")
    .option("delimiter", ",")
    # Allow a quoted field to span several physical lines
    # (presumably tweet text contains embedded newlines - TODO confirm).
    .option("multiLine", "true")
    # Use '"' as the escape character, i.e. quotes inside a quoted field are doubled.
    .option("escape", "\"")
    .csv("tweets", header=True)
)

In [10]:
# df.limit(10).toPandas()

Unnamed: 0,status_id,user_id,created_at,text,lang
0,1238253442063310848,532343475,2020-03-13T00:00:00Z,The UFC is about to be the most popular sport ...,en
1,1238253441778098177,165879150,2020-03-13T00:00:00Z,The great toilet paper depression of 2020 #Toi...,en
2,1238253440486313988,569242704,2020-03-13T00:00:00Z,The 'Spotlight Show' with @janeyleegrace on @u...,en
3,1238253439051870208,16368021,2020-03-13T00:00:00Z,Because we all the time in the world right? @s...,en
4,1238253440821649408,1057148786189824000,2020-03-13T00:00:00Z,French pastry chef shows off Easter eggs model...,en
5,1238253442034020354,1093544067219292161,2020-03-13T00:00:00Z,ICYMI - Hour 2 of #TheGamePlan with @DaveWNSP ...,en
6,1238253441564266496,39743812,2020-03-13T00:00:00Z,"With rising #Coronavirus cases in India, which...",en
7,1238253441517928448,17852186,2020-03-13T00:00:00Z,#ICYMI: #Ontario #MPPs may temporarily suspend...,en
8,1238253440603541504,1454687180,2020-03-13T00:00:00Z,Despite having only 3 confirmed #coronavirus c...,en
9,1238253440461135873,19047089,2020-03-13T00:00:00Z,Autonomous #Robots Are Helping Kill #Coronavir...,en


In [20]:
# Keyword flags to compute per tweet: (flag name, regex) pairs.
# Each regex wraps its lower-case keyword(s) in a single capture group and
# pads both sides with `.*`; insertion order determines the column order
# of any flags derived from this mapping.
patterns = dict([
    ('covid', '.*(covid|coronavirus).*'),
    ('spread', '.*(spread).*'),
    ('mask', '.*(mask).*'),
    ('social_distancing', '.*(social distanc).*'),
    ('trump', '.*(trump).*'),
    ('canada', '.*(canada).*'),
    ('trudeau', '.*(trudeau).*'),
    ('close', '.*(close).*'),
])

In [21]:
# Build one row per tweet: id, raw timestamp string, calendar date, text,
# and one 0/1 integer column `w_<name>` per entry in `patterns`.
result = df.select(
        'status_id',
        'created_at',
        # `created_at` looks like '2020-03-13T00:00:00Z'. Parse only the
        # 10-character date prefix instead of relying on the legacy parser
        # silently ignoring the trailing 'T00:00:00Z' (stricter parsers reject
        # that). The cast keeps the column a timestamp at midnight, as before.
        F.to_date(F.substring('created_at', 1, 10), 'yyyy-MM-dd').cast('timestamp').alias('date'),
        'text',
        # `rlike` is a regex search on the lower-cased text; it yields
        # true/false (null for null text), which the cast turns into 1/0/null.
        *[F.lower(F.col("text")).rlike(p).cast('integer').alias('w_' + k) for k, p in patterns.items()]
    )

In [22]:
# Preview the first 10 rows of `result` as a pandas DataFrame.
result.limit(10).toPandas()

Unnamed: 0,status_id,created_at,date,text,w_covid,w_spread,w_mask,w_social_distancing,w_trump,w_canada,w_trudeau,w_close
0,1238253442063310848,2020-03-13T00:00:00Z,2020-03-13,The UFC is about to be the most popular sport ...,1,0,0,0,0,0,0,0
1,1238253441778098177,2020-03-13T00:00:00Z,2020-03-13,The great toilet paper depression of 2020 #Toi...,1,0,0,0,0,0,0,0
2,1238253440486313988,2020-03-13T00:00:00Z,2020-03-13,The 'Spotlight Show' with @janeyleegrace on @u...,1,0,0,0,0,0,0,0
3,1238253439051870208,2020-03-13T00:00:00Z,2020-03-13,Because we all the time in the world right? @s...,1,0,0,0,0,0,0,0
4,1238253440821649408,2020-03-13T00:00:00Z,2020-03-13,French pastry chef shows off Easter eggs model...,1,0,0,0,0,0,0,0
5,1238253442034020354,2020-03-13T00:00:00Z,2020-03-13,ICYMI - Hour 2 of #TheGamePlan with @DaveWNSP ...,1,0,0,0,0,0,0,0
6,1238253441564266496,2020-03-13T00:00:00Z,2020-03-13,"With rising #Coronavirus cases in India, which...",1,1,0,0,0,0,0,0
7,1238253441517928448,2020-03-13T00:00:00Z,2020-03-13,#ICYMI: #Ontario #MPPs may temporarily suspend...,1,0,0,0,0,0,0,0
8,1238253440603541504,2020-03-13T00:00:00Z,2020-03-13,Despite having only 3 confirmed #coronavirus c...,1,0,0,0,0,0,0,0
9,1238253440461135873,2020-03-13T00:00:00Z,2020-03-13,Autonomous #Robots Are Helping Kill #Coronavir...,1,0,0,0,0,0,0,0


In [121]:
# Persist the flagged tweets as Parquet, replacing any previous output at
# this path. The CSV reader options (encoding, delimiter, parserLib,
# multiLine, escape) were removed: they have no meaning for the Parquet writer.
(
    result.write
    .mode("overwrite")
    .parquet("word_counts.parquet")
)

KeyboardInterrupt: 