In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Execution target for this notebook. True submits the job to the YARN
# cluster; False starts a session with the local/test configuration.
# Flip this flag instead of commenting blocks of code in and out.
USE_CLUSTER = True

if USE_CLUSTER:
    # cluster execution: 2 executors, 2 cores and 2048M of memory each
    spark = (
        SparkSession.builder
        .master("yarn")
        .appName("Task3")
        .config("spark.executor.instances", "2")
        .config("spark.executor.cores", "2")
        .config("spark.executor.memory", "2048M")
        .getOrCreate()
    )
else:
    # local execution: Hive support enabled and the driver result-size
    # limit set to 2g
    spark = (
        SparkSession.builder
        .enableHiveSupport()
        .config(conf=SparkConf().set("spark.driver.maxResultSize", "2g"))
        .appName("test")
        .getOrCreate()
    )

# SparkContext of the active session.
sc = spark.sparkContext

22/05/13 21:07:20 WARN Utils: Your hostname, acer resolves to a loopback address: 127.0.1.1; using 192.168.0.40 instead (on interface wlp4s0)
22/05/13 21:07:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/13 21:07:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/13 21:07:21 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/13 21:07:21 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
from pyspark.sql.types import StructType, StringType, StructField, IntegerType, DateType, LongType
from pyspark.sql.functions import *


# Column name -> Spark type for each tweet record, in schema order.
_TWEET_COLUMNS = [
    ("account_created_at", StringType()),
    ("account_lang", StringType()),
    ("country_code", StringType()),
    ("created_at", DateType()),
    ("favourites_count", IntegerType()),
    ("followers_count", IntegerType()),
    ("friends_count", IntegerType()),
    ("is_quote", StringType()),
    ("is_retweet", StringType()),
    ("lang", StringType()),
    ("place_full_name", StringType()),
    ("place_type", StringType()),
    ("reply_to_screen_name", StringType()),
    ("reply_to_status_id", StringType()),
    ("reply_to_user_id", StringType()),
    ("retweet_count", IntegerType()),
    ("screen_name", StringType()),
    ("source", StringType()),
    ("status_id", StringType()),
    ("text", StringType()),
    ("user_id", StringType()),
    ("verified", StringType()),
]

# Explicit schema passed to the JSON reader; every field is nullable.
tweet_schema = StructType(
    [StructField(column, dtype, True) for column, dtype in _TWEET_COLUMNS]
)
# Load the tweets with the explicit schema.
# For a local run, read "./covid.json" instead of the HDFS directory.
df = spark.read.json("hdfs:/datasets/covid/", schema=tweet_schema)

# Case-insensitive hashtag matching on the tweet text.
text_lc = lower(df.text)
mentions_virus = text_lc.contains("#corona") | text_lc.contains("#covid")
says_infected = text_lc.contains("#infected")
has_self_report_tag = (
    text_lc.contains("#ihavecorona") | text_lc.contains("#ihavecovid")
)

# Keep a tweet when it combines a virus hashtag with #infected, or when it
# carries one of the explicit "#ihave..." hashtags.
tweet_columns = df.select(df.screen_name, df.created_at, df.text)
tf = tweet_columns.filter((mentions_virus & says_infected) | has_self_report_tag)

# Add the posting date as Unix seconds.
tf = tf.withColumn('utimestamp', unix_timestamp('created_at', 'yyyy-MM-dd'))
tf.show()

[Stage 3:>                                                          (0 + 4) / 4]

+---------------+----------+--------------------+----------+
|    screen_name|created_at|                text|utimestamp|
+---------------+----------+--------------------+----------+
|     marykkeown|2020-03-31|.@PublicHealthSD ...|1585623600|
| _CeeCeeElaine_|2020-03-31|Reading, writing,...|1585623600|
| Stevevillano11|2020-03-31|@morethanmySLE Ma...|1585623600|
|christalkstoyou|2020-03-31|When would a pand...|1585623600|
|       Myananoa|2020-03-31|#StopAiringTrump ...|1585623600|
|KarenRi25596554|2020-03-31|Today, the world ...|1585623600|
|      Artstrada|2020-03-31|Imagine if #Ameri...|1585623600|
|  ArchivesofMed|2020-03-31|Reduce the spread...|1585623600|
|        spyblog|2020-03-31|@frank_rieger @al...|1585623600|
|      WATLEYIII|2020-03-31|@5589com #coronav...|1585623600|
|     Waqtnewstv|2020-03-31|ایران میں 35 دن ک...|1585623600|
|     Waqtnewstv|2020-03-31|شاہ سلمان نے کرون...|1585623600|
|    SKSingh_dav|2020-03-31|@DrDBhasin How th...|1585623600|
|josephthavaraja|2020-03



In [None]:
from pyspark.sql import Window
# Width of the moving-average window in calendar days, current day included.
WINDOW_DAYS = 7

# Whole-day index of each row (days since 1970-01-01). Ordering the window by
# a day count instead of Unix seconds keeps the frame exactly WINDOW_DAYS wide
# even if the session time zone has days that are not 86400 seconds long.
day_index = datediff(col("created_at"), lit("1970-01-01").cast("date"))

# The window must NOT be partitioned by the day: each day is a single row
# after the aggregation below, so a per-day partition would make the "moving
# average" equal to the day's own count. The frame covers the current day and
# the WINDOW_DAYS - 1 days before it.
# Spark warns that an unpartitioned window moves all rows to one partition;
# that is acceptable here because there is only one row per day.
w = Window.orderBy(day_index).rangeBetween(-(WINDOW_DAYS - 1), 0)

# Number of distinct users who posted a matching tweet on each day.
ttf = tf.groupby("utimestamp", "created_at").agg(countDistinct("screen_name").alias("count"))

# Average of the daily counts over the trailing window. Days without any
# matching tweet have no row and therefore do not contribute a zero.
ttf = ttf.withColumn('moving_average', avg("count").over(w))
ttf.show()

In [None]:
import matplotlib.pyplot as plt

# Build the (date, moving average) series once, in chronological order, and
# use that same DataFrame for both the CSV export and the plot.
series = ttf.select("created_at", "moving_average").orderBy("created_at")

# Export as a single header-less CSV part file under out3/.
series.coalesce(1).write.mode('overwrite').option('header','false').csv('out3/')

# Collect both columns in ONE job so that dates and averages stay paired row
# by row; two separate collect() calls carry no guarantee of matching order.
rows = series.collect()
time_dates = [row.created_at for row in rows]
moving_averages = [row.moving_average for row in rows]

def plot_time_series(times, data):
    """Draw `data` against `times` as a single line chart and show it.

    Args:
        times: x-axis values (the posting dates).
        data: y-axis values (the moving averages), same length as `times`.
    """
    fig, ax = plt.subplots(figsize=(13, 10))
    ax.plot(times, data, linewidth=3)

    ax.set_title('Moving average for people who posted using #ihavecorona', fontsize=25)
    ax.set_xlabel('Date')
    ax.set_ylabel('7 days moving average')

    fig.tight_layout()
    plt.show()

# Render the moving-average series collected from `ttf` above.
plot_time_series(time_dates, moving_averages)