<a href="https://colab.research.google.com/github/zw2497/Twitter_Stream_Processing/blob/master/PySpark_Structured_Streaming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Java, Spark, and Findspark
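This downloads and unpacks Apache Spark 2.4.2. The commands that install Java 8 and [Findspark](https://github.com/minrk/findspark) (a library that makes it easy for Python to find Spark) are commented out; uncomment them if your environment does not already provide them.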

In [1]:
# !apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.2/spark-2.4.2-bin-hadoop2.7.tgz
!tar xf spark-2.4.2-bin-hadoop2.7.tgz
# !pip -q install findspark

#### Set Environment Variables
Set the locations where Spark and Java are installed.

In [None]:
import os

# Point the JVM and Spark tooling at their install locations in one step.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-2.4.2-bin-hadoop2.7",
    }
)

# Start a SparkSession
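This cell does not start Spark itself: it writes `ldskafkaclient.py`, a Structured Streaming job that creates a Spark session, reads JSON records from the Kafka topic `tweepyv1`, counts them by their `source` field, and writes the running counts to the Kafka topic `source`. The following cell runs the script with `spark-submit`.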

In [32]:
%%writefile ./ldskafkaclient.py
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import *


# Create the session for this job (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName("StructuredNetworkWordCount")
    .getOrCreate()
)

# Only FATAL-level Spark log messages are emitted from here on.
spark.sparkContext.setLogLevel("FATAL")

# Subscribe to the "tweepyv1" Kafka topic as a streaming DataFrame.
#   startingOffsets=latest -> only records arriving after the query starts are read
#   failOnDataLoss=false   -> the query is not failed when data may have been lost
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "35.243.144.79:9092")
    .option("subscribe", "tweepyv1")
    .option("startingOffsets", "latest")
    .option("failOnDataLoss", "false")
    .load()
)

# No projection is applied here: DataFrames are immutable, so a selectExpr whose
# result is not assigned has no effect. The raw `value` column is decoded by the
# SQL query that consumes this stream.

print("The data type is stream: ", df.isStreaming, " Start receiving ... ...")

# Expose the raw Kafka stream to Spark SQL under the view name "pre".
df.createOrReplaceTempView("pre")

# Decode each record's payload as UTF-8 JSON, pull out its '$.source' field,
# and count records per source, largest count first.
source_count_sql = """select get_json_object(decode(value, 'utf-8'), '$.source') as key, count(*) as value
        from pre
        group by get_json_object(decode(value, 'utf-8'), '$.source')
        order by value DESC"""
df = spark.sql(source_count_sql)

# df.createOrReplaceTempView("pre");
# df = spark.sql(
#     """select regexp_extract(get_json_object(decode(value, 'utf-8'), '$.source'), "[^>]*>([^<]*)<[^<]", 1) as source, count(*) as count
#         from pre
#         group by regexp_extract(get_json_object(decode(value, 'utf-8'), '$.source'), "[^>]*>([^<]*)<[^<]", 1)
#         order by count DESC""");


# df.createOrReplaceTempView("tag");
# df = spark.sql("""select get_json_object(entities, '$.hashtags[0].text') as hashtag, timestamp 
#                     from tag 
#                     where length(get_json_object(entities, '$.hashtags[0].text')) > 2""");


# df.createOrReplaceTempView("win");
# df = spark.sql("""
# select concat_ws(' ', hashtag, string(count(*))) as value, now() as key
# from win
# group by window(timestamp, "10 seconds", "5 seconds"), hashtag
# """)






"""
Write to Console
"""
# query1 = df \
#     .writeStream \
#     .outputMode("complete") \
#     .format("console") \
#     .option("truncate", "False") \
#     .option("checkpointLocation", "./logclient") \
#     .start()

"""
Write to kafka
"""
# The Kafka sink reads the `key` and `value` columns, so cast both to strings first.
kafka_ready = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

# "complete" output mode writes the whole aggregated result on every trigger;
# progress is tracked in the ./logclient checkpoint directory.
query1 = (
    kafka_ready.writeStream
    .format("kafka")
    .outputMode("complete")
    .option("kafka.bootstrap.servers", "35.243.144.79:9092")
    .option("topic", "source")
    .option("checkpointLocation", "./logclient")
    .start()
)

# Keep the driver alive until the streaming query stops or fails.
query1.awaitTermination()

Overwriting ./ldskafkaclient.py


In [None]:
# Run the generated script with the spark-submit shipped in the Spark
# distribution unpacked earlier in this notebook.
# --packages resolves the Kafka connector for Structured Streaming at launch;
# its version (2.4.2) matches the Spark release in use.
!./spark-2.4.2-bin-hadoop2.7/bin/spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:2.4.2 ldskafkaclient.py

Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/spark-2.4.2-bin-hadoop2.7/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ace7b27d-4c9e-45a7-84a1-63cde34d12c3;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;2.4.2 in central
	found org.apache.kafka#kafka-clients;2.0.0 in central
	found org.lz4#lz4-java;1.4.0 in central
	found org.xerial.snappy#snappy-java;1.1.7.3 in central
	found org.slf4j#slf4j-api;1.7.16 in central
	found org.spark-project.spark#unused;1.0.0 in central
:: resolution report :: resolve 1043ms :: artifacts dl 28ms
	:: modules in use:
	org.apache.kafka#kafka-clients;2.0.0 from central in [default]
	org.apache.spark#spark-sql-kafka-0-10_2.12;2.4.2 from central in [default]
	org.lz4#lz4-java;1