# In colab, please uncomment these commands

This installs Apache Spark 2.4.0, Java 8, and Findspark, a library that makes it easy for Python to find Spark.

In [None]:
# !apt-get install openjdk-8-jdk-headless -qq > /dev/null
# !wget -q https://www-us.apache.org/dist/spark/spark-2.4.2/spark-2.4.2-bin-hadoop2.7.tgz
# !tar xf spark-2.4.2-bin-hadoop2.7.tgz
# !pip -q install findspark

# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "./spark-2.4.2-bin-hadoop2.7"

In [1]:
!pip install -U textblob
!python -m textblob.download_corpora

Collecting textblob
[?25l  Downloading https://files.pythonhosted.org/packages/60/f0/1d9bfcc8ee6b83472ec571406bd0dd51c0e6330ff1a51b2d29861d389e85/textblob-0.15.3-py2.py3-none-any.whl (636kB)
[K    100% |████████████████████████████████| 645kB 22.0MB/s 
[?25hCollecting nltk>=3.1 (from textblob)
[?25l  Downloading https://files.pythonhosted.org/packages/73/56/90178929712ce427ebad179f8dc46c8deef4e89d4c853092bee1efd57d05/nltk-3.4.1.zip (3.1MB)
[K    100% |████████████████████████████████| 3.1MB 8.3MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/97/8a/10/d646015f33c525688e91986c4544c68019b19a473cb33d3b55
Successfully built nltk
Installing collected packages: nltk, textblob
Successfully installed nltk-3.4.1 textblob-0.15.3
[nltk_data] Downloading package brown to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /h

# Tweeter Topk 
Top k popular hashtags in two different windows, post sentiment analysis, trend detection

In [10]:
from pyspark.sql.functions import udf, get_json_object, explode, window, concat_ws, count, avg
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql.functions import pandas_udf, PandasUDFType

from textblob import TextBlob

from sklearn import linear_model

import pandas as pd
import numpy as np

import time
import os
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local --packages org.apache.spark:spark-sql-kafka-0-10_2.12:2.4.2 pyspark-shell"

In [2]:
spark = SparkSession \
    .builder \
    .appName("StructuredNetworkWordCount") \
    .getOrCreate()
spark.sparkContext.setLogLevel('FATAL')

In [81]:
df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "35.243.144.79:9092") \
  .option("subscribe", "tweepyv1") \
  .option("startingOffsets", "latest") \
  .option("failOnDataLoss", "false") \
  .load()

df.createOrReplaceTempView("raw")
df = spark.sql("""select decode(value, 'utf-8') as value, timestamp 
                  from raw""");

In [82]:
@udf(FloatType())
def senti(x):
    blob = TextBlob(x)
    s = []
    for sentence in blob.sentences:
        s.append(sentence.sentiment.polarity)
    return sum(s)/len(s)

In [83]:
df = df.select('timestamp',\
               get_json_object('value', '$.entities.hashtags[0].text').alias("hashtag"), \
               senti(get_json_object('value', '$.text')).alias("sentiment"))
df = df.filter(df.hashtag.isNotNull())

## Trend detection

In [31]:
dftrend = spark.sql("""
select distinct hashtag, count(*) as count_num, avg(sentiment) as sentiment, now() as timestamp
from datas
group by hashtag, window(timestamp, "120 seconds", "30 seconds")
""")

In [32]:
@pandas_udf("key string, value double", PandasUDFType.GROUPED_MAP)  # doctest: +SKIP
def trend_udf(key, pdf):
    reg = linear_model.LinearRegression()
    reg.fit(np.array(pd.to_datetime(pdf.timestamp).astype('int')).reshape(-1,1), np.array(pdf.count_num).reshape(-1,1))
    return pd.DataFrame([key + (reg.coef_[0][0],)])

dftrend = dftrend.groupby('hashtag').apply(trend_udf)

# Top k

In [93]:
dffast = df.withWatermark("timestamp", "0 second")\
        .groupby(df.hashtag, window("timestamp", "30 seconds"))\
        .agg(count('hashtag').alias('countn'), avg('sentiment').alias('sentiment'))

In [94]:
dffast.printSchema()

root
 |-- hashtag: string (nullable = true)
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- countn: long (nullable = false)
 |-- sentiment: double (nullable = true)



In [95]:
dffast = dffast.select(concat_ws(' ', dffast.hashtag, dffast.countn, dffast.sentiment).alias('value'), dffast.window.alias('key'))

In [96]:
dffast.printSchema()

root
 |-- value: string (nullable = false)
 |-- key: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)



In [34]:
dfslow = spark.sql("""
select distinct concat_ws(' ',hashtag, count(*), avg(sentiment)) as value, now() as key
from datas
group by hashtag, window(timestamp, "600 seconds", "120 seconds")
""")

In [35]:
dffast = spark.sql("""
select distinct concat_ws(' ',hashtag, count(*), avg(sentiment)) as value, now() as key
from datas
group by hashtag, window(timestamp, "30 seconds", "5 seconds")
""")

In [36]:
query = dftrend.writeStream.outputMode("complete").queryName("trend").format("memory").option("truncate", "False").start()
query1 = dfslow.writeStream.outputMode("complete").queryName("slow").format("memory").option("truncate", "False").start()
query2 = dffast.writeStream.outputMode("complete").queryName("fast").format("memory").option("truncate", "False").start()

In [99]:
print(query2.status)

{'message': 'Processing new data', 'isDataAvailable': True, 'isTriggerActive': True}


In [40]:
print(query.status)
print(query1.status)
print(query2.status)

{'message': 'Processing new data', 'isDataAvailable': True, 'isTriggerActive': True}
{'message': 'Processing new data', 'isDataAvailable': True, 'isTriggerActive': True}
{'message': 'Processing new data', 'isDataAvailable': True, 'isTriggerActive': True}


In [41]:
res = spark.table("trend").toPandas()
res1 = spark.table("slow").toPandas()
res2 = spark.table("fast").toPandas()

In [42]:
res2

Unnamed: 0,value,key


In [123]:
res

Unnamed: 0,key,value
0,EndGame,0.0
1,KentuckyDerby,0.0
2,Germany,0.0
3,PrudentialCenterROCK,0.0
4,deznat,0.0
5,ATXWX,0.0
6,PeyTen,0.0
7,USGS303401097374700,0.0
8,LACED,0.0
9,7minutes,0.0


In [124]:
res1

Unnamed: 0,value,key
0,TedBundy 1 -0.03750000149011612,2019-05-04 02:26:00.852
1,SoundHound 1 0.0,2019-05-04 02:26:00.852
2,SSNCT 1 0.5,2019-05-04 02:26:00.852
3,foodphotography 1 0.0,2019-05-04 02:26:00.852
4,pomona 1 0.0,2019-05-04 02:26:00.852
5,BoomerSooner 1 0.0,2019-05-04 02:26:00.852
6,Houston 1 0.06818182021379471,2019-05-04 02:26:00.852
7,RealTime 1 0.10000000149011612,2019-05-04 02:26:00.852
8,7minutes 1 0.0,2019-05-04 02:26:00.852
9,goodNIGHTTwitterWorld 1 0.0,2019-05-04 02:26:00.852


In [26]:
res2

Unnamed: 0,value,key
0,CadetBoneSpurs 1 0.0,2019-05-05 03:24:53.085
1,challenge 1 -0.3888888955116272,2019-05-05 03:24:53.085
2,MayThe4thBeWithYou 1 0.30000001192092896,2019-05-05 03:24:53.085
3,xslasvegas 1 0.0,2019-05-05 03:24:53.085
4,lanaparrilla 1 0.4333333373069763,2019-05-05 03:24:40.317
5,Cruise 1 0.0625,2019-05-05 03:24:24.540
6,hiphop 1 0.0,2019-05-05 03:24:24.540
7,NBAPlayoffs 2 -0.20000000298023224,2019-05-05 03:24:53.085
8,GiFs 1 0.0,2019-05-05 03:24:53.085
9,Rockets 3 -0.017283950001001358,2019-05-05 03:24:40.317


In [127]:
query = dftrend \
  .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
  .writeStream \
  .format("kafka") \
  .outputMode("complete") \
  .option("kafka.bootstrap.servers", "35.243.144.79:9092") \
  .option("topic", "trend") \
  .option("checkpointLocation", "./logtrend") \
  .start()

query1 = dfslow \
  .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
  .writeStream \
  .format("kafka") \
  .outputMode("update") \
  .option("kafka.bootstrap.servers", "35.243.144.79:9092") \
  .option("topic", "slow") \
  .option("checkpointLocation", "./logslow") \
  .start()

query2 = dffast \
  .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
  .writeStream \
  .format("kafka") \
  .outputMode("update") \
  .option("kafka.bootstrap.servers", "35.243.144.79:9092") \
  .option("topic", "fast") \
  .option("checkpointLocation", "./logfast") \
  .start()

In [57]:
query1 = dfslow \
  .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
  .writeStream \
  .format("kafka") \
  .outputMode("complete") \
  .option("kafka.bootstrap.servers", "35.243.144.79:9092") \
  .option("topic", "slow") \
  .option("checkpointLocation", "./logslowtest") \
  .start()

In [101]:
query2 = dffast \
  .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
  .writeStream \
  .format("kafka") \
  .outputMode("update") \
  .option("kafka.bootstrap.servers", "35.243.144.79:9092") \
  .option("topic", "fast") \
  .option("checkpointLocation", "./logfast") \
  .start()

In [43]:
query.stop()
query1.stop()
query2.stop()

In [102]:
query2.stop()

In [59]:
query1.stop()