In [1]:
# Example from: https://gist.github.com/rmoff/fb033086b285655ffe7f9ff0582dedbf
import os
# Hand PySpark the Maven coordinates of the spark-streaming-kafka-0-8 package
# through the PYSPARK_SUBMIT_ARGS environment variable.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join([
    '--packages',
    'org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2',
    'pyspark-shell',
])

In [2]:

#    Spark
from pyspark import SparkContext
#    Spark Streaming
from pyspark.streaming import StreamingContext
#    Kafka
from pyspark.streaming.kafka import KafkaUtils
#    json parsing
import json

In [3]:
# Spark context for this notebook; only WARN-level (and above) log messages
# are kept so the streaming output stays readable.
sc = SparkContext(appName='PythonSparkStreamingKafka_RM_01')
sc.setLogLevel('WARN')

In [4]:
# Streaming context layered on the Spark context above.
# batchDuration is expressed in seconds, so each batch covers one minute.
ssc = StreamingContext(sc, batchDuration=60)

In [5]:
# Receiver-based Kafka stream, located through ZooKeeper.
# Reads the 'twitter' topic as consumer group 'spark-streaming'; the topics
# dict maps each topic name to the number of partitions to consume.
kafkaStream = KafkaUtils.createStream(
    ssc,
    zkQuorum='cdh57-01-node-01.moffatt.me:2181',
    groupId='spark-streaming',
    topics={'twitter': 1},
)

Message Processing
Parse the inbound message as json
The inbound stream is a DStream, which supports various built-in transformations such as map, which is used here to parse the inbound messages from their native JSON format.

Note that this will fail horribly if the inbound message isn't valid JSON.

In [6]:
# Decode element 1 of every incoming record from JSON text into Python objects
# (presumably the message value of a (key, value) pair -- confirm against the
# KafkaUtils.createStream documentation).
parsed = kafkaStream.map(
    lambda record: json.loads(record[1])
)

In [7]:
# Report how many tweets each batch contained.
(
    parsed
    .count()
    .map(lambda total: 'Tweets in this batch: %s' % total)
    .pprint()
)

# Extract Author name from each tweet
Tweets come through in a JSON structure, of which you can see an example here. We're going to analyse tweets by author, which is accessible in the JSON structure at user.screen_name.

The lambda anonymous function is used to apply the map to each RDD within the DStream. The result is a DStream holding just the author's screen name for each tweet in the original DStream.



In [8]:
# Reduce every parsed tweet to its author's screen name (user.screen_name).
authors_dstream = parsed.map(
    lambda status: status['user']['screen_name']
)

In [13]:
# Tally, batch by batch, how many tweets each author posted.
author_counts = authors_dstream.countByValue()

# Bare expression: lets the notebook echo the resulting DStream object.
author_counts

<pyspark.streaming.dstream.TransformedDStream at 0x2167ed58d68>

In [14]:
# Order each batch's (author, count) pairs by count, largest first.
# Negating the count turns sortBy's ascending order into a descending one.
author_counts_sorted_dstream = author_counts.transform(
    lambda batch_rdd: batch_rdd.sortBy(lambda pair: -pair[1])
)

In [16]:
# Show the first ten rows of the sorted author counts for every batch
# (ten is pprint's default, spelled out here for the reader).
author_counts_sorted_dstream.pprint(num=10)

In [17]:
# Top five authors per batch: take the first five rows of the already-sorted
# RDD and wrap that small list back into an RDD, because transform must
# return an RDD.
top_five_authors = author_counts_sorted_dstream.transform(
    lambda sorted_rdd: sc.parallelize(sorted_rdd.take(5))
)
top_five_authors.pprint()

In [18]:
# Keep authors who tweeted more than once in the batch, or whose username
# starts with 'rm' (compared case-insensitively).
filtered_authors = author_counts.filter(
    lambda pair: (
        pair[1] > 1
        or pair[0].lower().startswith('rm')
    )
)

In [19]:
# Print the filtered authors for each batch, highest tweet count first.
(
    filtered_authors
    .transform(lambda batch_rdd: batch_rdd.sortBy(lambda pair: -pair[1]))
    .pprint()
)

In [20]:
# Most common words per batch: split each tweet's text on single spaces,
# count every distinct token, then print the counts in descending order.
(
    parsed
    .flatMap(lambda status: status['text'].split(" "))
    .countByValue()
    .transform(lambda batch_rdd: batch_rdd.sortBy(lambda pair: -pair[1]))
    .pprint()
)

Start the streaming context
Having defined the streaming context, now we're ready to actually start it! When you run this cell, the program will start, and you'll see the result of all the pprint functions above appear in the output to this cell below. If you're running it outside of Jupyter (via spark-submit) then you'll see the output on stdout.

I've added a timeout to deliberately cancel the execution after three minutes. In practice, you would not set this :)

In [21]:
# Start receiving and processing data; the pprint() outputs defined in the
# earlier cells are printed while the streaming context is running.
ssc.start()

# Wait for at most three minutes (180 seconds), then stop the streaming job so
# this demo does not keep running indefinitely. The timeout only bounds the
# wait -- stop() is what actually ends the stream. The underlying SparkContext
# is deliberately left alive so `sc` remains usable afterwards.
ssc.awaitTermination(timeout=180)
ssc.stop(stopSparkContext=False, stopGraceFully=False)