In [2]:
from pyspark.streaming import StreamingContext

# Create a StreamingContext with a batch interval of 5 seconds
ssc = StreamingContext(sc, 5)

# Create a DStream that reads text lines from localhost, port 9999.
# Start a Netcat server first: nc -lk 9999
lines = ssc.socketTextStream('localhost', 9999)

# Split each line into words
words = lines.flatMap(lambda line: line.split(" "))

# Count each word within each batch
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Print the first ten elements of each RDD generated in these DStreams to the console
lines.pprint()
wordCounts.pprint()

ssc.start()  # Start the computation
try:
    # Single-argument print() behaves identically on Python 2 and Python 3
    print("Start")
    ssc.awaitTermination(20)  # Block for at most 20 seconds
finally:
    # Always stop the StreamingContext (keeping the SparkContext alive), even if
    # the wait raises or the kernel is interrupted; otherwise the context stays
    # active and a later cell cannot start a new one.
    ssc.stop(stopSparkContext=False)

print("Finished")

Start
-------------------------------------------
Time: 2018-11-12 20:08:55
-------------------------------------------

-------------------------------------------
Time: 2018-11-12 20:08:55
-------------------------------------------

-------------------------------------------
Time: 2018-11-12 20:09:00
-------------------------------------------
a a a a a
b b a a a

-------------------------------------------
Time: 2018-11-12 20:09:00
-------------------------------------------
(u'a', 8)
(u'b', 2)

-------------------------------------------
Time: 2018-11-12 20:09:05
-------------------------------------------
a a a aa aa

-------------------------------------------
Time: 2018-11-12 20:09:05
-------------------------------------------
(u'a', 3)
(u'aa', 2)

-------------------------------------------
Time: 2018-11-12 20:09:10
-------------------------------------------

-------------------------------------------
Time: 2018-11-12 20:09:10
-------------------------------------------

F

In [3]:
from pyspark.streaming import StreamingContext

# Load the input text file, requesting 8 partitions
rdd = sc.textFile('../data/adj_noun_pairs.txt', 8)

# Randomly split the RDD into 5 parts using equal weights (seed 123), so the
# parts are only approximately equal in size; the list of RDDs acts as the queue
rddQueue = rdd.randomSplit([1,1,1,1,1], 123)

# Create a StreamingContext with a batch interval of 5 seconds
ssc = StreamingContext(sc, 5)

# Feed the RDD queue to a DStream
lines = ssc.queueStream(rddQueue)

# Count the words in each batch
words = lines.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Use transform() to apply RDD operations that are not directly available on
# DStreams; here every batch is sorted by count in descending order
topWords = wordCounts.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))
topWords.pprint()

ssc.start()  # Start the computation
try:
    ssc.awaitTermination(25)  # Block for at most 25 seconds
finally:
    # Stop the StreamingContext but not the SparkContext, even when the wait
    # fails or is interrupted, so that later cells can create a new context.
    ssc.stop(False)
print("Finished")

-------------------------------------------
Time: 2018-11-12 20:26:00
-------------------------------------------
(u'other', 15501)
(u'first', 10648)
(u'many', 9729)
(u'new', 6344)
(u'system', 5135)
(u'american', 4718)
(u'several', 4542)
(u'=', 4442)
(u'same', 4418)
(u'century', 4418)
...

-------------------------------------------
Time: 2018-11-12 20:26:05
-------------------------------------------
(u'other', 15128)
(u'first', 10625)
(u'many', 9593)
(u'new', 6201)
(u'system', 5091)
(u'american', 4838)
(u'century', 4571)
(u'several', 4502)
(u'=', 4410)
(u'same', 4329)
...

-------------------------------------------
Time: 2018-11-12 20:26:10
-------------------------------------------
(u'other', 15236)
(u'first', 10532)
(u'many', 9711)
(u'new', 6186)
(u'system', 5209)
(u'american', 4898)
(u'several', 4596)
(u'century', 4517)
(u'same', 4285)
(u'early', 4253)
...

-------------------------------------------
Time: 2018-11-12 20:26:15
-------------------------------------------
(u'other'

In [4]:
# Find the most positive words in each 5-second batch of streaming data

from pyspark.streaming import StreamingContext

def parse_line(l):
    """Turn one tab-separated line into a (word, score) pair.

    The first field is returned unchanged and the second field is converted
    to a float; any further fields are ignored.
    """
    fields = l.split("\t")
    word = fields[0]
    score = float(fields[1])
    return (word, score)

# Static RDD of (word, sentiment score) pairs; cached because it is joined
# with every batch of the stream
word_sentiments = sc.textFile("../data/AFINN-111.txt") \
                    .map(parse_line).cache()

# Stream the input file as 5 randomly split parts, one batch every 5 seconds
ssc = StreamingContext(sc, 5)
rdd = sc.textFile('../data/adj_noun_pairs.txt', 8)
rddQueue = rdd.randomSplit([1,1,1,1,1], 123)
lines = ssc.queueStream(rddQueue)

# Per-batch word counts
word_counts = lines.flatMap(lambda line: line.split(" ")) \
                   .map(lambda word: (word, 1)) \
                   .reduceByKey(lambda a, b: a + b)

# Determine the words with the highest sentiment values by joining the streaming RDD
# with the static RDD inside the transform() method and then multiplying
# the frequency of each word by its sentiment value.
# The join yields (word, (sentiment, count)) records; they are indexed explicitly
# instead of using tuple-parameter unpacking in the lambda, which is Python-2-only
# syntax and previously shadowed the builtin name `tuple`.
happiest_words = word_counts.transform(lambda rdd: word_sentiments.join(rdd)) \
                            .map(lambda kv: (kv[1][0] * kv[1][1], kv[0])) \
                            .transform(lambda rdd: rdd.sortByKey(False))

happiest_words.pprint()

ssc.start()
try:
    ssc.awaitTermination(25)  # Block for at most 25 seconds
finally:
    # Stop the StreamingContext (not the SparkContext) even if the wait fails
    # or is interrupted, so that later cells can create a new context.
    ssc.stop(False)
print("Finished")

-------------------------------------------
Time: 2018-11-12 20:34:45
-------------------------------------------
(7560.0, u'great')
(6144.0, u'popular')
(5487.0, u'best')
(4869.0, u'good')
(4136.0, u'important')
(2284.0, u'strong')
(2268.0, u'greater')
(2115.0, u'successful')
(1854.0, u'novel')
(1660.0, u'natural')
...

-------------------------------------------
Time: 2018-11-12 20:34:50
-------------------------------------------
(7989.0, u'great')
(6081.0, u'popular')
(5517.0, u'best')
(4668.0, u'good')
(4252.0, u'important')
(2250.0, u'strong')
(2190.0, u'greater')
(2061.0, u'successful')
(1964.0, u'novel')
(1839.0, u'natural')
...

-------------------------------------------
Time: 2018-11-12 20:34:55
-------------------------------------------
(7740.0, u'great')
(6090.0, u'popular')
(5424.0, u'best')
(4758.0, u'good')
(4212.0, u'important')
(2428.0, u'strong')
(2313.0, u'greater')
(1959.0, u'successful')
(1934.0, u'novel')
(1851.0, u'greatest')
...

------------------------------

In [5]:
from pyspark.streaming import StreamingContext

# Stateful word count

# Split the input file into 10 random parts (equal weights, seed 123)
rdd = sc.textFile(name='../data/adj_noun_pairs.txt', minPartitions=8)
rddQueue = rdd.randomSplit(weights=[1] * 10, seed=123)

# Streaming context with a 5-second batch interval
ssc = StreamingContext(sc, batchDuration=5)
# Stateful transformations require a checkpoint directory
ssc.checkpoint(directory="checkpoint")

# The queued RDDs are the input of the stream
lines = ssc.queueStream(rdds=rddQueue)

def updateFunc(newValues, runningCount):
    """Return the new running count for one key.

    newValues:    the values that arrived for the key in the current batch
    runningCount: the count accumulated so far, or None if there is none yet

    The new values are added on top of the previous running count
    (treated as 0 when it is None).
    """
    base = runningCount if runningCount is not None else 0
    return sum(newValues, base)

# Per-batch word counts are folded into a running count per word
running_counts = (
    lines.flatMap(lambda text: text.split(" "))
         .map(lambda token: (token, 1))
         .reduceByKey(lambda left, right: left + right)
         .updateStateByKey(updateFunc)
)

# Order every batch of running counts by count, largest first
counts_sorted = running_counts.transform(
    lambda batch: batch.sortBy(lambda pair: pair[1], False)
)

def printResults(rdd):
    """Print a summary of one batch of sorted running word counts.

    Prints the number of distinct words, the first five (word, count) pairs
    and the running count of the word 'refinery' (0 when the word is absent).

    rdd: pair RDD of (word, count) records, as handed over by foreachRDD.
    """
    # Three actions run on this RDD below; cache it so it is not recomputed
    rdd.cache()
    # Single-argument print() calls give the same output on Python 2 and 3
    print("Total distinct words:  %d" % rdd.count())
    print(rdd.take(5))
    # lookup() returns a list of values that is empty when the key is missing,
    # so do not index into it unconditionally
    matches = rdd.lookup('refinery')
    print("refinery: %s" % (matches[0] if matches else 0))
    rdd.unpersist()

# Report on every batch of sorted running counts
counts_sorted.foreachRDD(printResults)

ssc.start()
try:
    ssc.awaitTermination(50)  # Block for at most 50 seconds
finally:
    # Stop the StreamingContext (not the SparkContext) even if the wait fails
    # or is interrupted, so that later cells can create a new context.
    ssc.stop(False)
print("Finished")

Total distinct words:  51396
[(u'other', 7673), (u'first', 5348), (u'many', 4890), (u'new', 3206), (u'system', 2596)]
refinery: 2
Total distinct words:  76869
[(u'other', 15501), (u'first', 10648), (u'many', 9729), (u'new', 6344), (u'system', 5135)]
refinery: 7
Total distinct words:  97070
[(u'other', 22994), (u'first', 15880), (u'many', 14547), (u'new', 9468), (u'system', 7743)]
refinery: 11
Total distinct words:  114682
[(u'other', 30629), (u'first', 21273), (u'many', 19322), (u'new', 12545), (u'system', 10226)]
refinery: 14
Total distinct words:  130487
[(u'other', 38350), (u'first', 26539), (u'many', 24270), (u'new', 15619), (u'system', 12858)]
refinery: 18
Total distinct words:  144782
[(u'other', 45865), (u'first', 31805), (u'many', 29033), (u'new', 18731), (u'system', 15435)]
refinery: 22
Total distinct words:  158283
[(u'other', 53546), (u'first', 37076), (u'many', 33895), (u'new', 21750), (u'system', 17975)]
refinery: 25
Total distinct words:  170708
[(u'other', 61252), (u'fir

In [5]:
# Misra-Gries (MG) style algorithm for approximate word counts

from pyspark.streaming import StreamingContext

# Driver-side parameters and state shared by updateFunc and printResults
k = 10000            # a threshold is derived once more than k words are tracked
threshold = 0        # subtracted from every running count by updateFunc
total_decrement = 0  # sum of all thresholds applied so far

# Split the input file into 10 random parts (equal weights, seed 123)
rdd = sc.textFile(name='../data/adj_noun_pairs.txt', minPartitions=8)
rddQueue = rdd.randomSplit(weights=[1] * 10, seed=123)

# Streaming context with a 5-second batch interval
ssc = StreamingContext(sc, batchDuration=5)
# Stateful transformations require a checkpoint directory
ssc.checkpoint(directory="checkpoint")

# The queued RDDs are the input of the stream
lines = ssc.queueStream(rdds=rddQueue)

def updateFunc(newValues, runningCount):
    """Return the decremented running count for one key, or None.

    newValues:    the values that arrived for the key in the current batch
    runningCount: the count accumulated so far, or None if there is none yet

    The new values are added to the previous running count (0 when it is
    None) and the global `threshold` is subtracted from the total.  A result
    that is not positive is reported as None, which makes updateStateByKey
    drop the key from the state.
    """
    previous = 0 if runningCount is None else runningCount
    decremented = sum(newValues, previous) - threshold
    if decremented > 0:
        return decremented
    return None

# Per-batch word counts are folded into a (decremented) running count per word
running_counts = (
    lines.flatMap(lambda text: text.split(" "))
         .map(lambda token: (token, 1))
         .reduceByKey(lambda left, right: left + right)
         .updateStateByKey(updateFunc)
)

# Order every batch of running counts by count, largest first
counts_sorted = running_counts.transform(
    lambda batch: batch.sortBy(lambda pair: pair[1], False)
)

def printResults(rdd):
    """Report on one batch of sorted counts and choose the next threshold.

    Prints the number of tracked words, the first five records as
    (word, count, count + total_decrement), and the same two numbers for
    the word 'refinery' (0 when it is not tracked).

    It then updates the global `threshold`: when more than `k` words are
    tracked it becomes the count of the record at index k of the sorted RDD,
    otherwise it is reset to 0.  The threshold is added to the global
    `total_decrement`.

    rdd: pair RDD of (word, count) records sorted by count in descending
         order, as handed over by foreachRDD.
    """
    global threshold, total_decrement
    # Several actions run on this RDD below; cache it so it is not recomputed
    rdd.cache()
    distinct_words = rdd.count()
    # Single-argument print() calls give the same output on Python 2 and 3
    print("Total distinct words:  %d" % distinct_words)
    print(rdd.map(lambda x: (x[0], x[1], x[1]+total_decrement)).take(5))
    # lookup() returns a list of values that is empty when the key is missing
    matches = rdd.lookup('refinery')
    lower_bound = matches[0] if matches else 0
    print("refinery: %s , %s" % (lower_bound, lower_bound + total_decrement))
    if distinct_words > k:
        # Key every record by its position and fetch the count at position k
        threshold = rdd.zipWithIndex().map(lambda x: (x[1], x[0])).lookup(k)[0][1]
    else:
        # Was misspelled `threhold`, which left the global threshold unchanged
        threshold = 0
    print("Next threshold =  %s" % threshold)
    total_decrement += threshold
    rdd.unpersist()

# Report on every batch and let printResults pick the next threshold
counts_sorted.foreachRDD(printResults)

ssc.start()
try:
    ssc.awaitTermination(50)  # Block for at most 50 seconds
finally:
    # Stop the StreamingContext (not the SparkContext) even if the wait fails
    # or is interrupted, so that later cells can create a new context.
    ssc.stop(False)
print("Finished")

Total distinct words:  51396
[(u'other', 7673, 7673), (u'first', 5348, 5348), (u'many', 4890, 4890), (u'new', 3206, 3206), (u'system', 2596, 2596)]
refinery: 2 , 2
Next threshold =  5
Total distinct words:  13974
[(u'other', 15496, 15501), (u'first', 10643, 10648), (u'many', 9724, 9729), (u'new', 6339, 6344), (u'system', 5130, 5135)]
refinery: 2 , 7
Next threshold =  5
Total distinct words:  12157
[(u'other', 22984, 22994), (u'first', 15870, 15880), (u'many', 14537, 14547), (u'new', 9458, 9468), (u'system', 7733, 7743)]
refinery: 1 , 11
Next threshold =  4
Total distinct words:  12389
[(u'other', 30615, 30629), (u'first', 21259, 21273), (u'many', 19308, 19322), (u'new', 12531, 12545), (u'system', 10212, 10226)]
refinery: 0 , 14
Next threshold =  5
Total distinct words:  11657
[(u'other', 38331, 38350), (u'first', 26520, 26539), (u'many', 24251, 24270), (u'new', 15600, 15619), (u'system', 12839, 12858)]
refinery: 0 , 19
Next threshold =  5
Total distinct words:  11364
[(u'other', 45841,

In [4]:
from pyspark.streaming import StreamingContext

# Create a queue of 5 RDDs; the i-th RDD holds five copies of i
rddQueue = []
for i in range(5):
    rdd = sc.parallelize([i, i, i, i, i])
    rddQueue.append(rdd)

# Create a StreamingContext with a batch interval of 3 seconds
ssc = StreamingContext(sc, 3)

# A checkpoint directory is set because the windowed reduction below uses an
# inverse reduce function
ssc.checkpoint("checkpoint")

# Feed the RDD queue to a DStream
nums = ssc.queueStream(rddQueue)

# Compute the sum over a sliding window of 9 seconds, every 3 seconds.
# The inverse function (x - y) is passed as the second argument; the variant
# without an inverse function would be:
#   nums.reduceByWindow(lambda x, y: x + y, None, 9, 3)
slidingSum = nums.reduceByWindow(lambda x, y: x + y, lambda x, y: x - y, 9, 3)

slidingSum.pprint()

ssc.start()  # Start the computation
try:
    ssc.awaitTermination(24)  # Block for at most 24 seconds
finally:
    # Stop the StreamingContext (not the SparkContext) even if the wait fails
    # or is interrupted, so that later cells can create a new context.
    ssc.stop(False)
# Single-argument print() behaves identically on Python 2 and Python 3
print("Finished")

-------------------------------------------
Time: 2017-11-13 21:41:03
-------------------------------------------
0

-------------------------------------------
Time: 2017-11-13 21:41:06
-------------------------------------------
5

-------------------------------------------
Time: 2017-11-13 21:41:09
-------------------------------------------
15

-------------------------------------------
Time: 2017-11-13 21:41:12
-------------------------------------------
30

-------------------------------------------
Time: 2017-11-13 21:41:15
-------------------------------------------
45

-------------------------------------------
Time: 2017-11-13 21:41:18
-------------------------------------------
35

-------------------------------------------
Time: 2017-11-13 21:41:21
-------------------------------------------
20

-------------------------------------------
Time: 2017-11-13 21:41:24
-------------------------------------------

Finished


In [3]:
from pyspark.sql.functions import *


# Streaming DataFrame of text lines read from a socket on localhost:9999;
# includeTimestamp adds a `timestamp` column next to `value`
lines = spark\
        .readStream\
        .format('socket')\
        .option('host', 'localhost')\
        .option('port', '9999')\
        .option('includeTimestamp', 'true')\
        .load()

# Split the lines into words, retaining timestamps
# split() splits each line into an array, and explode() turns the array into multiple rows
words = lines.select(explode(split(lines.value, ' ')).alias('word'),
                     lines.timestamp)

# Running count per word
word_counts = words.groupBy('word').count()

# Start running the query: write the complete result table to the console,
# triggering every 5 seconds
query = word_counts\
        .writeStream\
        .outputMode('complete')\
        .format('console')\
        .option('truncate', 'false')\
        .trigger(processingTime='5 seconds') \
        .start()

try:
    query.awaitTermination(25)  # Block for at most 25 seconds
finally:
    # Stop the query even if the wait fails or is interrupted, so that it
    # does not keep running in the background
    query.stop()
# Single-argument print() behaves identically on Python 2 and Python 3
print("Finished")

Finished


In [10]:
from pyspark.sql.functions import *

# Streaming DataFrame of text lines read from a socket on localhost:9999;
# includeTimestamp adds a `timestamp` column next to `value`
lines = spark\
        .readStream\
        .format('socket')\
        .option('host', 'localhost')\
        .option('port', '9999')\
        .option('includeTimestamp', 'true')\
        .load()

# Split the lines into words, retaining timestamps
# split() splits each line into an array, and explode() turns the array into multiple rows
words = lines.select(explode(split(lines.value, ' ')).alias('word'),
                     lines.timestamp)

# Keep only the words with at least 3 characters
long_words = words.filter(length(words['word'])>=3)

# Start running the query: append the new rows to the console,
# triggering every 5 seconds
query = long_words\
        .writeStream\
        .outputMode('append')\
        .format('console')\
        .option('truncate', 'false')\
        .trigger(processingTime='5 seconds') \
        .start()

try:
    query.awaitTermination(25)  # Block for at most 25 seconds
finally:
    # Stop the query even if the wait fails or is interrupted, so that it
    # does not keep running in the background
    query.stop()
# Single-argument print() behaves identically on Python 2 and Python 3
print("Finished")

Finished


In [None]:
from pyspark.sql.functions import *

# Streaming DataFrame of text lines read from a socket on localhost:9999;
# includeTimestamp adds a `timestamp` column next to `value`
lines = spark\
        .readStream\
        .format('socket')\
        .option('host', 'localhost')\
        .option('port', '9999')\
        .option('includeTimestamp', 'true')\
        .load()

# Split the lines into words, retaining timestamps
# split() splits each line into an array, and explode() turns the array into multiple rows
words = lines.select(explode(split(lines.value, ' ')).alias('word'),
                     lines.timestamp)

# Count each word per 10-second window, with windows sliding every 5 seconds
windowedCounts = words.groupBy(
    window(words.timestamp, "10 seconds", "5 seconds"),
    words.word)\
    .count()

# Start running the query: write the complete result table to the console,
# triggering every 5 seconds
query = windowedCounts\
        .writeStream\
        .outputMode('complete')\
        .format('console')\
        .option('truncate', 'false')\
        .trigger(processingTime='5 seconds') \
        .start()

try:
    query.awaitTermination(25)  # Block for at most 25 seconds
finally:
    # Stop the query even if the wait fails or is interrupted, so that it
    # does not keep running in the background
    query.stop()
# Single-argument print() behaves identically on Python 2 and Python 3
print("Finished")