As mentioned, SparkContext is the main entry point through which all other Spark functionality is accessed. To work with streams, we pass this object, together with a batch interval (10 seconds in this example), to the StreamingContext constructor. This gives us our own streaming context:

In [1]:
from pyspark import SparkContext # spark
from pyspark.streaming import StreamingContext # spark streaming
# Create a local StreamingContext that uses every available core and a batch interval of 10 seconds.
batch_interval = 10  # seconds

# local[*]: run Spark locally with as many worker threads as logical cores on your machine.
# Receiver-based input streams (such as a socket stream) permanently occupy one thread,
# so "local[1]" would leave no thread free to process the received batches.
sc = SparkContext(master="local[*]", appName="WordCountApp")

# Group the incoming data into batches of `batch_interval` seconds.
ssc = StreamingContext(sc, batch_interval)

In [2]:
# Connect to the TCP text source at host:port; the resulting DStream is bound to `lines`.
host, port = "localhost", 9999
lines = ssc.socketTextStream(hostname=host, port=port)

In [3]:
# Tokenise: break every incoming line on single spaces.
words = lines.flatMap(lambda text: text.split(" "))
# Note: this prints the raw `lines` stream, not the split `words` stream.
lines.pprint()

In [4]:
# Pair every word with an initial count of 1 ...
pairs = words.map(lambda token: (token, 1))
# ... then add the counts together per distinct word within each batch.
wordCounts = pairs.reduceByKey(lambda left, right: left + right)

# Show the first ten elements of every RDD generated in this DStream on the console.
wordCounts.pprint()

In [5]:
# Load the synopsis file as an RDD of lines, then split every line on single spaces.
ds1 = sc.textFile("./unit_synopsis.txt")
words = ds1.flatMap(lambda text: text.split(" "))
# Preview the first 20 words.
words.take(20)

['FIT9131',
 'This',
 'unit',
 'aims',
 'to',
 'provide',
 'students',
 'with',
 'the',
 'basic',
 'concepts',
 'involved',
 'in',
 'the',
 'development',
 'of',
 'well',
 'structured',
 'software',
 'using']

In [6]:
# Words to drop before counting. Matching is exact and case-sensitive,
# so a word such as 'This' is NOT removed by the entry 'this'.
stopwords = [
    'a', 'this', 'to', 'as', 'such', 'the', 'The', 'of',
    'using', 'on', 'in', 'It', 'with', 'and', 'or',
]
words_without_stopwords = words.filter(lambda token: not (token in stopwords))
words_without_stopwords.take(50)
# Seed every remaining word with a count of 1, ready for a per-key reduction.
pairs = words_without_stopwords.map(lambda token: (token, 1))

In [7]:
from operator import add
# Add up the per-word 1s to get each word's total, then preview ten (word, count) pairs.
word_counts = pairs.reduceByKey(func=add)
word_counts.take(10)

[('FIT9131', 1),
 ('This', 3),
 ('unit', 4),
 ('aims', 1),
 ('provide', 1),
 ('students', 1),
 ('basic', 1),
 ('concepts', 1),
 ('involved', 1),
 ('development', 3)]

## The actual streaming application starts here

In [12]:
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    # Local import so this cell stays self-contained; SparkConf carries the app name.
    from pyspark import SparkConf

    # Reuse the running SparkContext if there is one (this avoids the error
    # "Cannot run multiple SparkContexts at once"); otherwise create a new one
    # named "WordCountApp". getOrCreate() never returns None, so the configuration
    # is handed over directly instead of testing the result for None afterwards.
    sc = SparkContext.getOrCreate(SparkConf().setAppName("WordCountApp"))
    ssc = StreamingContext(sc, 2)  # batch interval of 2 seconds

    host = "localhost"
    port = 9999

    lines = ssc.socketTextStream(host, port)

    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))

    # Count each word in each batch
    pairs = words.map(lambda word: (word, 1))
    wordCounts = pairs.reduceByKey(lambda x, y: x + y)

    # Print the result
    wordCounts.pprint()

    ssc.start()
    try:
        # Let the stream run for at most 60 seconds.
        ssc.awaitTermination(timeout=60)
    except KeyboardInterrupt:
        # Interrupting the run is an expected way to end it early;
        # the shutdown itself happens in the finally clause.
        pass
    finally:
        # Always release the streaming and Spark contexts exactly once, whether the
        # run timed out, was interrupted, or failed with an error.
        ssc.stop()
        sc.stop()

-------------------------------------------
Time: 2018-05-07 14:52:34
-------------------------------------------

-------------------------------------------
Time: 2018-05-07 14:52:36
-------------------------------------------

-------------------------------------------
Time: 2018-05-07 14:52:38
-------------------------------------------

-------------------------------------------
Time: 2018-05-07 14:52:40
-------------------------------------------

-------------------------------------------
Time: 2018-05-07 14:52:42
-------------------------------------------
('1', 1)

-------------------------------------------
Time: 2018-05-07 14:52:44
-------------------------------------------

-------------------------------------------
Time: 2018-05-07 14:52:46
-------------------------------------------

-------------------------------------------
Time: 2018-05-07 14:52:48
-------------------------------------------

-------------------------------------------
Time: 2018-05-07 14:52:50
-

In [13]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Fold one batch of counts for a key into that key's running total.
def updateFunc(new_values, prev_running_count):
    """Return the updated running count for a single key.

    new_values: the counts that arrived for the key in the current batch.
    prev_running_count: the total so far; any falsy value (such as None)
        is treated as 0.
    """
    batch_total = sum(new_values)
    carried_over = prev_running_count if prev_running_count else 0
    return batch_total + carried_over
    
# Local import so this cell stays self-contained; SparkConf carries the app name.
from pyspark import SparkConf

# Reuse the running SparkContext if there is one (this avoids the error
# "Cannot run multiple SparkContexts at once"); otherwise create a new one
# named "WordCountApp". getOrCreate() never returns None, so the configuration
# is handed over directly instead of testing the result for None afterwards.
sc = SparkContext.getOrCreate(SparkConf().setAppName("WordCountApp"))
ssc = StreamingContext(sc, 2)  # batch interval of 2 seconds
# Stateful operations such as updateStateByKey need a checkpoint directory.
ssc.checkpoint("checkpoint")

host = "localhost"
port = 9999

lines = ssc.socketTextStream(host, port)

# Split each line into words
words = lines.flatMap(lambda line: line.split(" "))

# Keep a running count of each word across batches
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.updateStateByKey(updateFunc)

# Print the result
wordCounts.pprint()

ssc.start()
try:
    # Let the stream run for at most 60 seconds.
    ssc.awaitTermination(timeout=60)
except KeyboardInterrupt:
    # Interrupting the run is an expected way to end it early;
    # the shutdown itself happens in the finally clause.
    pass
finally:
    # Always release the streaming and Spark contexts exactly once, whether the
    # run timed out, was interrupted, or failed with an error.
    ssc.stop()
    sc.stop()

-------------------------------------------
Time: 2018-05-07 14:57:48
-------------------------------------------

-------------------------------------------
Time: 2018-05-07 14:57:50
-------------------------------------------

-------------------------------------------
Time: 2018-05-07 14:57:52
-------------------------------------------

-------------------------------------------
Time: 2018-05-07 14:57:54
-------------------------------------------

-------------------------------------------
Time: 2018-05-07 14:57:56
-------------------------------------------

-------------------------------------------
Time: 2018-05-07 14:57:58
-------------------------------------------

-------------------------------------------
Time: 2018-05-07 14:58:00
-------------------------------------------

-------------------------------------------
Time: 2018-05-07 14:58:02
-------------------------------------------

-------------------------------------------
Time: 2018-05-07 14:58:04
----------