As mentioned, SparkContext is the main entry point through which everything else in Spark is accessed. To set up streaming, we pass this object, together with a batch interval (10 seconds in this example), to the StreamingContext constructor. This gives us our own streaming context:

In [1]:
from pyspark import SparkContext # spark
from pyspark.streaming import StreamingContext # spark streaming
# Length, in seconds, of each batch handled by the streaming context.
batch_interval = 10

# "local[*]" runs Spark on this machine with as many worker threads as
# there are logical cores; "WordCountApp" is the application name.
sc = SparkContext("local[*]", "WordCountApp")

# Streaming context built on top of `sc`, using the batch interval above.
ssc = StreamingContext(sparkContext=sc, batchDuration=batch_interval)

In [11]:
# Address of the text source the stream connects to.
host, port = "localhost", 9999

# DStream fed by whatever arrives on host:port.
lines = ssc.socketTextStream(hostname=host, port=port)

In [3]:
# Print the leading elements of the raw input lines for each batch.
# NOTE(review): this prints `lines`, not the split `words` -- confirm intent.
lines.pprint()

# Break every incoming line into its space-separated words.
words = lines.flatMap(lambda text: text.split(" "))

In [5]:
# Tag every word with an initial count of 1 ...
pairs = words.map(lambda token: (token, 1))

# ... then add up the counts of identical words within each batch.
wordCounts = pairs.reduceByKey(lambda left, right: left + right)

# Show the first ten elements of every RDD this DStream generates.
wordCounts.pprint()

In [7]:
# Batch (non-streaming) word count over a local text file, using `sc`.
# NOTE: this cell rebinds `words` and `pairs`, which the streaming cells
# also assign; re-run those cells before reusing the names for streaming.
ds1 = sc.textFile("./unit_synopsis.txt")
ds1.take(5)  # peek at the first lines of the file

# Split on single spaces.
# NOTE(review): consecutive spaces produce empty-string tokens with split(" ").
words = ds1.flatMap(lambda line: line.split(" "))
words.take(10)  # peek at the first words

# Words to ignore; matching is case-sensitive.
stopwords = ['a', 'this', 'to', 'as', 'such', 'the', 'The', 'of', 'using', 'on', 'in', 'It', 'with', 'and', 'or']

# The filter runs once per word, so test membership against a set built once
# instead of scanning the list for every word. `stopwords` itself stays a list.
stopword_lookup = frozenset(stopwords)
words_without_stopwords = words.filter(lambda x: x not in stopword_lookup)
words_without_stopwords.take(50)  # peek at the filtered words

# Pair each remaining word with 1 and sum the pairs per word.
pairs = words_without_stopwords.map(lambda word: (word, 1))
word_counts = pairs.reduceByKey(lambda x, y: x + y)
word_counts.take(10)

[('FIT9131', 1),
 ('provide', 1),
 ('basic', 1),
 ('concepts', 1),
 ('development', 3),
 ('programming', 2),
 ('concentrates', 1),
 ('solving', 1),
 ('stages', 1),
 ('Students', 2)]