In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Resolution, in dots per inch, that matplotlib uses when saving figures.
matplotlib.rcParams['savefig.dpi'] = 144

In [None]:
%matplotlib inline
import matplotlib
import seaborn as sns
# Set the DPI to an explicit value instead of doubling whatever is currently
# configured: the doubling compounds every time this cell is re-run, and it
# breaks when the current setting is the non-numeric 'figure' default used by
# newer matplotlib releases. 288 is 2 * 144, i.e. the value the original
# doubling produced after the preceding cell had set 144.
matplotlib.rcParams['savefig.dpi'] = 288

# Tweet mini case study
<!-- requirement: small_data/tweets -->

This sample code goes through the streaming tweets you collected in `Spark_Streaming.ipynb` and shows how to import the JSON data into RDDs and DataFrames, then do some rudimentary analysis.

There are some sample tweets pre-loaded in `small_data/tweets/preloaded/` so that this code works even if you have not run the previous notebook. You may delete them once you have collected your own data.

In [None]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

from datetime import datetime
import json

In [None]:
import os
def localpath(path):
    """Return a ``file://`` URI for ``path`` under the current working directory.

    ``path`` is appended verbatim after the absolute working directory and a
    ``/`` separator; no normalisation or escaping is applied.
    """
    working_dir = str(os.path.abspath(os.path.curdir))
    return ''.join(('file://', working_dir, '/', path))

In [None]:
# Start a SparkContext with master "local[*]" and application name "demo".
sc = SparkContext("local[*]", "demo")
# Single-argument print(...) behaves the same on Python 2 and is valid on Python 3.
print(sc.version)

In [None]:
# Wrap the SparkContext in a SQLContext; it is used below to read JSON into a
# DataFrame and to run SQL queries.
sqlContext = SQLContext(sc)

In [None]:
# Build an RDD of raw text lines from every part file under small_data/tweets/,
# asking Spark for at least 100 partitions.
jsonRDD = sc.textFile(
    localpath("small_data/tweets/*/part*"),
    minPartitions=100,
)

In [None]:
# Experiment with changing the number of partitions. You can also use transformations like `repartition` or `coalesce`.
# Single-argument print(...) behaves the same on Python 2 and is valid on Python 3.
print(jsonRDD.getNumPartitions())

In [None]:
# Open up the UI on port 4040 in another tab
# Single-argument print(...) behaves the same on Python 2 and is valid on Python 3.
print(jsonRDD.count())

In [None]:
# Pull the first five raw lines to the driver and inspect the first one.
samples = jsonRDD.take(5)
# Single-argument print(...) behaves the same on Python 2 and is valid on Python 3.
print(type(samples[0]))
print(samples[0])

In [None]:
# Parse the first raw line as JSON and show the resulting Python type.
json_sample = json.loads(samples[0])
# Single-argument print(...) behaves the same on Python 2 and is valid on Python 3.
print(type(json_sample))

In [None]:
# Pretty-print the parsed tweet with sorted keys so its structure is easy to scan.
# Single-argument print(...) behaves the same on Python 2 and is valid on Python 3.
print(json.dumps(json_sample, indent=4, sort_keys=True))

In [None]:
# Individual fields of the parsed tweet are accessed by key.
# Single-argument print(...) behaves the same on Python 2 and is valid on Python 3.
print(json_sample["text"])
print(json_sample["createdAt"])

Let's look at how we can access individual elements across the entire dataset!

## Spark SQL and DataFrames - a convenient abstraction

In [None]:
# Create a DataFrame from the JSON part files on disk, then register it as a
# temporary table named "tweets" so it can be queried with sqlContext.sql(...).
raw_df = sqlContext.read.json(localpath("small_data/tweets/*/part-*"))
raw_df.registerTempTable("tweets")

In [None]:
# Show the text and favourite flag of five tweets whose author has more than
# 50 followers.
(raw_df
    .filter(raw_df["user"]["followersCount"] > 50)
    .select(["text", "isFavorited"])
    .show(5))

In [None]:
# Count tweets per user language and keep the 25 most common languages.
# The adjacent string literals concatenate into a single SQL statement.
df = sqlContext.sql(
    "SELECT user.lang, COUNT(*) as cnt "
    "FROM tweets "
    "GROUP BY user.lang "
    "ORDER BY cnt DESC "
    "LIMIT 25"
)
df.show()

For more complicated operations, pre-defined or user-defined functions may be necessary. You can always drop down to the RDD level for more granular manipulation.

## Caching and persistence - the key to Spark's speed

In [None]:
# Parse every line as JSON, pair each parsed tweet with its "createdAt" string,
# convert that string to a datetime, and mark the result for caching so that
# later actions can reuse it.
timestamps = (
    jsonRDD
    .map(json.loads)
    .map(lambda tweet: (tweet, tweet["createdAt"]))
    .mapValues(lambda created: datetime.strptime(created, "%b %d, %Y %I:%M:%S %p"))
    .cache()
)

In [None]:
%%timeit -r1 -n1
# Single-argument print(...) behaves the same on Python 2 and is valid on Python 3.
print(timestamps.count())

In [None]:
%%timeit -r1 -n1
# Single-argument print(...) behaves the same on Python 2 and is valid on Python 3.
print(timestamps.count())

In [None]:
# Peek at one (parsed tweet, datetime) pair.
timestamps.take(1)

*Note:* Many common transformations work across Spark: on DStreams, DataFrames, and RDDs.

In [None]:
# Count the tweets created at minute 35; element 1 of each pair is the datetime.
timestamps.filter(lambda pair: pair[1].minute == 35).count()

In [None]:
# A bit easier to read: drop the tweet and filter on the datetime values alone.
# (Tuple-unpacking lambdas such as `lambda (blob, time): ...` are Python-2-only
# syntax; `values()` gives the same count and runs on Python 2 and 3.)
timestamps.values().filter(lambda time: time.minute == 35).count()

In [None]:
def string_to_boolean_tuple(target, string):
    """Return ``(hit, 1)`` where ``hit`` is 1 if ``target in string``, else 0.

    The constant second element lets pairs be summed element-wise to obtain
    (number of hits, number of items). For ``str`` inputs the membership test
    is a substring test.
    """
    hit = 1 if target in string else 0
    return (hit, 1)

# For each second-of-the-minute, compute the fraction of tweets whose text
# contains "RT".
#   1. key each tweet by the `second` of its timestamp, with a (hit, 1) pair
#   2. sum the pairs per key -> (number of hits, number of tweets)
#   3. divide to get the fraction
# Indexing replaces the Python-2-only tuple-unpacking lambdas, so this runs on
# Python 2 and 3. The collected pairs are sorted by second because collect()
# returns keys in no particular order, which would make a line plot zigzag.
plot_data = sorted(
    timestamps
    .map(lambda pair: (pair[1].second, string_to_boolean_tuple("RT", pair[0]["text"])))
    .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
    .mapValues(lambda counts: 1.0 * counts[0] / counts[1])
    .collect()
)

*Note:* Scala makes helper functions like the above easier to write inline, which helps with code readability and succinctness. The Python API has less freedom in this regard.

In [None]:
# Inspect the collected result, then split the (x, y) pairs into two lists for plotting.
# Single-argument print(...) behaves the same on Python 2 and is valid on Python 3.
print(type(plot_data))
print(len(plot_data))
print(plot_data[0])
x_data = [tup[0] for tup in plot_data]
y_data = [tup[1] for tup in plot_data]

In [None]:
# Use an explicit figure/axes pair and label both axes so the chart can be
# read on its own. `plt` is the explicitly imported matplotlib.pyplot, rather
# than relying on the submodule having been loaded as a side effect.
fig, ax = plt.subplots()
ax.plot(x_data, y_data)
ax.set_xlabel("Second within the minute")
ax.set_ylabel('Fraction of tweets containing "RT"')
plt.show()

**Exercise**: Gather a larger sample of tweets using, e.g., the template in the Spark Streaming notebook, and apply the above analysis on longer time scales.

*Extension:* Perform the same analysis, but directly on the DStream. Note the difference in the Streaming UI.

In [None]:
# Stop the SparkContext created earlier in this notebook.
sc.stop()

*Copyright &copy; 2016 The Data Incubator.  All rights reserved.*