In [1]:
# Read the bzip2-compressed 2008 flight CSV through the spark-csv data source.
# The first row supplies the column names and column types are inferred.
f = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferSchema="true")
    .load("2008.csv.bz2")
)

In [3]:
from pyspark.sql.functions import *

# Mean departure delay and mean arrival delay for every flight number.
# The resulting columns are named "avg(DepDelay)" and "avg(ArrDelay)".
averagedelays = (
    f.groupBy("FlightNum")
    .agg(avg("DepDelay"), avg("ArrDelay"))
)

# Preview the first two aggregated rows.
averagedelays.show(n=2)

+---------+------------------+------------------+
|FlightNum|     avg(DepDelay)|     avg(ArrDelay)|
+---------+------------------+------------------+
|     1580| 5.152813067150635| 3.259569814072184|
|      463|14.447111111111111|12.069456812110419|
+---------+------------------+------------------+
only showing top 2 rows



## Plotting the data

In [4]:
import plotly.offline as py
from plotly.graph_objs import *
# Initialise plotly's offline mode so figures render inside the notebook.
py.init_notebook_mode()

# One histogram trace: the distribution of the mean departure delay per
# flight number, collected from Spark into a pandas column.
dep_delay_hist = Histogram(x=averagedelays.toPandas()["avg(DepDelay)"])
data = Data([dep_delay_hist])

In [5]:
# Render the traces currently held in `data` as an inline plotly figure.
py.iplot(data)

In the next example, a histogram with two series is plotted. For visibility, the bins are fixed so that there is room to plot the series side by side.

In [6]:
# Collect the aggregated result to pandas once and reuse it for both series.
# Each toPandas() call is a separate Spark action that collects the whole
# result to the driver, so calling it per trace repeats that work.
averagedelays_pd = averagedelays.toPandas()

# Both series share the same fixed bins (0 to 100 minutes, 1 minute wide) so
# that the bars of the two histograms line up side by side.
delay_bins = dict(start=0, end=100, size=1)

data = Data([
        Histogram(x=averagedelays_pd["avg(DepDelay)"],
                  xbins=dict(delay_bins)),
        Histogram(x=averagedelays_pd["avg(ArrDelay)"],
                  xbins=dict(delay_bins))
    ])
py.iplot(data)

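To add a derived column to a DataFrame, a UserDefinedFunction (UDF) is often needed: DataFrame columns are not evaluated straight away, so an ordinary Python function cannot be applied to them directly. A UDF is instead called with each element when the column is actually computed.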

In [7]:
from pyspark.sql.types import *

def _flight_label(flight_num):
    """Return the display label for a flight number, e.g. 'flight 1580'."""
    return 'flight ' + str(flight_num)


# Wrap the plain Python function as a Spark UDF that returns strings.
addFlight = udf(_flight_label, returnType=StringType())

# Add a "Flight" column holding the label derived from each FlightNum.
flightdelays = averagedelays.withColumn("Flight", addFlight('FlightNum'))


In [8]:
# Collect the Spark result to pandas exactly once. Separate toPandas() calls
# are separate Spark actions: besides repeating the work, Spark does not
# guarantee the same row order between them, so x, y, marker size and hover
# text taken from different calls could end up misaligned.
flightdelays_pd = flightdelays.toPandas()
dep_delay = flightdelays_pd["avg(DepDelay)"]
arr_delay = flightdelays_pd["avg(ArrDelay)"]

# Marker size grows with the mean arrival delay. Flights arriving more than
# 20 minutes early on average would get a negative size, which is not a valid
# marker size, so the size is clipped at 0.
marker_size = (2 + arr_delay / 10).clip(lower=0)

data = ([
    Scatter(x=dep_delay,
            y=arr_delay,
            marker=Marker(size=marker_size),
            text=flightdelays_pd["Flight"],  # hover label, e.g. "flight 1580"
            mode='markers'
    )])
py.iplot(data)