In [1]:
%scala
// Load the station and trip CSV exports. The "header" option is set to "true" for both reads,
// so the first row of each file supplies the column names. No schema is supplied here.
val bikeStations = {
  val stationReader = spark.read.option("header", "true")
  stationReader.csv("/FileStore/tables/ltn1r4x11497653150836/201508_station_data.csv")
}
val tripData = {
  val tripReader = spark.read.option("header", "true")
  tripReader.csv("/FileStore/tables/ltn1r4x11497653150836/201508_trip_data.csv")
}

In [2]:
# Load the station and trip CSV exports. The "header" option is set to "true" for both reads,
# so the first row of each file supplies the column names. No schema is supplied here.
bikeStations = (
    spark.read
    .option("header", "true")
    .csv("/FileStore/tables/ltn1r4x11497653150836/201508_station_data.csv")
)
tripData = (
    spark.read
    .option("header", "true")
    .csv("/FileStore/tables/ltn1r4x11497653150836/201508_trip_data.csv")
)

In [3]:
%scala
// Reshape the raw tables for the graph: stations become vertices keyed by "id" (duplicates
// dropped), and trips become edges whose endpoints are the "src" and "dst" columns.
val stationVertices = bikeStations.withColumnRenamed("name", "id").distinct()
val tripEdges = {
  val withSource = tripData.withColumnRenamed("Start Station", "src")
  withSource.withColumnRenamed("End Station", "dst")
}

In [4]:
# Reshape the raw tables for the graph: stations become vertices keyed by "id" (duplicates
# dropped), and trips become edges whose endpoints are the "src" and "dst" columns.
stationVertices = (
    bikeStations
    .withColumnRenamed("name", "id")
    .distinct()
)
tripEdges = (
    tripData
    .withColumnRenamed("Start Station", "src")
    .withColumnRenamed("End Station", "dst")
)

In [5]:
%scala
import org.graphframes.GraphFrame
// Build the graph from the station vertices and trip edges prepared in the earlier cell.
val stationGraph = GraphFrame(stationVertices, tripEdges)
// Mark both input DataFrames for caching; the cells that follow query the graph repeatedly.
tripEdges.cache()
stationVertices.cache()

In [6]:
from graphframes import GraphFrame
# Build the graph from the station vertices and trip edges prepared in the earlier cell.
stationGraph = GraphFrame(stationVertices, tripEdges)
# Mark both input DataFrames for caching; the cells that follow query the graph repeatedly.
tripEdges.cache()
stationVertices.cache()

In [7]:
# Sanity-check the graph against the raw data by printing the number of vertices, the number
# of edges, and the number of rows in the original trip table.
# `count` is a method in PySpark and must be called; a bare `.count` only references the bound
# method and counts nothing. Each value is printed explicitly because a notebook cell echoes
# only its final expression.
print("Total Number of Stations: " + str(stationGraph.vertices.count()))
print("Total Number of Trips in Graph: " + str(stationGraph.edges.count()))
print("Total Number of Trips in Original Data: " + str(tripData.count()))

In [8]:
%scala
import org.apache.spark.sql.functions.desc
// Count trips for every (src, dst) station pair and print the ten pairs with the most trips.
// `locally` keeps the intermediate value out of the notebook's top-level namespace.
locally {
  val tripsPerRoute = stationGraph.edges.groupBy("src", "dst").count()
  tripsPerRoute.orderBy(desc("count")).show(10)
}

In [9]:
from pyspark.sql.functions import desc
# Count trips for every (src, dst) station pair and print the ten pairs with the most trips.
(
    stationGraph.edges
    .groupBy("src", "dst")
    .count()
    .orderBy(desc("count"))
    .show(10)
)

In [10]:
%scala
// Restrict the edges to trips that start or end at 'Townsend at 7th', then count trips per
// (src, dst) pair and print the ten most frequent pairs.
// `desc` must already be in scope (imported from org.apache.spark.sql.functions).
locally {
  val townsendTrips =
    stationGraph.edges.where("src = 'Townsend at 7th' OR dst = 'Townsend at 7th'")
  townsendTrips.groupBy("src", "dst").count().orderBy(desc("count")).show(10)
}

In [11]:
# Restrict the edges to trips that start or end at 'Townsend at 7th', then count trips per
# (src, dst) pair and print the ten most frequent pairs.
# `desc` must already be imported from pyspark.sql.functions.
(
    stationGraph.edges
    .where("src = 'Townsend at 7th' OR dst = 'Townsend at 7th'")
    .groupBy("src", "dst")
    .count()
    .orderBy(desc("count"))
    .show(10)
)

In [12]:
%scala
// Build a subgraph that keeps every station vertex but only the trips that start or end at
// 'Townsend at 7th'.
val townAnd7thEdges =
  stationGraph.edges.where("src = 'Townsend at 7th' OR dst = 'Townsend at 7th'")
val subgraph = GraphFrame(stationGraph.vertices, townAnd7thEdges)

In [13]:
# Build a subgraph that keeps every station vertex but only the trips that start or end at
# 'Townsend at 7th'.
townAnd7thEdges = (
    stationGraph.edges
    .where("src = 'Townsend at 7th' OR dst = 'Townsend at 7th'")
)
subgraph = GraphFrame(stationGraph.vertices, townAnd7thEdges)

In [14]:
%scala
// Run PageRank over the station graph (reset probability 0.15, 10 iterations) and print the
// ten stations with the highest "pagerank" value.
val ranks = {
  val pageRankJob = stationGraph.pageRank.resetProbability(0.15).maxIter(10)
  pageRankJob.run()
}
ranks.vertices.orderBy(desc("pagerank")).select("id", "pagerank").show(10)

In [15]:
# Run PageRank over the station graph (reset probability 0.15, 10 iterations) and print the
# ten stations with the highest "pagerank" value.
ranks = stationGraph.pageRank(
    resetProbability=0.15,
    maxIter=10,
)
(
    ranks.vertices
    .orderBy(desc("pagerank"))
    .select("id", "pagerank")
    .show(10)
)

In [16]:
%scala
// In-degree per station (number of edges whose dst is that station); print the five largest
// without truncating long station names.
val inDeg = stationGraph.inDegrees
inDeg
  .orderBy(desc("inDegree"))
  .show(numRows = 5, truncate = false)

In [17]:
%python
# In-degree per station (number of edges whose dst is that station); print the five largest
# without truncating long station names.
inDeg = stationGraph.inDegrees
(
    inDeg
    .orderBy(desc("inDegree"))
    .show(n=5, truncate=False)
)

In [18]:
%scala
// Out-degree per station (number of edges whose src is that station); print the five largest
// without truncating long station names.
val outDeg = stationGraph.outDegrees
outDeg
  .orderBy(desc("outDegree"))
  .show(numRows = 5, truncate = false)

In [19]:
# Out-degree per station (number of edges whose src is that station); print the five largest
# without truncating long station names.
outDeg = stationGraph.outDegrees
(
    outDeg
    .orderBy(desc("outDegree"))
    .show(n=5, truncate=False)
)

In [20]:
%scala
// Ratio of in-degree to out-degree per station. A high ratio means more trips end at the
// station than start there; a low ratio means the opposite.
// Requires `inDeg` and `outDeg` from the earlier cells.
val degreeRatio = {
  val inAndOut = inDeg.join(outDeg, Seq("id"))
  inAndOut.selectExpr("id", "double(inDegree)/double(outDegree) as degreeRatio")
}
// Ten highest ratios first, then the ten lowest.
degreeRatio.orderBy(desc("degreeRatio")).show(numRows = 10, truncate = false)
degreeRatio.orderBy("degreeRatio").show(numRows = 10, truncate = false)

In [21]:
# Ratio of in-degree to out-degree per station. A high ratio means more trips end at the
# station than start there; a low ratio means the opposite.
# Requires `inDeg` and `outDeg` from the earlier cells.
degreeRatio = (
    inDeg
    .join(outDeg, "id")
    .selectExpr("id", "double(inDegree)/double(outDegree) as degreeRatio")
)
# Ten highest ratios first, then the ten lowest.
(
    degreeRatio
    .orderBy(desc("degreeRatio"))
    .show(n=10, truncate=False)
)
(
    degreeRatio
    .orderBy("degreeRatio")
    .show(n=10, truncate=False)
)

In [22]:
%scala
// Breadth-first search from 'Townsend at 7th' to 'Redwood City Medical Center', giving up on
// paths longer than 4 edges.
val bfsResult = {
  val search = stationGraph.bfs
    .fromExpr("id = 'Townsend at 7th'")
    .toExpr("id = 'Redwood City Medical Center'")
    .maxPathLength(4)
  search.run()
}

In [23]:
# Breadth-first search from 'Townsend at 7th' to 'Redwood City Medical Center', giving up on
# paths longer than 4 edges; then print up to ten of the resulting rows.
bfsResult = stationGraph.bfs(
    fromExpr="id = 'Townsend at 7th'",
    toExpr="id = 'Redwood City Medical Center'",
    maxPathLength=4,
)
bfsResult.show(n=10)

In [24]:
%scala
// Set the directory Spark uses for checkpoints. This is done before the connected-components
// cell that follows; NOTE(review): that algorithm presumably needs it — confirm in GraphFrames docs.
spark.sparkContext.setCheckpointDir("/tmp/checkpoints")

In [25]:
%python
# Set the directory Spark uses for checkpoints. This is done before the connected-components
# cell that follows; NOTE(review): that algorithm presumably needs it — confirm in GraphFrames docs.
spark.sparkContext.setCheckpointDir("/tmp/checkpoints")

In [26]:
%scala
// Compute the connected components of the station graph.
// Declared with `val`: a bare `cc = ...` assignment does not compile in Scala because `cc`
// has not been declared anywhere before this cell.
val cc = stationGraph.connectedComponents.run()

In [27]:
%python
# Compute the connected components of the station graph and keep the result in `cc`.
cc = stationGraph.connectedComponents()

In [28]:
%scala
// Strongly connected components of the station graph, limited to 3 iterations.
val scc = stationGraph.stronglyConnectedComponents.maxIter(3).run()

In [29]:
%python
# Strongly connected components of the station graph, limited to 3 iterations, followed by
# a printout of how many rows fall into each "component" value.
scc = stationGraph.stronglyConnectedComponents(maxIter=3)
(
    scc
    .groupBy("component")
    .count()
    .show()
)

In [30]:
%scala
// Motif search for three-edge cycles: a trip a -> b, a trip b -> c, and a trip c -> a.
val motifs = stationGraph.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[ca]->(a)")

In [31]:
# Motif search for three-edge cycles: a trip a -> b, a trip b -> c, and a trip c -> a.
motifs = stationGraph.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[ca]->(a)")

In [32]:
%scala
import org.apache.spark.sql.functions.expr
// From the a -> b -> c -> a motifs, find the round trip made by a single bike with the
// smallest gap between the start of the first leg and the start of the last leg, and print
// its three stations plus the first leg's start date and the last leg's end date.
// NOTE(review): the station filters below check a != b and b != c but not a != c — confirm
// whether a self-loop on the last leg should be excluded as well.
motifs
// parse each leg's `Start Date` string ('MM/dd/yyyy HH:mm') into a timestamp column
.selectExpr("*", """
cast(unix_timestamp(ab.`Start Date`, 'MM/dd/yyyy HH:mm')
as timestamp) as abStart
""",
"""
cast(unix_timestamp(bc.`Start Date`, 'MM/dd/yyyy HH:mm')
as timestamp) as bcStart
""",
"""
cast(unix_timestamp(ca.`Start Date`, 'MM/dd/yyyy HH:mm')
as timestamp) as caStart
""")
// all three legs must have the same `Bike #` value
.where("ca.`Bike #` = bc.`Bike #`")
.where("ab.`Bike #` = bc.`Bike #`")
// consecutive stations must differ
.where("a.id != b.id")
.where("b.id != c.id")
// the legs must start in order: ab, then bc, then ca
.where("abStart < bcStart")
.where("bcStart < caStart")
// sort ascending by seconds between the first and last leg's start, then keep one row
.orderBy(expr("cast(caStart as long) - cast(abStart as long)"))
.selectExpr("a.id", "b.id", "c.id",
"ab.`Start Date`", "ca.`End Date`")
.limit(1)
.show(false)

In [33]:

# `expr` is used in the orderBy below; import it here because only `desc` is imported from
# pyspark.sql.functions earlier in the notebook.
from pyspark.sql.functions import expr

# From the a -> b -> c -> a motifs, find the round trip made by a single bike with the
# smallest gap between the start of the first leg and the start of the last leg, and print
# its three stations plus the first leg's start date and the last leg's end date.
# The whole chain is wrapped in parentheses so that the leading-dot continuation lines and
# the interleaved comments are valid Python; without them the cell is a syntax error.
# NOTE(review): the station filters check a != b and b != c but not a != c — confirm
# whether a self-loop on the last leg should be excluded as well.
(
    motifs
    # parse each leg's `Start Date` string ('MM/dd/yyyy HH:mm') into a timestamp column
    .selectExpr(
        "*",
        """
cast(unix_timestamp(ab.`Start Date`, 'MM/dd/yyyy HH:mm')
as timestamp) as abStart
""",
        """
cast(unix_timestamp(bc.`Start Date`, 'MM/dd/yyyy HH:mm')
as timestamp) as bcStart
""",
        """
cast(unix_timestamp(ca.`Start Date`, 'MM/dd/yyyy HH:mm')
as timestamp) as caStart
""")
    # all three legs must have the same `Bike #` value
    .where("ca.`Bike #` = bc.`Bike #`")
    .where("ab.`Bike #` = bc.`Bike #`")
    # consecutive stations must differ
    .where("a.id != b.id")
    .where("b.id != c.id")
    # the legs must start in order: ab, then bc, then ca
    .where("abStart < bcStart")
    .where("bcStart < caStart")
    # sort ascending by seconds between the first and last leg's start, then keep one row
    .orderBy(expr("cast(caStart as long) - cast(abStart as long)"))
    .selectExpr("a.id", "b.id", "c.id",
                "ab.`Start Date`", "ca.`End Date`")
    .limit(1)
    .show(False)
)