In [1]:
from pyspark.sql import SparkSession
from graphframes import *

In [6]:
# Path to the GraphFrames jar. The same file is registered through the
# "spark.jars" config and added with addPyFile below, so it is defined once
# here to keep the two uses from drifting apart.
# NOTE(review): this is a machine-specific absolute path; adjust it for your setup.
GRAPHFRAMES_JAR = "/home/eric/src/graphframes/graphframes-0.8.2-spark3.2-s_2.12.jar"

# Build (or reuse) a local Spark session with the GraphFrames jar attached.
spark = (
    SparkSession.builder
    .master("local")
    .appName("graphframes")
    .config("spark.jars", GRAPHFRAMES_JAR)
    .getOrCreate()
)

spark.sparkContext.addPyFile(GRAPHFRAMES_JAR)

# Last expression of the cell: display the session.
spark

In [5]:
# NOTE(review): this stops the session created in the cell above, while the
# cells below still use `spark`. The execution counts show it was run as
# In [5], i.e. before the session cell (In [6]); run top to bottom, the later
# cells would presumably fail. Consider moving this to the end of the notebook.
spark.stop()

In [8]:
# Create a Vertex DataFrame with an "id" column
v = spark.createDataFrame(
    [
        ("a", "Alice", 34),
        ("b", "Bob", 36),
        ("c", "Charlie", 30),
    ],
    ["id", "name", "age"],
)

# Create an Edge DataFrame with "src" and "dst" columns
e = spark.createDataFrame(
    [
        ("a", "b", "friend"),
        ("b", "c", "follow"),
        ("c", "b", "follow"),
    ],
    ["src", "dst", "relationship"],
)

# Create a GraphFrame (GraphFrame comes from the graphframes import in the
# first cell, so it is not re-imported here).
g = GraphFrame(v, e)

# Query: Get in-degree of each vertex.
g.inDegrees.show()

# Query: Count the number of "follow" connections in the graph.
# The count is not the last expression of the cell, so it has to be printed
# explicitly; otherwise the value is computed and silently discarded.
follow_count = g.edges.filter("relationship = 'follow'").count()
print("Number of 'follow' connections: " + str(follow_count))

# Run PageRank algorithm, and show results.
results = g.pageRank(resetProbability=0.01, maxIter=20)
results.vertices.select("id", "pagerank").show()

                                                                                

+---+--------+
| id|inDegree|
+---+--------+
|  c|       1|
|  b|       2|
+---+--------+





+---+------------------+
| id|          pagerank|
+---+------------------+
|  c|1.8994109890559092|
|  b|1.0905890109440908|
|  a|              0.01|
+---+------------------+



                                                                                

In [12]:
# Read the raw station and trip CSV files; the "header" option makes the
# first row of each file supply the column names.
bikeStations = (
    spark.read
    .option("header", "true")
    .csv("data/201508_station_data.csv")
)
tripData = (
    spark.read
    .option("header", "true")
    .csv("data/201508_trip_data.csv")
)

# Shape the data for a GraphFrame: the station name becomes the vertex "id"
# (duplicate rows dropped), and each trip's start/end station become the
# edge "src"/"dst" columns.
stationVertices = (
    bikeStations
    .withColumnRenamed("name", "id")
    .distinct()
)
tripEdges = (
    tripData
    .withColumnRenamed("Start Station", "src")
    .withColumnRenamed("End Station", "dst")
)

In [13]:
# Build the bike-share graph: stations are the vertices, trips are the edges.
stationGraph = GraphFrame(stationVertices, tripEdges)
# Request caching of the graph; as the last expression of the cell, the value
# returned by cache() is what gets displayed (the GraphFrame summary).
stationGraph.cache()

GraphFrame(v:[id: string, station_id: string ... 5 more fields], e:[src: string, dst: string ... 9 more fields])

In [15]:
# Sanity check: report the vertex and edge counts of the graph next to the
# row count of the original trip data.
print(f"Total Number of Stations: {stationGraph.vertices.count()}")
print(f"Total Number of Trips in Graph: {stationGraph.edges.count()}")
print(f"Total Number of Trips in Original Data: {tripData.count()}")

                                                                                

Total Number of Stations: 70


                                                                                

Total Number of Trips in Graph: 354152
Total Number of Trips in Original Data: 354152


### Find all the unique paths per bike and count how many bikes traversed each exact path