In [2]:
# Register the GraphFrames jar with the SparkContext
# (presumably required for the `graphframes` import in the next cell).
GRAPHFRAMES_JAR = "../graphframes-0.5.0-spark2.1-s_2.11.jar"
sc.addPyFile(GRAPHFRAMES_JAR)

In [3]:
from graphframes import *
from pyspark.sql.functions import *

In [5]:
# Vertices DataFrame: one row per person, identified by the `id` column
vertex_rows = [
    ("a", "Alice", 34),
    ("b", "Bob", 36),
    ("c", "Charlie", 37),
    ("d", "David", 29),
    ("e", "Esther", 32),
    ("f", "Fanny", 38),
    ("g", "Gabby", 60),
]
v = spark.createDataFrame(vertex_rows, ["id", "name", "age"])

# Edges DataFrame: one row per edge from `src` to `dst`, labelled with a relationship
edge_rows = [
    ("a", "b", "friend"),
    ("b", "c", "follow"),
    ("c", "b", "follow"),
    ("f", "c", "follow"),
    ("e", "f", "follow"),
    ("e", "d", "friend"),
    ("d", "a", "friend"),
    ("a", "e", "friend"),
    ("g", "e", "follow"),
]
e = spark.createDataFrame(edge_rows, ["src", "dst", "relationship"])

# Create a GraphFrame from the vertex and edge DataFrames
g = GraphFrame(v, e)

# Display both tables as seen through the graph
g.vertices.show()
g.edges.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 37|
|  d|  David| 29|
|  e| Esther| 32|
|  f|  Fanny| 38|
|  g|  Gabby| 60|
+---+-------+---+

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  b|  c|      follow|
|  c|  b|      follow|
|  f|  c|      follow|
|  e|  f|      follow|
|  e|  d|      friend|
|  d|  a|      friend|
|  a|  e|      friend|
|  g|  e|      follow|
+---+---+------------+



In [5]:
# g.vertices and g.edges are ordinary DataFrames,
# so any DataFrame API can be used on them.

edges_from_a = g.edges.filter("src = 'a'")
edges_from_a.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  a|  e|      friend|
+---+---+------------+



In [6]:
# Number of edges whose source is vertex 'a'
g.edges.where("src = 'a'").count()

2

In [7]:
# Count the number of followers of c.
# This queries the edge DataFrame.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(g.edges.filter("relationship = 'follow' and dst = 'c'").count())

2


In [8]:
# Besides vertices and edges, a GraphFrame exposes additional attributes,
# for example the out-degree of every vertex.

out_degrees = g.outDegrees
out_degrees.show()

+---+---------+
| id|outDegree|
+---+---------+
|  g|        1|
|  f|        1|
|  e|        2|
|  d|        1|
|  c|        1|
|  b|        1|
|  a|        2|
+---+---------+



In [9]:
# In-degree of every vertex that has at least one incoming edge
in_degrees = g.inDegrees
in_degrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|  f|       1|
|  e|       2|
|  d|       1|
|  c|       2|
|  b|       2|
|  a|       1|
+---+--------+



In [10]:
# Print the physical plan that produces inDegrees (non-extended form)
g.inDegrees.explain(extended=False)

== Physical Plan ==
*HashAggregate(keys=[dst#8], functions=[count(1)])
+- Exchange hashpartitioning(dst#8, 200)
   +- *HashAggregate(keys=[dst#8], functions=[partial_count(1)])
      +- *Project [dst#8]
         +- Scan ExistingRDD[src#7,dst#8,relationship#9]


In [11]:
# Rebuild the in-degree table by hand: count the edges per destination vertex,
# then rename the columns to the names used by g.inDegrees.
edges_per_dst = g.edges.groupBy('dst').count()
myInDegrees = (
    edges_per_dst
    .withColumnRenamed('dst', 'id')
    .withColumnRenamed('count', 'inDegree')
)
myInDegrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|  f|       1|
|  e|       2|
|  d|       1|
|  c|       2|
|  b|       2|
|  a|       1|
+---+--------+



In [12]:
# Physical plan of the hand-written version, for comparison with g.inDegrees
myInDegrees.explain(extended=False)

== Physical Plan ==
*HashAggregate(keys=[dst#8], functions=[count(1)])
+- Exchange hashpartitioning(dst#8, 200)
   +- *HashAggregate(keys=[dst#8], functions=[partial_count(1)])
      +- *Project [dst#8]
         +- Scan ExistingRDD[src#7,dst#8,relationship#9]


In [6]:
# Storage level of inDegrees before cache() is called on it.
# print(...) works on both Python 2 and Python 3.
print(g.inDegrees.storageLevel)

Serialized 1x Replicated


In [7]:
# Mark the in-degree DataFrame as cached; the cached DataFrame is the cell's output
in_degrees = g.inDegrees
in_degrees.cache()

DataFrame[id: string, inDegree: int]

In [8]:
# Storage level of inDegrees after cache() was called on it.
# print(...) works on both Python 2 and Python 3.
print(g.inDegrees.storageLevel)

Disk Memory Deserialized 1x Replicated


In [9]:
# Storage level of the vertex DataFrame before the graph itself is cached.
# print(...) works on both Python 2 and Python 3.
print(g.vertices.storageLevel)

Serialized 1x Replicated


In [11]:
# Cache the graph as a whole; afterwards both g.vertices and g.edges
# report a persisted storage level (printed in the following cell).
g.cache()

GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])

In [12]:
# Storage levels of both graph tables after g.cache().
# print(...) works on both Python 2 and Python 3.
print(g.vertices.storageLevel)
print(g.edges.storageLevel)

Disk Memory Deserialized 1x Replicated
Disk Memory Deserialized 1x Replicated


In [19]:
# A triplet view of the graph: every row holds an edge together with
# the full records of its source and destination vertices.

graph_triplets = g.triplets
graph_triplets.show()

+--------------+------------+--------------+
|           src|        edge|           dst|
+--------------+------------+--------------+
| [e,Esther,32]|[e,f,follow]|  [f,Fanny,38]|
|  [g,Gabby,60]|[g,e,follow]| [e,Esther,32]|
|  [a,Alice,34]|[a,e,friend]| [e,Esther,32]|
| [e,Esther,32]|[e,d,friend]|  [d,David,29]|
|  [f,Fanny,38]|[f,c,follow]|[c,Charlie,37]|
|    [b,Bob,36]|[b,c,follow]|[c,Charlie,37]|
|[c,Charlie,37]|[c,b,follow]|    [b,Bob,36]|
|  [a,Alice,34]|[a,b,friend]|    [b,Bob,36]|
|  [d,David,29]|[d,a,friend]|  [a,Alice,34]|
+--------------+------------+--------------+



### Motif Finding

In [17]:
# Search for pairs of vertices with edges in both directions between them.
# The motif is symmetric, so every such pair matches twice: once as (a, b)
# and once as (b, a). Keeping only the match with a.id < b.id reports each
# pair a single time.
motifs = g.find("(a)-[]->(b); (b)-[]->(a)").filter('a.id < b.id')
motifs.show()

+----------+--------------+
|         a|             b|
+----------+--------------+
|[b,Bob,36]|[c,Charlie,37]|
+----------+--------------+



In [18]:
# Find triangles, i.e. cycles a -> b -> c -> a.
# The same cycle can match once per rotation of (a, b, c), so only the
# rotation whose first vertex has the smallest id is kept.

triangle_motif = "(a)-[]->(b); (b)-[]->(c); (c)-[]->(a)"
smallest_id_first = "a.id < b.id AND a.id < c.id"
triangles = g.find(triangle_motif).filter(smallest_id_first)
triangles.show()

+------------+-------------+------------+
|           a|            b|           c|
+------------+-------------+------------+
|[a,Alice,34]|[e,Esther,32]|[d,David,29]|
+------------+-------------+------------+



In [22]:
# Physical plan of the triangle query (non-extended form)
triangles.explain(extended=False)

== Physical Plan ==
*Project [a#379, b#382, c#418]
+- *BroadcastHashJoin [c#418.id, a#379.id], [__tmp-6526019406657860729#458.src, __tmp-6526019406657860729#458.dst], Inner, BuildRight
   :- *Project [a#379, b#382, c#418]
   :  +- *BroadcastHashJoin [__tmp-430217833014886237#415.dst], [c#418.id], Inner, BuildRight, (a#379.id < c#418.id)
   :     :- *BroadcastHashJoin [b#382.id], [__tmp-430217833014886237#415.src], Inner, BuildRight
   :     :  :- *Project [a#379, b#382]
   :     :  :  +- *BroadcastHashJoin [__tmp-1043886091038848698#376.dst], [b#382.id], Inner, BuildRight, (a#379.id < b#382.id)
   :     :  :     :- *BroadcastHashJoin [__tmp-1043886091038848698#376.src], [a#379.id], Inner, BuildRight
   :     :  :     :  :- *Project [named_struct(src, src#7, dst, dst#8, relationship, relationship#9) AS __tmp-1043886091038848698#376]
   :     :  :     :  :  +- InMemoryTableScan [src#7, dst#8, relationship#9]
   :     :  :     :  :        +- InMemoryRelation [src#7, dst#8, relationship#9]

In [23]:
# Negation: an edge from a to b for which there is no edge back from b to a
oneway_motif = "(a)-[]->(b); !(b)-[]->(a)"
oneway = g.find(oneway_motif)
oneway.show()

+-------------+--------------+
|            a|             b|
+-------------+--------------+
| [a,Alice,34]| [e,Esther,32]|
|[e,Esther,32]|  [d,David,29]|
| [a,Alice,34]|    [b,Bob,36]|
| [g,Gabby,60]| [e,Esther,32]|
|[e,Esther,32]|  [f,Fanny,38]|
| [f,Fanny,38]|[c,Charlie,37]|
| [d,David,29]|  [a,Alice,34]|
+-------------+--------------+



In [24]:
# Find vertices without incoming edges. This is wrong:
# the motif consists of a negated term only, and GraphFrames rejects it with
# an InvalidPatternException. Negation is implemented as a subtraction, so
# presumably it needs a positive term to subtract from.
# The error is the point of this cell, so it is caught and printed instead of
# being left to abort a "Restart & Run All".
try:
    g.find('!()-[]->(a)').show()
except Exception as err:
    print(err)

Name: org.apache.toree.interpreter.broker.BrokerException
Message: Py4JJavaError: An error occurred while calling o64.find.
: org.graphframes.InvalidPatternException
	at org.graphframes.GraphFrame$.org$graphframes$GraphFrame$$findIncremental(GraphFrame.scala:892)
	at org.graphframes.GraphFrame.findSimple(GraphFrame.scala:441)
	at org.graphframes.GraphFrame.find(GraphFrame.scala:316)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.Gatewa

In [19]:
# Still doesn't work: no row of the joined table has inDegree = 0
in_degrees = g.inDegrees
g.vertices.join(in_degrees, 'id').filter('inDegree=0').show()

+---+----+---+--------+
| id|name|age|inDegree|
+---+----+---+--------+
+---+----+---+--------+



In [26]:
# Why? inDegree is computed by a groupBy on the edges followed by a count,
# so a vertex without incoming edges has no row at all (rather than a row with 0).
in_degrees = g.inDegrees
in_degrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|  f|       1|
|  e|       2|
|  d|       1|
|  c|       2|
|  b|       2|
|  a|       1|
+---+--------+



In [24]:
# Correct way: a left outer join keeps every vertex; vertices that are
# missing from inDegrees end up with a null inDegree.
vertices_with_in_degree = g.vertices.join(g.inDegrees, 'id', 'left_outer')
vertices_with_in_degree.filter('inDegree is null').show()

+---+-----+---+--------+
| id| name|age|inDegree|
+---+-----+---+--------+
|  g|Gabby| 60|    null|
+---+-----+---+--------+



In [28]:
# Or use subtract: take the ids that never occur in inDegrees,
# then join back to the vertices to recover the remaining columns.
ids_without_incoming = g.vertices.select('id').subtract(g.inDegrees.select('id'))
ids_without_incoming.join(g.vertices, 'id').show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  g|Gabby| 60|
+---+-----+---+



In [29]:
# More meaningful queries can be expressed by applying filters.
# Question: where is this filter applied?

mutual_edges = g.find("(a)-[]->(b); (b)-[]->(a)")
mutual_edges.filter("b.age > 36").show()

+----------+--------------+
|         a|             b|
+----------+--------------+
|[b,Bob,36]|[c,Charlie,37]|
+----------+--------------+



In [27]:
# Physical plan of the filtered motif query from the previous cell
mutual_with_older_b = g.find("(a)-[]->(b); (b)-[]->(a)").filter("b.age > 36")
mutual_with_older_b.explain()

== Physical Plan ==
*Project [a#1523, b#1526]
+- *SortMergeJoin [b#1526.id, a#1523.id], [__tmp4002724160828027973#1559.src, __tmp4002724160828027973#1559.dst], Inner
   :- *Sort [b#1526.id ASC NULLS FIRST, a#1523.id ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(b#1526.id, a#1523.id, 200)
   :     +- *Project [a#1523, b#1526]
   :        +- *SortMergeJoin [__tmp-8883747752747123924#1520.dst], [b#1526.id], Inner
   :           :- *Sort [__tmp-8883747752747123924#1520.dst ASC NULLS FIRST], false, 0
   :           :  +- Exchange hashpartitioning(__tmp-8883747752747123924#1520.dst, 200)
   :           :     +- *SortMergeJoin [__tmp-8883747752747123924#1520.src], [a#1523.id], Inner
   :           :        :- *Sort [__tmp-8883747752747123924#1520.src ASC NULLS FIRST], false, 0
   :           :        :  +- Exchange hashpartitioning(__tmp-8883747752747123924#1520.src, 200)
   :           :        :     +- *Project [named_struct(src, src#7, dst, dst#8, relationship, relationship

In [21]:
# Find chains of 4 vertices such that at least 2 of the 3 edges are "friend" relationships.
# The when function is similar to the CASE WHEN in SQL

chain4 = g.find("(a)-[e1]->(b); (b)-[e2]->(c); (c)-[e3]->(d)").where('a!=d')


def friendTo1(e):
    """Return a Column that is 1 when edge `e` is a 'friend' relationship and 0 otherwise."""
    return when(e['relationship'] == 'friend', 1).otherwise(0)


# Attach one 0/1 flag per edge, keep chains with at least two friend edges,
# and show only the four vertices of each chain.
chain4_flagged = chain4.select(
    '*',
    friendTo1(chain4['e1']).alias('f1'),
    friendTo1(chain4['e2']).alias('f2'),
    friendTo1(chain4['e3']).alias('f3'),
)
chain4_flagged.where('f1 + f2 + f3 >= 2').select('a', 'b', 'c', 'd').show()

+-------------+-------------+-------------+--------------+
|            a|            b|            c|             d|
+-------------+-------------+-------------+--------------+
|[e,Esther,32]| [d,David,29]| [a,Alice,34]|    [b,Bob,36]|
| [d,David,29]| [a,Alice,34]|[e,Esther,32]|  [f,Fanny,38]|
| [d,David,29]| [a,Alice,34]|   [b,Bob,36]|[c,Charlie,37]|
| [g,Gabby,60]|[e,Esther,32]| [d,David,29]|  [a,Alice,34]|
+-------------+-------------+-------------+--------------+



### Subgraphs

In [22]:
# Select subgraph of users older than 30, and edges of type "friend"
v2 = g.vertices.where("age > 30")
e2 = g.edges.where("relationship = 'friend'")
g2 = GraphFrame(v2, e2)

g2.vertices.show()
g2.edges.show()

# GraphFrames does not check whether a vertex is isolated (which is OK),
# nor whether every edge connects two existing vertices (which could cause bugs).

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 37|
|  e| Esther| 32|
|  f|  Fanny| 38|
|  g|  Gabby| 60|
+---+-------+---+

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  e|  d|      friend|
|  d|  a|      friend|
|  a|  e|      friend|
+---+---+------------+



In [33]:
# In-degrees of the subgraph, computed from its edges
g2_in_degrees = g2.inDegrees
g2_in_degrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|  e|       1|
|  d|       1|
|  b|       1|
|  a|       1|
+---+--------+



In [34]:
# Only keeping edges that connect existing vertices:
# one left semi join against the vertex table for each endpoint of the edge.
edges_with_known_src = e2.join(v2, e2['src'] == v2['id'], 'left_semi')
e3 = edges_with_known_src.join(v2, e2['dst'] == v2['id'], 'left_semi')
g3 = GraphFrame(v2, e3)

In [35]:
# Edges that survive the endpoint check
g3_edges = g3.edges
g3_edges.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  a|  e|      friend|
+---+---+------------+



In [33]:
# Select subgraph based on edges of type "follow"
# pointing from an older user to a younger user.
follow_to_younger = (
    g.find("(a)-[e]->(b)")
    .filter("e.relationship = 'follow'")
    .filter("a.age > b.age")
)
e4 = follow_to_younger.select("e.*")
e4.show()

# Only keeping vertices that appear in the edges, either as source or as destination
edge_sources = g.vertices.join(e4, g.vertices['id'] == e4['src'], 'leftsemi')
edge_destinations = g.vertices.join(e4, g.vertices['id'] == e4['dst'], 'leftsemi')
v4 = edge_sources.union(edge_destinations).distinct()

# Construct the subgraph
g4 = GraphFrame(v4, e4)
g4.vertices.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  g|  e|      follow|
|  f|  c|      follow|
|  c|  b|      follow|
+---+---+------------+



In [37]:
# Triplet view of the "follow a younger user" subgraph
g4_triplets = g4.triplets
g4_triplets.show()

+--------------+------------+--------------+
|           src|        edge|           dst|
+--------------+------------+--------------+
|[c,Charlie,37]|[c,b,follow]|    [b,Bob,36]|
|  [f,Fanny,38]|[f,c,follow]|[c,Charlie,37]|
|  [g,Gabby,60]|[g,e,follow]| [e,Esther,32]|
+--------------+------------+--------------+



### BFS

In [38]:
# Starting vertex is 'a'; layers[k] collects the vertices first reached after k hops
start_layer = g.vertices.select('id').where("id = 'a'")
layers = [start_layer]
visited = start_layer

# Keep expanding until the most recent layer is empty
while layers[-1].count() > 0:
    frontier = layers[-1]
    # From the current layer, get all the one-hop neighbors
    neighbors = frontier.join(g.edges, frontier['id'] == g.edges['src'])
    # Rename the column as 'id', and remove visited vertices and duplicates
    next_layer = neighbors.select(neighbors['dst'].alias('id')).subtract(visited).distinct()
    layers.append(next_layer)
    visited = visited.union(next_layer)

In [39]:
# Layer 0: the start vertex itself
bfs_layer_0 = layers[0]
bfs_layer_0.show()

+---+
| id|
+---+
|  a|
+---+



In [40]:
# Layer 1: vertices one hop away from the start vertex
bfs_layer_1 = layers[1]
bfs_layer_1.show()

+---+
| id|
+---+
|  e|
|  b|
+---+



In [68]:
# Layer 2: vertices first reached after two hops
bfs_layer_2 = layers[2]
bfs_layer_2.show()

+---+
| id|
+---+
|  f|
|  d|
|  c|
+---+



In [69]:
# Layer 3: empty, which is what ended the while loop
bfs_layer_3 = layers[3]
bfs_layer_3.show()

+---+
| id|
+---+
+---+



In [50]:
# GraphFrames provides its own BFS:
# search from the vertices matching the first expression
# to the vertices matching the second one.

paths = g.bfs(fromExpr="id = 'a'", toExpr="age > 36")
paths.show()

+------------+------------+-------------+------------+--------------+
|        from|          e0|           v1|          e1|            to|
+------------+------------+-------------+------------+--------------+
|[a,Alice,34]|[a,e,friend]|[e,Esther,32]|[e,f,follow]|  [f,Fanny,38]|
|[a,Alice,34]|[a,b,friend]|   [b,Bob,36]|[b,c,follow]|[c,Charlie,37]|
+------------+------------+-------------+------------+--------------+



### List Ranking

In [34]:
# The list is given as (node, successor) pairs; -1 denotes end of list
data = [(0, 5), (1, 0), (3, 4), (4, 6), (5, -1), (6,1)]
# NOTE: from here on `e` and `v` hold the list, no longer the social graph built at the top
e = spark.createDataFrame(data, ['src', 'dst'])

# Initial value d per node: 0 when its successor is the end marker, 1 otherwise
initial_d = when(col('dst') == -1, 0).otherwise(1)
v = e.select(col('src').alias('id'), initial_d.alias('d'))

# The end marker -1 is added as a vertex of its own with d = 0
v1 = spark.createDataFrame([(-1, 0)], ['id', 'd'])
v = v.union(v1)

v.show()
e.show()

+---+---+
| id|  d|
+---+---+
|  0|  1|
|  1|  1|
|  3|  1|
|  4|  1|
|  5|  0|
|  6|  1|
| -1|  0|
+---+---+

+---+---+
|src|dst|
+---+---+
|  0|  5|
|  1|  0|
|  3|  4|
|  4|  6|
|  5| -1|
|  6|  1|
+---+---+



In [35]:
# Repeat until every edge points at the end marker -1.
# Each round adds the successor's d to the node's own d and then lets the
# node point at its successor's successor.
while e.where('dst != -1').count() > 0:
    # NOTE: this rebinds `g`, which earlier cells use for the social graph
    g = GraphFrame(v, e)
    g.cache()

    # New d = own d + d of the current successor; the end-marker vertex is re-added
    summed_d = g.triplets.select(
        col('src.id').alias('id'),
        (col('src.d') + col('dst.d')).alias('d'),
    )
    v = summed_d.union(v1)

    # a -> b -> c becomes a -> c; edges that already reach -1 are carried over unchanged
    two_hop_edges = g.find('(a)-[]->(b); (b)-[]->(c)').select(
        col('a.id').alias('src'),
        col('c.id').alias('dst'),
    )
    e = two_hop_edges.union(e.filter('dst = -1'))

v.show()

+---+---+
| id|  d|
+---+---+
|  1|  2|
|  3|  5|
|  4|  4|
|  6|  3|
|  0|  1|
|  5|  0|
| -1|  0|
+---+---+

