In [1]:
# Echo the session object the cells below rely on (spark.conf / spark.read /
# spark.sql). Presumably this is the SparkSession pre-created by the notebook
# kernel — it is not constructed anywhere in this notebook.
spark

In [2]:
# Session-level settings for the BigQuery connector. "<dataset>" is a
# placeholder value — presumably to be replaced with a real dataset name
# before running (TODO confirm).
spark.conf.set("viewsEnabled", "true")
spark.conf.set("materializationDataset", "<dataset>")

# Reference the public Ethereum transactions table through the "bigquery"
# data source.
eth_tx = (
    spark.read.format("bigquery")
    .option("table", "bigquery-public-data:crypto_ethereum.transactions")
    .load()
)

# Register the DataFrame under the name the SQL cells query.
eth_tx.createOrReplaceTempView("eth_tx")


In [3]:
# Select up to 1000 transaction hashes from blocks timestamped on 2022-01-11.
#
# The day filter is written as a half-open range directly on `block_timestamp`
# rather than `DATE(block_timestamp) = ...`: a plain column-vs-literal
# comparison can be handed to the data source as a filter, whereas a predicate
# on a cast of the column generally cannot and would force a much larger read.
# Both forms select the same rows (the day boundaries are interpreted in the
# Spark session time zone either way).
#
# NOTE: LIMIT without ORDER BY does not guarantee *which* 1000 rows are
# returned, so the sample may differ between runs.
sql_hash = """
SELECT hash
FROM eth_tx
WHERE block_timestamp >= TIMESTAMP '2022-01-11 00:00:00'
  AND block_timestamp <  TIMESTAMP '2022-01-12 00:00:00'
LIMIT 1000
"""
tx_hash = spark.sql(sql_hash)

In [4]:
# Preview the query result. show() prints only the first 20 rows and truncates
# long strings, which is why the hashes in the recorded output end in "...".
tx_hash.show()

+--------------------+
|                hash|
+--------------------+
|0xa392ab203bfc025...|
|0xa9ffd3c88c67cec...|
|0x068a6584497429a...|
|0x43a43b3bc920254...|
|0xb921f7ac46897de...|
|0xb36bc7f3e4ca4e2...|
|0xe31ec55171f80b7...|
|0xd363efa73dab281...|
|0x2707edb4935d38c...|
|0xc3f71ab9555b100...|
|0x17d501ef2c699ad...|
|0x88b5da108c1267b...|
|0x0728af5c865cfe6...|
|0xedeb629ecfbb90a...|
|0xe1ee7be81df3619...|
|0xb33037cdb96ebf9...|
|0x5cba1e9891cd5a9...|
|0x3063d535cab1cf9...|
|0x0cc9b161e6b87cb...|
|0xa51aa88997dc02d...|
+--------------------+
only showing top 20 rows



In [5]:
# Print the result's schema: a single non-nullable string column, `hash`.
tx_hash.printSchema()

root
 |-- hash: string (nullable = false)



In [4]:
from graphframes import GraphFrame

In [3]:
!pip install graphframes

Collecting graphframes
  Downloading https://files.pythonhosted.org/packages/0b/27/c7c7e1ced2fe9a905f865dd91faaec2ac8a8e313f511678c8ec92a41a153/graphframes-0.6-py2.py3-none-any.whl
Installing collected packages: graphframes
Successfully installed graphframes-0.6


In [5]:
from graphframes import GraphFrame
from graphframes.examples import Graphs


In [6]:
# Vertex table for the toy social graph: one row per person.
# GraphFrame vertices are keyed by the "id" column.
vertex_rows = [
    ("a", "Alice", 34),
    ("b", "Bob", 36),
    ("c", "Charlie", 30),
    ("d", "David", 29),
    ("e", "Esther", 32),
    ("f", "Fanny", 36),
    ("g", "Gabby", 60),
]
vertices = spark.createDataFrame(vertex_rows, schema=["id", "name", "age"])


In [7]:
# Edge table for the toy social graph: one directed edge per row, from "src"
# to "dst", labelled with the kind of relationship.
edge_rows = [
    ("a", "b", "friend"),
    ("b", "c", "follow"),
    ("c", "b", "follow"),
    ("f", "c", "follow"),
    ("e", "f", "follow"),
    ("e", "d", "friend"),
    ("d", "a", "friend"),
    ("a", "e", "friend"),
]
edges = spark.createDataFrame(edge_rows, schema=["src", "dst", "relationship"])


In [8]:
# Combine the vertex and edge DataFrames into a single graph object.
g = GraphFrame(vertices, edges)


In [10]:
# Print the graph's summary: the column schemas of its vertex (v) and edge (e)
# DataFrames.
print(g)


GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])


In [11]:
# Build the "friends" example graph that ships with graphframes.examples.
# Its printed summary has the same vertex/edge schema as `g`; presumably it
# holds the same data as the hand-built graph (not verified in this notebook).
# NOTE: `Graphs` is already imported in an earlier cell; this import is redundant.
from graphframes.examples import Graphs
same_g = Graphs(spark).friends()
print(same_g)


GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])


In [13]:
# Display the graph's vertex DataFrame (id, name, age).
g.vertices.show()


+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 30|
|  d|  David| 29|
|  e| Esther| 32|
|  f|  Fanny| 36|
|  g|  Gabby| 60|
+---+-------+---+



In [14]:
# Display the graph's edge DataFrame (src, dst, relationship).
g.edges.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  b|  c|      follow|
|  c|  b|      follow|
|  f|  c|      follow|
|  e|  f|      follow|
|  e|  d|      friend|
|  d|  a|      friend|
|  a|  e|      friend|
+---+---+------------+



In [15]:
# Display the number of incoming edges per vertex. Vertices with no incoming
# edge are omitted: "g" never appears as a `dst`, so it has no row in the output.
g.inDegrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|  f|       1|
|  e|       1|
|  d|       1|
|  c|       2|
|  b|       2|
|  a|       1|
+---+--------+



In [17]:
# Compute strongly connected components, capping the algorithm at 10 iterations.
result = g.stronglyConnectedComponents(maxIter=10)
# Show which component each vertex landed in. The component values are opaque
# labels: only equality matters. In the recorded output, a/d/e share one label,
# b/c share another, and f and g are each alone.
result.select("id", "component").show()


+---+-------------+
| id|    component|
+---+-------------+
|  g| 146028888064|
|  b|1047972020224|
|  e| 670014898176|
|  a| 670014898176|
|  f| 412316860416|
|  d| 670014898176|
|  c|1047972020224|
+---+-------------+

