In [1]:
import importlib.util
import os
import sys

# Make the Spark driver and workers use the same interpreter as this kernel.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Point SPARK_HOME / HADOOP_HOME at the pyspark package installed in the
# *running* environment instead of a hard-coded, user-specific absolute path.
# find_spec only looks the package up; it does not import it.
_pyspark_spec = importlib.util.find_spec("pyspark")
if _pyspark_spec is not None and _pyspark_spec.submodule_search_locations:
    _pyspark_dir = os.path.abspath(next(iter(_pyspark_spec.submodule_search_locations)))
    os.environ['SPARK_HOME'] = _pyspark_dir
    # HADOOP_HOME deliberately mirrors SPARK_HOME, as in the original setup.
    # NOTE(review): presumably winutils.exe lives in <pyspark>/bin on Windows — confirm.
    os.environ['HADOOP_HOME'] = _pyspark_dir
    # NOTE(review): sys.path only affects Python module lookup; if the intent
    # was to expose executables/DLLs in bin, PATH would be the place — confirm.
    sys.path.append(os.path.join(_pyspark_dir, "bin"))
else:
    # Leave SPARK_HOME / HADOOP_HOME untouched; the pyspark import in the
    # next cell will raise a clear ImportError.
    print(
        "pyspark was not found in this environment; "
        "SPARK_HOME and HADOOP_HOME were left unchanged.",
        file=sys.stderr,
    )

In [2]:
from graphframes import GraphFrame
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session. The GraphFrames jar is requested through
# spark.jars.packages, which takes Maven coordinates.
spark = (
    SparkSession.builder
    .appName("test.com")
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12")
    .getOrCreate()
)

In [3]:
# Vertex table: one row per person, with columns id, name and age.
vertices = spark.createDataFrame(
    data=[
        ("a", "Raman", 34),
        ("b", "Bob", 36),
        ("c", "Naveen", 30),
        ("d", "Kumar", 29),
    ],
    schema=["id", "name", "age"],
)

In [4]:
# Edge table: one row per directed link (src -> dst), labelled with its relationship.
edges = spark.createDataFrame(
    data=[
        ("a", "b", "friend"),
        ("b", "c", "follow"),
        ("c", "d", "friend"),
        ("d", "a", "follow"),
    ],
    schema=["src", "dst", "relationship"],
)

In [5]:
# Assemble the graph from the vertex and edge DataFrames defined above.
g = GraphFrame(v=vertices, e=edges)



In [6]:
# Print both tables backing the graph: vertices first, then edges.
for table in (g.vertices, g.edges):
    table.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  a| Raman| 34|
|  b|   Bob| 36|
|  c|Naveen| 30|
|  d| Kumar| 29|
+---+------+---+

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  b|  c|      follow|
|  c|  d|      friend|
|  d|  a|      follow|
+---+---+------------+



In [7]:
# For every vertex, compute the shortest-path length (in hops) to each of the
# landmark vertices "a" and "d"; the result is a per-vertex map landmark -> distance.
results = g.shortestPaths(landmarks=["a", "d"])
distances_by_vertex = results.select("id", "distances")
distances_by_vertex.show()



+---+----------------+
| id|       distances|
+---+----------------+
|  d|{a -> 1, d -> 0}|
|  c|{a -> 2, d -> 1}|
|  b|{a -> 3, d -> 2}|
|  a|{a -> 0, d -> 3}|
+---+----------------+



In [9]:
# Stop the SparkSession now that the demo is finished.
# Cells that use `spark` need a fresh session (re-run the builder cell) after this.
spark.stop()