# Spark with Postgres in Jupyter Notebooks

Two things need special attention:

- Set the Spark configuration key `spark.jars.packages` to `org.postgresql:postgresql:42.7.4` so that Spark downloads the Postgres JDBC driver artifact.
- Load the installed `sql` extension with `%load_ext sql` so that SQL queries can be run in a cell using the `%%sql` cell magic.

In [None]:
# Register the `sql` IPython extension so the `%sql` / `%%sql` magics used below are available.
%load_ext sql

In [None]:
%%bash
# Start the Postgres container used by this notebook.
# `docker run --name` fails when a container with that name already exists,
# so reuse (and start, if it is stopped) an existing one; this keeps the cell
# re-runnable after a kernel restart.
if docker container inspect jupyter_postgres >/dev/null 2>&1; then
    docker start jupyter_postgres
else
    docker run --name jupyter_postgres -p 5432:5432 -e POSTGRES_PASSWORD=secret -d postgres
fi

In [None]:
%%bash
# Block until the Postgres server inside the `jupyter_postgres` container accepts
# TCP connections, so the cells that connect to it do not race the container start-up.
# (Running `docker run --name jupyter_postgres` a second time would only fail with a
# name conflict.)
# The check goes through 127.0.0.1 rather than the Unix socket so that a server
# that is only reachable locally during initialisation is not reported as ready.
for attempt in $(seq 1 30); do
    if docker exec jupyter_postgres pg_isready -h 127.0.0.1 -U postgres >/dev/null 2>&1; then
        echo "Postgres is ready"
        exit 0
    fi
    sleep 1
done

# A non-zero exit status makes the %%bash cell fail visibly instead of continuing.
echo "Postgres did not become ready within 30 seconds" >&2
exit 1

In [None]:
# Connect the `%sql` magics to the `postgres` database on localhost:5432 as user
# `postgres`; the password matches the POSTGRES_PASSWORD given to `docker run`.
%sql postgresql://postgres:secret@localhost:5432/postgres

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Row
from pyspark.sql.window import Window

# Assemble the session settings step by step, then create (or reuse) the session.
session_builder = SparkSession.builder.appName("PostgresExample").master("local[*]")
session_builder = session_builder.config("spark.ui.enabled", "true")
# Maven coordinates of the Postgres JDBC driver, so Spark downloads the artifact.
session_builder = session_builder.config(
    "spark.jars.packages", "org.postgresql:postgresql:42.7.4"
)
spark = session_builder.getOrCreate()

# Display the Spark UI URL (useful for monitoring and debugging).
spark.sparkContext.uiWebUrl

In [None]:
# Explicit column layout for the superheroes data; every field is declared non-nullable.
schema = (
    T.StructType()
    .add("id", T.IntegerType(), nullable=False)
    .add("hero_name", T.StringType(), nullable=False)
    .add("secret_identity", T.StringType(), nullable=False)
    .add("power_level", T.IntegerType(), nullable=False)
)

In [None]:
# Read the CSV with the explicit schema, treating the first line of the file as a header.
raw_df = spark.read.csv("data/marvel.csv", schema=schema, header=True)

# Expose the frame to Spark SQL under the name `superheroes_raw`.
raw_df.createOrReplaceTempView("superheroes_raw")

In [None]:
# Settings for the JDBC sink: the `postgres` database on localhost:5432,
# target table `public.superheroes`, and the Postgres driver class.
jdbc_options = {
    "url": "jdbc:postgresql://localhost:5432/postgres",
    "dbtable": "public.superheroes",
    "user": "postgres",
    "password": "secret",
    "driver": "org.postgresql.Driver",
}

# Save mode "overwrite" replaces the table if it already exists, so the cell can be re-run.
raw_df.write.format("jdbc").options(**jdbc_options).mode("overwrite").save()

In [None]:
%%sql
-- Show all rows of the superheroes table written through JDBC.
select * from superheroes;

In [None]:
# Run the query through the `%sql` line magic and keep its result in `result` for use from Python.
result = %sql select * from superheroes;

In [None]:
# `hero_name` column of the first row in `result`.
# NOTE(review): assumes the query returned at least one row — confirm the CSV is not empty.
result[0].hero_name