In [3]:
# The Python packaging for Spark is not intended to replace all of the other use cases. 
# This Python packaged version of Spark is suitable for interacting with an existing cluster (be it Spark standalone, YARN, or Mesos) - but does not contain the tools required to setup your own standalone Spark cluster. 
# You can download the full version of Spark from the Apache Spark downloads page.
!pip3 install pyspark==3.0.0
!pip3 install psycopg2-binary==2.8.5

Processing /Users/pmacharl/Library/Caches/pip/wheels/60/74/91/22826adce98cd838d5258b7a0b245d5369a5164d42852d9a36/pyspark-3.0.0-py2.py3-none-any.whl
Collecting py4j==0.10.9
  Using cached py4j-0.10.9-py2.py3-none-any.whl (198 kB)
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.0
You should consider upgrading via the '/usr/local/opt/python@3.8/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [None]:
# Set the variables below in your shell profile (adjust the paths to your own installation)
# export SPARK_HOME=/Users/pmacharl/spark-2.4.4-bin-hadoop2.7
# export PATH=$PATH:$SPARK_HOME/bin
# export PYSPARK_SUBMIT_ARGS="pyspark-shell"
# export PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3
# export PYSPARK_PYTHON=/usr/local/bin/python3

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# SparkConf API reference:
# https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.SparkConf
config = SparkConf()
config.set("spark.driver.memory", "2g")
config.set("spark.executor.memory", "1g")

# Because you are likely running in local mode, it is good practice to set the number of
# shuffle partitions to something that fits local mode. The default is 200, but there
# aren't many executors on this machine, so it's worth reducing this to 5.
config.set("spark.sql.shuffle.partitions", "5")

# Cluster mode
# https://spark.apache.org/docs/latest/submitting-applications.html
# The session is created with .master("local") in the next cell, which takes precedence over
# a master set on this conf, so the standalone-cluster URL is kept only as an opt-in example.
# config.setMaster("spark://192.168.0.4:7077") # If spark is started in local cluster mode

# Display the conf object as the cell result
config

<pyspark.conf.SparkConf at 0x10e4b9370>

In [2]:
# Build the session from the conf above: local master, application name "MyApp".
spark = (
    SparkSession.builder
    .config(conf=config)
    .master("local")
    .appName("MyApp")
    .getOrCreate()
)
spark

- In the simplest form, the default data source (parquet unless otherwise configured by spark.sql.sources.default) will be used for all operations

In [3]:
# No format is passed, so the reader uses the default data source.
df = spark.read.load(
    path="./data/height_weight.parquet",
)
df.show()

+---+------+------+-----+-----+
|sex|weight|height|repwt|repht|
+---+------+------+-----+-----+
|  M|    77|   182| 77.0|180.0|
|  F|    58|   161| 51.0|159.0|
|  F|    53|   161| 54.0|158.0|
|  M|    68|   177| 70.0|175.0|
|  F|    59|   157| 59.0|155.0|
|  M|    76|   170| 76.0|165.0|
|  M|    76|   167| 77.0|165.0|
|  M|    69|   186| 73.0|180.0|
|  M|    71|   178| 71.0|175.0|
|  M|    65|   171| 64.0|170.0|
|  M|    70|   175| 75.0|174.0|
|  F|   166|    57| 56.0|163.0|
|  F|    51|   161| 52.0|158.0|
|  F|    64|   168| 64.0|165.0|
|  F|    52|   163| 57.0|160.0|
|  F|    65|   166| 66.0|165.0|
|  M|    92|   187|101.0|185.0|
|  F|    62|   168| 62.0|165.0|
|  M|    76|   197| 75.0|200.0|
|  F|    61|   175| 61.0|171.0|
+---+------+------+-----+-----+
only showing top 20 rows



# Preferred way is to be explicit
- You can also manually specify the data source that will be used, along with any extra options that you would like to pass to the data source. Data sources are specified by their fully qualified name (e.g., org.apache.spark.sql.parquet), but for built-in sources you can also use their short names (json, parquet, jdbc, orc, libsvm, csv, text). DataFrames loaded from any data source type can be converted into other types using this syntax.

In [4]:
# Name the data source explicitly instead of relying on the default.
df = (
    spark.read
    .format("parquet")
    .load("./data/height_weight.parquet")
)
df.show()

+---+------+------+-----+-----+
|sex|weight|height|repwt|repht|
+---+------+------+-----+-----+
|  M|    77|   182| 77.0|180.0|
|  F|    58|   161| 51.0|159.0|
|  F|    53|   161| 54.0|158.0|
|  M|    68|   177| 70.0|175.0|
|  F|    59|   157| 59.0|155.0|
|  M|    76|   170| 76.0|165.0|
|  M|    76|   167| 77.0|165.0|
|  M|    69|   186| 73.0|180.0|
|  M|    71|   178| 71.0|175.0|
|  M|    65|   171| 64.0|170.0|
|  M|    70|   175| 75.0|174.0|
|  F|   166|    57| 56.0|163.0|
|  F|    51|   161| 52.0|158.0|
|  F|    64|   168| 64.0|165.0|
|  F|    52|   163| 57.0|160.0|
|  F|    65|   166| 66.0|165.0|
|  M|    92|   187|101.0|185.0|
|  F|    62|   168| 62.0|165.0|
|  M|    76|   197| 75.0|200.0|
|  F|    61|   175| 61.0|171.0|
+---+------+------+-----+-----+
only showing top 20 rows



# Read csv/text

In [5]:
# CSV source: comma separated, first line is the header, column types are inferred.
df = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load("../height_weight.csv")
)
df.show()

+---+------+------+-----+-----+
|sex|weight|height|repwt|repht|
+---+------+------+-----+-----+
|  M|    77|   182|   77|  180|
|  F|    58|   161|   51|  159|
|  F|    53|   161|   54|  158|
|  M|    68|   177|   70|  175|
|  F|    59|   157|   59|  155|
|  M|    76|   170|   76|  165|
|  M|    76|   167|   77|  165|
|  M|    69|   186|   73|  180|
|  M|    71|   178|   71|  175|
|  M|    65|   171|   64|  170|
|  M|    70|   175|   75|  174|
|  F|   166|    57|   56|  163|
|  F|    51|   161|   52|  158|
|  F|    64|   168|   64|  165|
|  F|    52|   163|   57|  160|
|  F|    65|   166|   66|  165|
|  M|    92|   187|  101|  185|
|  F|    62|   168|   62|  165|
|  M|    76|   197|   75|  200|
|  F|    61|   175|   61|  171|
+---+------+------+-----+-----+
only showing top 20 rows



# Read json

In [6]:
# JSON source; no extra options are passed.
df = spark.read.format("json").load("./data/2015-summary.json")
df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

# Read excel
- Spark doesn't have built-in support for Excel
- Read the file with another library (e.g. pandas) and convert the result to a Spark DataFrame

In [8]:
# Converting the pandas frame without a schema fails with:
# TypeError: field Category 3: Can not merge type <class 'pyspark.sql.types.StringType'> and <class 'pyspark.sql.types.DoubleType'>
# Declaring the schema explicitly avoids these type-conversion problems.
import pandas as pd
from pyspark.sql.types import *
pandas_df = pd.read_excel('./data/usa_email_sample_db.xlsx')
# spark.conf.set("spark.sql.execution.arrow.enabled", "false")

# https://spark.apache.org/docs/latest/api/python/_modules/pyspark/sql/types.html
# Every column is a nullable string except "Postal", which is a nullable double.
mySchema = StructType([
    StructField(
        column_name,
        DoubleType() if column_name == "Postal" else StringType(),
        True,
    )
    for column_name in (
        "Business Name",
        "Email",
        "Category",
        "Category 2",
        "Category 3",
        "Address",
        "City",
        "State",
        "Postal",
        "Phone",
        "Website",
    )
])

df = spark.createDataFrame(pandas_df, schema=mySchema)
df.show()

+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+----------------+-----+-------+--------------+--------------------+
|       Business Name|         Email|            Category|          Category 2|          Category 3|             Address|            City|State| Postal|         Phone|             Website|
+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+----------------+-----+-------+--------------+--------------------+
|Stone Cove Marina...| NOT IN SAMPLE|               Docks|             Marinas|       Dock Builders|   134 Salt Pond Rd |       Wakefield|   RI| 2879.0|(401) 783-8990|http://stonecovem...|
|     Bluehaven Homes| NOT IN SAMPLE| General Contractors|       Home Builders|                 NaN|       5701 Time Sq |        Amarillo|   TX|79119.0|(806) 452-2545|http://www.blueha...|
|Michael Jays Tatt...| NOT IN SAMPLE|            Jewele

# Read partitioned data

In [9]:
# Point the parquet reader at the dataset's top-level directory.
# NOTE(review): the year/month/day columns in the output presumably come from the
# partition directory names under ./data/retailer - confirm against the folder layout.
df = (
    spark.read
    .format("parquet")
    .load("./data/retailer")
)
df.show()

+----------+------------+-------------+--------------------+--------------------+---------+--------------------+--------+-----------------+----+-----+---+
|ScrapeDate|Manufacturer|     Retailer|               Brand|          CleanBrand|FaceValue|        OfferDetails| Expires|__index_level_0__|year|month|day|
+----------+------------+-------------+--------------------+--------------------+---------+--------------------+--------+-----------------+----+-----+---+
|04/20/2020|          NA|DollarGeneral|      Dollar General|      Dollar General|     5.00|SAVE $5.00 when y...|04/25/20|                1|2020|    4| 20|
|04/20/2020|          NA|DollarGeneral|      Dollar General|      Dollar General|     2.00|SAVE $2.00 when y...|01/31/21|                2|2020|    4| 20|
|04/20/2020|          NA|DollarGeneral|                Gain|                Gain|     1.00|SAVE $1.00 on ONE...|04/25/20|                3|2020|    4| 20|
|04/20/2020|          NA|DollarGeneral|  TIDE LAUNDRY GROUP|  TIDE LAU

# Run SQL on files directly

In [10]:
# Query the parquet file in place: the table reference is `format`.`path`.
df = spark.sql(
    "SELECT * FROM parquet.`./data/height_weight.parquet`"
)
df.show()

+---+------+------+-----+-----+
|sex|weight|height|repwt|repht|
+---+------+------+-----+-----+
|  M|    77|   182| 77.0|180.0|
|  F|    58|   161| 51.0|159.0|
|  F|    53|   161| 54.0|158.0|
|  M|    68|   177| 70.0|175.0|
|  F|    59|   157| 59.0|155.0|
|  M|    76|   170| 76.0|165.0|
|  M|    76|   167| 77.0|165.0|
|  M|    69|   186| 73.0|180.0|
|  M|    71|   178| 71.0|175.0|
|  M|    65|   171| 64.0|170.0|
|  M|    70|   175| 75.0|174.0|
|  F|   166|    57| 56.0|163.0|
|  F|    51|   161| 52.0|158.0|
|  F|    64|   168| 64.0|165.0|
|  F|    52|   163| 57.0|160.0|
|  F|    65|   166| 66.0|165.0|
|  M|    92|   187|101.0|185.0|
|  F|    62|   168| 62.0|165.0|
|  M|    76|   197| 75.0|200.0|
|  F|    61|   175| 61.0|171.0|
+---+------+------+-----+-----+
only showing top 20 rows



In [11]:
# Not the recommended way to read files: the reader APIs used earlier are a better fit,
# and SQL is meant for querying tables (temp or global).
# No reader options are given here, so in the output the header line shows up as a data
# row and the columns are named _c0.._c4.
df = spark.sql(
    "SELECT * FROM csv.`../height_weight.csv`"
)
df.show()

+---+------+------+-----+-----+
|_c0|   _c1|   _c2|  _c3|  _c4|
+---+------+------+-----+-----+
|sex|weight|height|repwt|repht|
|  M|    77|   182|   77|  180|
|  F|    58|   161|   51|  159|
|  F|    53|   161|   54|  158|
|  M|    68|   177|   70|  175|
|  F|    59|   157|   59|  155|
|  M|    76|   170|   76|  165|
|  M|    76|   167|   77|  165|
|  M|    69|   186|   73|  180|
|  M|    71|   178|   71|  175|
|  M|    65|   171|   64|  170|
|  M|    70|   175|   75|  174|
|  F|   166|    57|   56|  163|
|  F|    51|   161|   52|  158|
|  F|    64|   168|   64|  165|
|  F|    52|   163|   57|  160|
|  F|    65|   166|   66|  165|
|  M|    92|   187|  101|  185|
|  F|    62|   168|   62|  165|
|  M|    76|   197|   75|  200|
+---+------+------+-----+-----+
only showing top 20 rows



# Write to files

In [12]:
# Load the parquet data through SQL, then write it back out as CSV with a header row.
df = spark.sql(
    "SELECT * FROM parquet.`./data/height_weight.parquet`"
)
(
    df.write
    .format("csv")
    .option("header", "true")
    .save("height_weight.csv")
)
# Variant that replaces existing output:
# df.write.format("csv").mode("overwrite").option("header", "true").save("height_weight.csv")
# All SaveModes (append, overwrite etc.):
# https://spark.apache.org/docs/latest/sql-data-sources-load-save-functions.html

In [34]:
# Delete the CSV output written by the previous cell (Spark writes a directory, not a single file).
# The existence check makes this cell safe to re-run or to run when nothing has been written yet;
# without it shutil.rmtree raises FileNotFoundError for a missing path.
import os
import shutil
if os.path.isdir('height_weight.csv'):
    shutil.rmtree('height_weight.csv')

# Save to Persistent Tables
- DataFrames can also be saved as persistent tables into Hive metastore using the saveAsTable command. 
- Notice that an existing Hive deployment is not necessary to use this feature. 
- Spark will create a default local Hive metastore (using Derby) for you. 
- Unlike the createOrReplaceTempView command, saveAsTable will materialize the contents of the DataFrame and create a pointer to the data in the Hive metastore. 
- Persistent tables will still exist even after your Spark program has restarted, as long as you maintain your connection to the same metastore. 
- A DataFrame for a persistent table can be created by calling the table method on a SparkSession with the name of the table.
- For file-based data source, e.g. text, parquet, json, etc. you can specify a custom table path via the path option, e.g. df.write.option("path", "/some/path").saveAsTable("t"). 
- When the table is dropped, the custom table path will not be removed and the table data is still there. 
- If no custom table path is specified, Spark will write data to a default table path under the warehouse directory. When the table is dropped, the default table path will be removed too.

In [35]:
# Tables are stored in parquet format. hive metastore gets created
# Read the CSV (header row, inferred types) and save it as table "height_weight",
# with the table data placed under the custom path ./apache_table.
df = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load("../height_weight.csv")
)
df.write.saveAsTable("height_weight", path="./apache_table")

In [None]:
# Shut down the current Spark session. Any later cell that uses `spark` needs a session
# to be created again first (the Spark JDBC section builds one with getOrCreate()).
spark.stop()

# Read a large file - Spark vs. Pandas

- Observe that year_count.csv is not a single file, but a directory containing many CSV part files (one per output partition)
- Apache Spark's power comes from distributed processing - many machines, many files, partitions, chunks

In [None]:
%%time
# os.environ['AWS_ACCESS_KEY_ID'] = 
# os.environ['AWS_SECRET_ACCESS_KEY'] = 
df = spark.read.load("../companies_sorted.csv", format="csv", inferSchema="true", header="true")
df_grouped = df.groupBy("year founded").agg({"name":"count"}) # .show() will only print few values. That is lazy loading
df_grouped.write.format("csv").option("header", "true").save("year_count.csv") # This will ensure entire processing is done

- Pandas is designed for a single machine
- The result is a single in-memory table rather than many partition files

In [29]:
%%time
import pandas as pd
df = pd.read_csv("../companies_sorted.csv")
df.groupby(["year founded","size range"]).agg(COUNT_BY_YEAR=("name", "count"))

CPU times: user 19.3 s, sys: 1.66 s, total: 20.9 s
Wall time: 21 s


Unnamed: 0_level_0,Unnamed: 1_level_0,COUNT_BY_YEAR
year founded,size range,Unnamed: 2_level_1
1451.0,5001 - 10000,1
1670.0,1001 - 5000,1
1775.0,1 - 10,1
1775.0,11 - 50,1
1775.0,51 - 200,1
...,...,...
2023.0,11 - 50,1
2025.0,1 - 10,4
2027.0,1 - 10,1
2029.0,51 - 200,1


# Spark JDBC
- [Spark Docs](https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html)
- Drop postgres jdbc jar in SPARK_HOME/jars
- Download jar from [here](https://jdbc.postgresql.org/download.html)
- For postgres 9.6, download [4.2](https://jdbc.postgresql.org/download/postgresql-42.2.12.jar)

# Sample Database in Class1
- Supports sqlalchemy
- Connection string: dialect+driver://username:password@host:port/database
- https://docs.sqlalchemy.org/en/13/core/engines.html
- Bring up the sample database: `cd class1_explore/postgres_sample_pagila && docker-compose up`
- Use any sql client of your choice - dbeaver, dbvisualizer, pgAdmin etc.
- The ER diagram is in the class1_explore/postgres_sample_pagila folder

In [4]:
# Constants
# Database credentials come from the environment, falling back to 'postgres' for both
# when the variables are not set.
import os
from urllib.parse import quote_plus
DB_USER = os.getenv('DB_USER','postgres')
DB_PASSWORD = os.getenv('DB_PASSWORD','postgres')
# URL-escape the credentials: characters such as '@', ':' or '/' in a user name or password
# would otherwise be parsed as URL delimiters and corrupt the connection string.
CONNECTION_STRING = "postgresql://{0}:{1}@localhost/pagila".format(quote_plus(DB_USER), quote_plus(DB_PASSWORD))

In [5]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# SparkConf API reference:
# https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.SparkConf
config = (
    SparkConf()
    .set("spark.driver.memory", "2g")
    .set("spark.executor.memory", "1g")
)
spark = (
    SparkSession.builder
    .config(conf=config)
    .master("local")
    .appName("MyApp")
    .getOrCreate()
)
spark

# Cluster mode
# https://spark.apache.org/docs/latest/submitting-applications.html
# config.setMaster("spark://192.168.0.6:7077") # If spark is started in local cluster mode

In [6]:
# JDBC URL format (credentials may also be embedded in the URL):
# jdbc:postgresql://localhost/test?user=fred&password=secret
# Read the public.customer table of the pagila database over JDBC.
# The credentials come from the DB_USER / DB_PASSWORD constants defined in the Constants cell
# rather than being repeated as literals here.
jdbcDF = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost/pagila") \
    .option("dbtable", "public.customer") \
    .option("user", DB_USER) \
    .option("password", DB_PASSWORD) \
    .load()

jdbcDF.show()

# Alternative using the jdbc() shortcut (kept as comments so that it is not echoed as cell output):
# jdbcDF2 = spark.read \
#     .jdbc("jdbc:postgresql:dbserver", "schema.tablename",
#           properties={"user": "username", "password": "password"})

+-----------+--------+----------+---------+--------------------+----------+----------+-----------+-------------------+------+
|customer_id|store_id|first_name|last_name|               email|address_id|activebool|create_date|        last_update|active|
+-----------+--------+----------+---------+--------------------+----------+----------+-----------+-------------------+------+
|          1|       1|      MARY|    SMITH|MARY.SMITH@sakila...|         5|      true| 2017-02-14|2017-02-15 04:57:20|     1|
|          2|       1|  PATRICIA|  JOHNSON|PATRICIA.JOHNSON@...|         6|      true| 2017-02-14|2017-02-15 04:57:20|     1|
|          3|       1|     LINDA| WILLIAMS|LINDA.WILLIAMS@sa...|         7|      true| 2017-02-14|2017-02-15 04:57:20|     1|
|          4|       2|   BARBARA|    JONES|BARBARA.JONES@sak...|         8|      true| 2017-02-14|2017-02-15 04:57:20|     1|
|          5|       1| ELIZABETH|    BROWN|ELIZABETH.BROWN@s...|         9|      true| 2017-02-14|2017-02-15 04:57:20|

' jdbcDF2 = spark.read     .jdbc("jdbc:postgresql:dbserver", "schema.tablename",\n          properties={"user": "username", "password": "password"})\n'