# Spark in Action - Chapter 2 Python Version

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import os

In [None]:
# Locate the authors CSV relative to the directory the notebook runs from.
# (A notebook has no __file__, so the working directory is used as the base.)
relative_path = "net.jgp.books.spark.ch02/data/authors.csv"
current_dir = os.getcwd()
absolute_file_path = os.path.join(current_dir, relative_path)

In [None]:
# Display the resolved CSV path so the reader can check where the file is expected.
absolute_file_path

'/Users/development/ml/Spark/net.jgp.books.spark.ch02/data/authors.csv'

In [None]:
# Creates a session on a local master. The SQLite JDBC jar kept under ./jars
# is registered both as a Spark jar and on the driver classpath.
sqlite_jdbc_jar = "{}/jars/sqlite-jdbc-3.36.0.3.jar".format(os.getcwd())
spark = (
    SparkSession.builder
    .appName("CSV to DB")
    .master("local")
    .config("spark.jars", sqlite_jdbc_jar)
    .config("spark.driver.extraClassPath", sqlite_jdbc_jar)
    .getOrCreate()
)

22/10/22 00:04:37 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
22/10/22 00:04:37 INFO SharedState: Warehouse path is 'file:/Users/development/ml/Spark/spark-warehouse'.
22/10/22 00:04:38 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:
# Step 1: Ingestion
# -----------------
#
# Load authors.csv into a dataframe: the header line supplies the column
# names and Spark is asked to infer the column types.
df = spark.read.csv(absolute_file_path, header=True, inferSchema=True)

22/10/22 00:04:38 INFO InMemoryFileIndex: It took 31 ms to list leaf files for 1 paths.
22/10/22 00:04:38 INFO InMemoryFileIndex: It took 1 ms to list leaf files for 1 paths.
22/10/22 00:04:41 INFO FileSourceStrategy: Pushed Filters: 
22/10/22 00:04:41 INFO FileSourceStrategy: Post-Scan Filters: (length(trim(value#0, None)) > 0)
22/10/22 00:04:41 INFO FileSourceStrategy: Output Data Schema: struct<value: string>
22/10/22 00:04:41 INFO CodeGenerator: Code generated in 210.059255 ms
22/10/22 00:04:42 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 353.3 KiB, free 366.0 MiB)
22/10/22 00:04:42 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 34.2 KiB, free 365.9 MiB)
22/10/22 00:04:42 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on 192.168.1.41:56749 (size: 34.2 KiB, free: 366.3 MiB)
22/10/22 00:04:42 INFO SparkContext: Created broadcast 0 from csv at NativeMethodAccessorImpl.java:0
22/10/22 00:04:42 INFO FileS

In [None]:
# Print a preview of the freshly ingested dataframe.
df.show()

22/10/22 00:04:43 INFO FileSourceStrategy: Pushed Filters: 
22/10/22 00:04:43 INFO FileSourceStrategy: Post-Scan Filters: 
22/10/22 00:04:43 INFO FileSourceStrategy: Output Data Schema: struct<lname: string, fname: string>
22/10/22 00:04:43 INFO MemoryStore: Block broadcast_4 stored as values in memory (estimated size 353.1 KiB, free 365.2 MiB)
22/10/22 00:04:43 INFO MemoryStore: Block broadcast_4_piece0 stored as bytes in memory (estimated size 34.1 KiB, free 365.1 MiB)
22/10/22 00:04:43 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on 192.168.1.41:56749 (size: 34.1 KiB, free: 366.2 MiB)
22/10/22 00:04:43 INFO SparkContext: Created broadcast 4 from showString at NativeMethodAccessorImpl.java:0
22/10/22 00:04:43 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4194304 bytes, open cost is considered as scanning 4194304 bytes.
22/10/22 00:04:43 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
22/10/22 00:04:43 INFO DAGScheduler:

In [None]:
# Step 2: Transform
# -----------------
# Add a "name" column built from three pieces: the lname column, the literal
# string ", ", and the fname column.
full_name = F.concat(F.col("lname"), F.lit(", "), F.col("fname"))
df = df.withColumn("name", full_name)

In [None]:
# Print the dataframe again to check the added "name" column.
df.show()

22/10/22 00:04:43 INFO FileSourceStrategy: Pushed Filters: 
22/10/22 00:04:43 INFO FileSourceStrategy: Post-Scan Filters: 
22/10/22 00:04:43 INFO FileSourceStrategy: Output Data Schema: struct<lname: string, fname: string>
22/10/22 00:04:43 INFO CodeGenerator: Code generated in 16.18374 ms
22/10/22 00:04:43 INFO MemoryStore: Block broadcast_6 stored as values in memory (estimated size 353.1 KiB, free 364.8 MiB)
22/10/22 00:04:43 INFO MemoryStore: Block broadcast_6_piece0 stored as bytes in memory (estimated size 34.1 KiB, free 364.7 MiB)
22/10/22 00:04:43 INFO BlockManagerInfo: Added broadcast_6_piece0 in memory on 192.168.1.41:56749 (size: 34.1 KiB, free: 366.1 MiB)
22/10/22 00:04:43 INFO SparkContext: Created broadcast 6 from showString at NativeMethodAccessorImpl.java:0
22/10/22 00:04:43 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4194304 bytes, open cost is considered as scanning 4194304 bytes.
22/10/22 00:04:43 INFO SparkContext: Starting job: showString at 

In [None]:
# Step 3: Save
# ------------
#
# JDBC connection URL for the SQLite database file spark_labs.db stored in the
# chapter's data directory. SQLite is file-based, so there is no host or port;
# the path is built from the current working directory rather than hard-coded,
# so the notebook runs from any checkout location.
dbConnectionUrl = "jdbc:sqlite:{}/net.jgp.books.spark.ch02/data/spark_labs.db".format(os.getcwd())

In [None]:
# Connection properties passed to the JDBC writer; "driver" names the JDBC
# driver class to load (here the SQLite one).
# NOTE(review): user/password look like leftovers from a server-database
# example - confirm whether SQLite needs them, and avoid committing real
# credentials to the notebook.
prop = dict(driver="org.sqlite.JDBC", user="jgp", password="Spark<3Java")

In [None]:
# Save the dataframe to the table "ch02" over JDBC; "overwrite" mode replaces
# whatever the table held before.
writer = df.write.mode("overwrite")
writer.jdbc(url=dbConnectionUrl, table="ch02", properties=prop)


22/10/22 00:04:44 INFO FileSourceStrategy: Pushed Filters: 
22/10/22 00:04:44 INFO FileSourceStrategy: Post-Scan Filters: 
22/10/22 00:04:44 INFO FileSourceStrategy: Output Data Schema: struct<lname: string, fname: string>
22/10/22 00:04:44 INFO MemoryStore: Block broadcast_8 stored as values in memory (estimated size 353.1 KiB, free 364.4 MiB)
22/10/22 00:04:44 INFO MemoryStore: Block broadcast_8_piece0 stored as bytes in memory (estimated size 34.1 KiB, free 364.3 MiB)
22/10/22 00:04:44 INFO BlockManagerInfo: Added broadcast_8_piece0 in memory on 192.168.1.41:56749 (size: 34.1 KiB, free: 366.1 MiB)
22/10/22 00:04:44 INFO SparkContext: Created broadcast 8 from jdbc at NativeMethodAccessorImpl.java:0
22/10/22 00:04:44 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4194304 bytes, open cost is considered as scanning 4194304 bytes.
22/10/22 00:04:44 INFO SparkContext: Starting job: jdbc at NativeMethodAccessorImpl.java:0
22/10/22 00:04:44 INFO DAGScheduler: Got job 4 (

In [None]:
# Stop the SparkSession once the work is done so its resources (web UI,
# block manager, SparkContext) are released.
spark.stop()

22/10/22 00:04:44 INFO SparkUI: Stopped Spark web UI at http://192.168.1.41:4040
22/10/22 00:04:44 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
22/10/22 00:04:44 INFO MemoryStore: MemoryStore cleared
22/10/22 00:04:44 INFO BlockManager: BlockManager stopped
22/10/22 00:04:44 INFO BlockManagerMaster: BlockManagerMaster stopped
22/10/22 00:04:44 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
22/10/22 00:04:44 INFO SparkContext: Successfully stopped SparkContext


## Alternative method (following the Spark documentation)

In [None]:
import sqlite3

# Create example.db (in the working directory) with a one-row "stocks" table
# that the following cells read and append to through Spark.
con = sqlite3.connect('example.db')
try:
    cur = con.cursor()
    # Start from a clean table: a plain CREATE TABLE raises
    # "table stocks already exists" whenever this cell is run a second time.
    cur.execute("DROP TABLE IF EXISTS stocks")
    # Create table
    cur.execute(
        '''CREATE TABLE stocks
           (date text, trans text, symbol text, qty real, price real)''')
    # Insert a row of data
    cur.execute("INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14)")
    # Save (commit) the changes
    con.commit()
finally:
    # Always release the database file, even if a statement above failed.
    con.close()

In [None]:
import os

from pyspark.sql import SparkSession

# Absolute location of the SQLite JDBC driver jar. The string has no "{}"
# placeholder, so it is used as-is (no str.format call is needed).
# NOTE: install-specific path; adjust it to where the jar lives on your machine.
SQLITE_JDBC_JAR = "/opt/apache-spark/jars/sqlite-jdbc-3.36.0.3.jar"

# Build the session for the SQLite JDBC example; the driver jar is registered
# both as a Spark jar and on the driver classpath.
spark = (SparkSession.builder
    .master("local")
    .appName("SQLite JDBC")
    .config("spark.jars", SQLITE_JDBC_JAR)
    .config("spark.driver.extraClassPath", SQLITE_JDBC_JAR)
    .getOrCreate())

22/10/22 00:04:44 INFO SparkContext: Running Spark version 3.3.0
22/10/22 00:04:44 INFO ResourceUtils: No custom resources configured for spark.driver.
22/10/22 00:04:44 INFO SparkContext: Submitted application: SQLite JDBC
22/10/22 00:04:44 INFO ResourceProfile: Default ResourceProfile created, executor resources: Map(cores -> name: cores, amount: 1, script: , vendor: , memory -> name: memory, amount: 1024, script: , vendor: , offHeap -> name: offHeap, amount: 0, script: , vendor: ), task resources: Map(cpus -> name: cpus, amount: 1.0)
22/10/22 00:04:44 INFO ResourceProfile: Limiting resource is cpu
22/10/22 00:04:44 INFO ResourceProfileManager: Added ResourceProfile id: 0
22/10/22 00:04:44 INFO SecurityManager: Changing view acls to: toni
22/10/22 00:04:44 INFO SecurityManager: Changing modify acls to: toni
22/10/22 00:04:44 INFO SecurityManager: Changing view acls groups to: 
22/10/22 00:04:44 INFO SecurityManager: Changing modify acls groups to: 
22/10/22 00:04:44 INFO SecurityMana

In [None]:
import pyspark.pandas as ps

# Read the "stocks" table from example.db (in the working directory) over
# JDBC into a pandas-on-Spark DataFrame, then display it.
stocks_db_url = "jdbc:sqlite:{}/example.db".format(os.getcwd())
df = ps.read_sql("stocks", con=stocks_db_url)
df



22/10/22 00:04:45 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
22/10/22 00:04:45 INFO SharedState: Warehouse path is 'file:/Users/development/ml/Spark/spark-warehouse'.
22/10/22 00:04:46 INFO CodeGenerator: Code generated in 13.558244 ms
22/10/22 00:04:46 INFO SparkContext: Starting job: __repr__ at /Users/development/mambaforge/envs/fastai/lib/python3.9/site-packages/IPython/lib/pretty.py:778
22/10/22 00:04:46 INFO DAGScheduler: Got job 0 (__repr__ at /Users/development/mambaforge/envs/fastai/lib/python3.9/site-packages/IPython/lib/pretty.py:778) with 1 output partitions
22/10/22 00:04:46 INFO DAGScheduler: Final stage: ResultStage 0 (__repr__ at /Users/development/mambaforge/envs/fastai/lib/python3.9/site-packages/IPython/lib/pretty.py:778)
22/10/22 00:04:46 INFO DAGScheduler: Parents of final stage: List()
22/10/22 00:04:46 INFO DAGScheduler: Missing parents: List()
22/10/22 00:04:46 INFO DAGScheduler: Submitting ResultStag

Unnamed: 0,date,trans,symbol,qty,price
0,2006-01-05,BUY,RHAT,100.0,35.14


In [None]:
# Raise every price by 1, append the modified rows to the "stocks" table, and
# read the table back to display the result.
# Note: mode="append" adds rows on every run, so re-running this cell grows
# the table each time.
stocks_db_url = "jdbc:sqlite:{}/example.db".format(os.getcwd())
df.price += 1
df.spark.to_spark_io(
    format="jdbc",
    mode="append",
    dbtable="stocks",
    url=stocks_db_url,
)
ps.read_sql("stocks", con=stocks_db_url)

22/10/22 00:04:46 INFO CodeGenerator: Code generated in 14.458876 ms
22/10/22 00:04:46 INFO SparkContext: Starting job: save at NativeMethodAccessorImpl.java:0
22/10/22 00:04:46 INFO DAGScheduler: Got job 1 (save at NativeMethodAccessorImpl.java:0) with 1 output partitions
22/10/22 00:04:46 INFO DAGScheduler: Final stage: ResultStage 1 (save at NativeMethodAccessorImpl.java:0)
22/10/22 00:04:46 INFO DAGScheduler: Parents of final stage: List()
22/10/22 00:04:46 INFO DAGScheduler: Missing parents: List()
22/10/22 00:04:46 INFO DAGScheduler: Submitting ResultStage 1 (MapPartitionsRDD[11] at save at NativeMethodAccessorImpl.java:0), which has no missing parents
22/10/22 00:04:46 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 27.5 KiB, free 366.3 MiB)
22/10/22 00:04:46 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 12.8 KiB, free 366.2 MiB)
22/10/22 00:04:46 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on 19

Unnamed: 0,date,trans,symbol,qty,price
0,2006-01-05,BUY,RHAT,100.0,35.14
1,2006-01-05,BUY,RHAT,100.0,36.14


In [None]:
# Stop the SparkSession used for the SQLite JDBC example.
spark.stop()

22/10/22 00:04:46 INFO SparkUI: Stopped Spark web UI at http://192.168.1.41:4040
22/10/22 00:04:46 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
22/10/22 00:04:46 INFO MemoryStore: MemoryStore cleared
22/10/22 00:04:46 INFO BlockManager: BlockManager stopped
22/10/22 00:04:46 INFO BlockManagerMaster: BlockManagerMaster stopped
22/10/22 00:04:46 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
22/10/22 00:04:46 INFO SparkContext: Successfully stopped SparkContext
