# Spark in Action - Chapter 2 Python Version

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import os

In [None]:
# Location of the input CSV, expressed relative to the working directory.
relative_path = "../net.jgp.books.spark.ch02/data/authors.csv"

# The notebook resolves paths from the current working directory; in a plain
# script os.path.dirname(__file__) would be the equivalent anchor.
current_dir = os.getcwd()

# Full path handed to the Spark CSV reader (the ".." segment is left as-is).
absolute_file_path = os.path.join(current_dir, relative_path)

In [None]:
# Display the resolved path to check that it points at authors.csv
absolute_file_path

'/Users/development/ml/Spark/chapter2/../net.jgp.books.spark.ch02/data/authors.csv'

In [None]:
# Builds (or reuses) a SparkSession named "CSV to DB" on a local master.
# The SQLite JDBC driver jar is looked up under ./jars relative to the current
# working directory and is passed both as a Spark jar and on the driver
# class path.
spark = (
    SparkSession.builder
    .appName("CSV to DB")
    .master("local")
    .config("spark.jars", "{}/jars/sqlite-jdbc-3.36.0.3.jar".format(os.getcwd()))
    .config("spark.driver.extraClassPath", "{}/jars/sqlite-jdbc-3.36.0.3.jar".format(os.getcwd()))
    .getOrCreate()
)

22/10/29 17:17:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:
# Step 1: Ingestion
# -----------------
#
# Load authors.csv into a dataframe: the first line of the file is used as the
# column header and Spark is asked to infer the column types.
df = spark.read.options(header=True, inferSchema=True).csv(absolute_file_path)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [None]:
# Print the ingested dataframe as a text table to check the columns read from the CSV
df.show()

+--------+--------------+
|   lname|         fname|
+--------+--------------+
|  Pascal|        Blaise|
|Voltaire|      François|
|  Perrin|  Jean-Georges|
|Maréchal|Pierre Sylvain|
|   Karau|        Holden|
| Zaharia|         Matei|
+--------+--------------+



In [None]:
# Step 2: Transform
# -----------------
# Adds a "name" column built as "<lname>, <fname>": the lname column, a literal
# ", " separator, and the fname column are concatenated in that order.
df = df.withColumn(
    "name",
    F.concat(
        F.col("lname"),
        F.lit(", "),
        F.col("fname"),
    ),
)

In [None]:
# Print the dataframe again to check the new "name" column
df.show()

+--------+--------------+--------------------+
|   lname|         fname|                name|
+--------+--------------+--------------------+
|  Pascal|        Blaise|      Pascal, Blaise|
|Voltaire|      François|  Voltaire, François|
|  Perrin|  Jean-Georges|Perrin, Jean-Georges|
|Maréchal|Pierre Sylvain|Maréchal, Pierre ...|
|   Karau|        Holden|       Karau, Holden|
| Zaharia|         Matei|      Zaharia, Matei|
+--------+--------------+--------------------+



In [None]:
# Step 3: Save
# ----
#
# The JDBC connection URL of the SQLite database file "spark_labs.db". SQLite
# is file-based, so the URL is just "jdbc:sqlite:" followed by the file path.
# The file sits in the same data directory as authors.csv and, like the CSV
# path, is resolved from the current working directory rather than from a
# machine-specific absolute path. normpath collapses the ".." segment so the
# driver receives a clean path.
dbConnectionUrl = "jdbc:sqlite:" + os.path.normpath(
    os.path.join(os.getcwd(), "../net.jgp.books.spark.ch02/data/spark_labs.db"))

In [None]:
# Connection properties handed to the JDBC writer. "driver" names the SQLite
# JDBC driver class.
# NOTE(review): SQLite is a file-based database; the user/password entries look
# like leftovers from the PostgreSQL version of this lab - confirm whether they
# are needed.
prop = dict(
    driver="org.sqlite.JDBC",
    user="jgp",
    password="Spark<3Java",
)

In [None]:
# Save the dataframe into the "ch02" table over JDBC, overwriting any
# previous content of that table.
df.write.jdbc(
    url=dbConnectionUrl,
    table="ch02",
    mode="overwrite",
    properties=prop,
)


22/10/29 17:17:47 WARN JdbcUtils: Requested isolation level 1 is not supported; falling back to default isolation level 8


In [None]:
# Good to stop SparkSession at the end of the application.
# The CSV-to-database part is finished here; the next section builds its own
# session again.
spark.stop()

## Alternative method, following the Spark manual

In [None]:
import sqlite3

# Prepare a small SQLite database (example.db in the working directory) that
# holds a "stocks" table with exactly one row.
con = sqlite3.connect('example.db')
try:
    # Used as a context manager the connection commits on success and rolls
    # back on an exception; it does NOT close the connection, hence the
    # surrounding try/finally.
    with con:
        cur = con.cursor()
        # Remove a table left over from a previous run: a plain CREATE TABLE
        # would fail with "table stocks already exists" when the cell is
        # re-executed, and starting fresh keeps the row count reproducible.
        cur.execute("DROP TABLE IF EXISTS stocks")
        # Create table
        cur.execute(
            "CREATE TABLE stocks "
            "(date text, trans text, symbol text, qty real, price real)")
        # Insert a row of data
        cur.execute("INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14)")
finally:
    # Always release the database file, even if one of the statements failed.
    con.close()

In [None]:
import os

from pyspark.sql import SparkSession

# Location of the SQLite JDBC driver jar. This is an absolute literal with no
# "{}" placeholder, so it needs no .format() call.
# NOTE(review): the first part of this notebook loads the same jar from
# ./jars under the working directory - confirm which location is intended.
SQLITE_JDBC_JAR = "/opt/apache-spark/jars/sqlite-jdbc-3.36.0.3.jar"

# Build (or reuse) a local SparkSession named "SQLite JDBC" with the driver jar
# passed both as a Spark jar and on the driver class path.
spark = (SparkSession.builder
    .master("local")
    .appName("SQLite JDBC")
    .config("spark.jars", SQLITE_JDBC_JAR)
    .config("spark.driver.extraClassPath", SQLITE_JDBC_JAR)
    .getOrCreate())

In [None]:
import pyspark.pandas as ps

# Load the "stocks" table from example.db (in the working directory) through
# the pandas API on Spark, using a JDBC connection string.
# Note: this rebinds the name `df`, which earlier held the authors dataframe.
df = ps.read_sql("stocks", con=f"jdbc:sqlite:{os.getcwd()}/example.db")

# Display the loaded frame
df



Unnamed: 0,date,trans,symbol,qty,price
0,2006-01-05,BUY,RHAT,100.0,35.14


In [None]:
# Raise every price by 1.
# Note: this cell is not idempotent - each execution increments the prices
# again and appends another copy of the rows to the table.
df.price += 1

# Append the modified rows to the "stocks" table through the JDBC data source.
df.spark.to_spark_io(
    format="jdbc",
    mode="append",
    dbtable="stocks",
    url=f"jdbc:sqlite:{os.getcwd()}/example.db",
)

# Read the table back to display the original and the appended rows.
ps.read_sql("stocks", con=f"jdbc:sqlite:{os.getcwd()}/example.db")

22/10/29 17:17:50 WARN JdbcUtils: Requested isolation level 1 is not supported; falling back to default isolation level 8


Unnamed: 0,date,trans,symbol,qty,price
0,2006-01-05,BUY,RHAT,100.0,35.14
1,2006-01-05,BUY,RHAT,100.0,36.14


In [None]:
# Stop the SparkSession created for the SQLite JDBC example
spark.stop()