In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
# Build (or reuse) the notebook's SparkSession under the app name "spark-codes".
spark = (
    SparkSession.builder
    .appName("spark-codes")
    .getOrCreate()
)

In [3]:
# Echo the session object as the cell's result.
spark

In [4]:
# Keep a handle on the SparkContext that belongs to the session.
sc = spark.sparkContext

In [5]:
# Echo the SparkContext as the cell's result.
sc

## Quick start

In [7]:
# Single-column ("id") DataFrame holding 0..99; preview the first five rows.
df1 = spark.range(0, 100)
df1.show(n=5)

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+
only showing top 5 rows



In [20]:
# Number of rows in df1.
df1.count()

100

In [21]:
# Print df1's column names, types and nullability.
df1.printSchema()

root
 |-- id: long (nullable = false)



## SparkConf

In [14]:
# Fetch the SparkContext's configuration as a SparkConf object.
conf = sc.getConf()

In [15]:
# List every (key, value) pair held by the SparkConf object.
conf.getAll()

[('spark.driver.host', '192.168.0.114'),
 ('spark.driver.extraClassPath',
  '/home/wengong/spark/spark-3.0.1-bin-hadoop2.7/jars/sqlite-jdbc-3.27.2.1.jar'),
 ('spark.app.name', 'spark-codes'),
 ('spark.executor.id', 'driver'),
 ('spark.sql.warehouse.dir', '/tmp/hive/spark-warehouse'),
 ('spark.executor.extraClassPath',
  '/home/wengong/spark/spark-3.0.1-bin-hadoop2.7/jars/sqlite-jdbc-3.27.2.1.jar'),
 ('spark.driver.port', '46013'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.id', 'local-1622079209929'),
 ('spark.ui.showConsoleProgress', 'true')]

In [16]:
# SQL settings such as the shuffle-partition count are read from the session's
# runtime config. The SparkConf listing above has no such key, so
# conf.get(...) returned None and the cell displayed nothing.
spark.conf.get("spark.sql.shuffle.partitions")

In [18]:
# NOTE(review): `conf` was obtained from sc.getConf(); this call updates that
# SparkConf object (the read-back below returns '2g'), but it presumably does
# not change the executors of the already-running application -- confirm.
conf.set("spark.executor.memory", "2g")

<pyspark.conf.SparkConf at 0x7fea12235d30>

In [19]:
# Read the value back from the same SparkConf object.
conf.get("spark.executor.memory")

'2g'

## Create DataFrame

In [26]:
# Build a one-column integer DataFrame from a Python list and name the column "num".
nums = list(range(10))
list_df = spark.createDataFrame(nums, IntegerType()).toDF("num")
list_df.show(5)

+---+
|num|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+
only showing top 5 rows



In [27]:
from pyspark.sql import Row

In [28]:
# Employees: a Row template with a single "name" field, then two rows built from it.
Employee = Row("name")
employee1 = Employee("Bob")
employee2 = Employee("Sam")

# Departments: a Row template with "name" and "department" fields.
Department = Row("name", "department")
department1 = Department("Bob", "Accounts")
department2 = Department("Alice", "Sales")
department3 = Department("Sam", "HR")

# Turn the row lists into DataFrames.
employee_rows = [employee1, employee2]
department_rows = [department1, department2, department3]
employeeDF = spark.createDataFrame(employee_rows)
departmentDF = spark.createDataFrame(department_rows)

# Join the two DataFrames on their shared "name" column.
emp_dept_df = employeeDF.join(departmentDF, on="name")

In [29]:
# Display the employee rows.
employeeDF.show()

+----+
|name|
+----+
| Bob|
| Sam|
+----+



In [30]:
# Display the department rows.
departmentDF.show()

+-----+----------+
| name|department|
+-----+----------+
|  Bob|  Accounts|
|Alice|     Sales|
|  Sam|        HR|
+-----+----------+



In [31]:
# Display the join result; only names present in both inputs appear.
emp_dept_df.show()

+----+----------+
|name|department|
+----+----------+
| Bob|  Accounts|
| Sam|        HR|
+----+----------+



In [32]:
# Create a DataFrame from plain tuples with an explicitly specified schema.

letter_field = StructField("letter", StringType(), True)
position_field = StructField("position", IntegerType(), True)
schema = StructType([letter_field, position_field])

letter_rows = [("A", 0), ("B", 1), ("C", 2)]
df = spark.createDataFrame(letter_rows, schema)
df.show()

+------+--------+
|letter|position|
+------+--------+
|     A|       0|
|     B|       1|
|     C|       2|
+------+--------+



In [33]:
# Example data: departments, employees, and records that nest one inside the other.

# Departments, identified by a string id.
Department = Row("id", "name")
department1, department2, department3, department4, department5 = (
    Department(*fields)
    for fields in [
        ("123456", "Computer Science"),
        ("789012", "Mechanical Engineering"),
        ("345678", "Theater and Drama"),
        ("901234", "Indoor Recreation"),
        ("000000", "All Students"),
    ]
)

# Employees; None stands in for a missing first or last name.
Employee = Row("firstName", "lastName", "email", "salary")
employee1, employee2, employee3, employee4, employee5 = (
    Employee(*fields)
    for fields in [
        ("michael", "armbrust", "no-reply@berkeley.edu", 100000),
        ("xiangrui", "meng", "no-reply@stanford.edu", 120000),
        ("matei", None, "no-reply@waterloo.edu", 140000),
        (None, "wendell", "no-reply@berkeley.edu", 160000),
        ("michael", "jackson", "no-reply@neverla.nd", 80000),
    ]
)

# Each record pairs one department Row with a list of employee Rows.
DepartmentWithEmployees = Row("department", "employees")
departmentWithEmployees1 = DepartmentWithEmployees(
    department1, [employee1, employee2]
)
departmentWithEmployees2 = DepartmentWithEmployees(
    department2, [employee3, employee4]
)
departmentWithEmployees3 = DepartmentWithEmployees(
    department3, [employee5, employee4]
)
departmentWithEmployees4 = DepartmentWithEmployees(
    department4, [employee2, employee3]
)
departmentWithEmployees5 = DepartmentWithEmployees(
    department5, [employee1, employee2, employee3, employee4, employee5]
)


In [34]:
# Show a department Row, an employee Row, and a field reached through the nested structure.
print(
    department1,
    employee2,
    departmentWithEmployees1.employees[0].email,
    sep="\n",
)

Row(id='123456', name='Computer Science')
Row(firstName='xiangrui', lastName='meng', email='no-reply@stanford.edu', salary=120000)
no-reply@berkeley.edu


In [36]:
# Collect the nested records and build a DataFrame from them.
departmentsWithEmployeesSeq1 = [
    departmentWithEmployees1,
    departmentWithEmployees2,
    departmentWithEmployees3,
    departmentWithEmployees4,
    departmentWithEmployees5,
]
# NOTE: this rebinds df1, which earlier in the notebook held spark.range(100).
df1 = spark.createDataFrame(departmentsWithEmployeesSeq1)
# One field per line (vertical) and no truncation, so the nested values stay readable.
df1.show(truncate=False, vertical=True)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 department | [123456, Computer Science]                                                                                                                                                                                                                 
 employees  | [[michael, armbrust, no-reply@berkeley.edu, 100000], [xiangrui, meng, no-reply@stanford.edu, 120000]]                                                                                                                                      
-RECORD 1------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


## Write DataFrame

In [38]:
emp_dept_df.show()
# The target "emp_dept.json" ends up as a directory of part files (see the listing below).
(
    emp_dept_df.write
    .format("json")
    .mode("overwrite")
    .save("emp_dept.json")
)

+----+----------+
|name|department|
+----+----------+
| Bob|  Accounts|
| Sam|        HR|
+----+----------+



In [39]:
# List what the write produced inside the emp_dept.json directory.
!ls emp_dept.json

part-00000-1f64fa43-dab8-46ba-b11e-37c3d3b95f53-c000.json
part-00093-1f64fa43-dab8-46ba-b11e-37c3d3b95f53-c000.json
part-00109-1f64fa43-dab8-46ba-b11e-37c3d3b95f53-c000.json
_SUCCESS


In [41]:
# Print one of the part files.
# NOTE(review): the hard-coded file name embeds an id that differs between the
# writes shown in this notebook, so it presumably changes on every re-run --
# update the name (or use a glob) after re-executing the write cell.
!cat emp_dept.json/part-00093-1f64fa43-dab8-46ba-b11e-37c3d3b95f53-c000.json

{"name":"Bob","department":"Accounts"}


In [None]:
# Collapse to a single partition first so the output directory holds one part file.
single_part_df = emp_dept_df.coalesce(1)
single_part_df.write.format("json").mode("overwrite").save("emp_dept1")

In [44]:
# List the output directory of the coalesced write.
!ls emp_dept1

part-00000-5d1deace-f5d2-4b86-a26d-a5efe03b7cc2-c000.json  _SUCCESS


In [46]:
!cat emp_dept1/part-00000-5d1deace-f5d2-4b86-a26d-a5efe03b7cc2-c000.json

{"name":"Bob","department":"Accounts"}
{"name":"Sam","department":"HR"}


In [49]:
# No format is named, so the writer uses its default (parquet, per the listing below).
emp_dept_df.coalesce(1).write.mode("overwrite").save("emp_dept2")

In [50]:
# List the output directory of the default-format write.
!ls emp_dept2

part-00000-984b261c-5972-4b91-9563-5d3eb084f068-c000.snappy.parquet  _SUCCESS


## Read DataFrame

In [47]:
# Load the JSON output written to emp_dept1; no schema is supplied.
df2 = spark.read.format("json").load("emp_dept1")

In [48]:
# Display the rows read back from JSON; note the column order differs from emp_dept_df.
df2.show()

+----------+----+
|department|name|
+----------+----+
|  Accounts| Bob|
|        HR| Sam|
+----------+----+



In [51]:
# load() without an explicit format; here it reads the parquet output in emp_dept2.
df3 = spark.read.load("emp_dept2")
df3.show()

+----+----------+
|name|department|
+----+----------+
| Bob|  Accounts|
| Sam|        HR|
+----+----------+



In [74]:
# List the working directory to see which files are available to read.
!ls

apache-spark-resources.xlsx	      person.json
emp_dept1			      pyspark-cheat-sheet.ipynb
emp_dept2			      pyspark-tmp.ipynb
emp_dept.json			      spark-cert.README.md
LearningApacheSpark-CheatSheet.ipynb  spark-images
person-csv			      spark-in-codes.ipynb
person.csv			      spark-in-pictures.ipynb


In [79]:
# Read a text file (one row per line) and name the single column "text".
txt_df = spark.read.text("spark-cert.README.md").toDF("text")

In [80]:
# Number of rows (lines) read from the text file.
txt_df.count()

82

In [81]:
# take(5) fetches just the first five rows, instead of collecting every row
# to the driver and slicing the resulting list.
txt_df.take(5)

[Row(text='2021 Goal - Learn Graph and Spark'),
 Row(text=''),
 Row(text='# Graph '),
 Row(text=''),
 Row(text='## Graph Theory YouTube by William Fiset')]

In [91]:
# Tag each line with an id, put the id column first, and keep ten rows.
# NOTE(review): monotonically_increasing_id() is documented as increasing and
# unique, not necessarily consecutive -- the 0, 1, 2 shown here should not be
# relied on as line numbers in general; confirm if exact numbering matters.
csv_df = (
    txt_df
    .withColumn("line_id", F.monotonically_increasing_id())
    .select("line_id", "text")
    .limit(10)
)
csv_df.show(3, truncate=False)

+-------+---------------------------------+
|line_id|text                             |
+-------+---------------------------------+
|0      |2021 Goal - Learn Graph and Spark|
|1      |                                 |
|2      |# Graph                          |
+-------+---------------------------------+
only showing top 3 rows



In [92]:
# Write one tab-separated file with a header row into the "csv" directory.
csv_writer = csv_df.coalesce(1).write.mode("overwrite")
csv_writer.options(header="true", delimiter="\t").csv("csv")

In [105]:
x = !ls csv
x

['part-00000-7f0d72bb-3a1a-407e-b3b1-0808c309512d-c000.csv', '_SUCCESS']

In [106]:
# Print the raw contents of the first listed file in the csv/ output directory.
part_path = f"csv/{x[0]}"
with open(part_path) as part_file:
    print(part_file.read())

line_id	text
0	2021 Goal - Learn Graph and Spark
1	""
2	# Graph
3	""
4	## Graph Theory YouTube by William Fiset
5	- see folder Graph-Algorithms
6	""
7	## study book - Graph Algorithms
8	- /home/wengong/spark/databrick-cert/books/Graph-Algorithms/Neo4j_Graph_Algorithms_r3.pdf
9	- /home/wengong/projects/graph/graph-algo/Neo4j_Graph_Algorithms_r3.pdf



In [107]:
# Read the tab-separated output back, using the header row and inferring column types.
csv_reader = spark.read.options(header="true", delimiter="\t", inferSchema="true")
csv_df2 = csv_reader.csv("csv")

In [108]:
# Display the round-tripped rows; the empty lines written as "" come back as null.
csv_df2.show(4, truncate=False)

+-------+---------------------------------+
|line_id|text                             |
+-------+---------------------------------+
|0      |2021 Goal - Learn Graph and Spark|
|1      |null                             |
|2      |# Graph                          |
|3      |null                             |
+-------+---------------------------------+
only showing top 4 rows



## UDF
### DataFrame

In [52]:
def square(s):
    """Return ``s`` multiplied by itself.

    Args:
        s: A number, or None. None is accepted because this function is
            wrapped as a Spark UDF below, where a null column value is
            passed in as None.

    Returns:
        ``s * s``, or None when ``s`` is None.
    """
    # Propagate nulls instead of raising TypeError on ``None * None``.
    if s is None:
        return None
    return s * s

In [53]:
# Sanity-check the plain Python function before wrapping it as a UDF.
square(10)

100

In [54]:
# Wrap the Python function as a DataFrame UDF that returns a long.
squaredWithPython = F.udf(f=square, returnType=LongType())

In [55]:
# Values 1, 4, 7 in a column named "num" (this rebinds the earlier `df`).
df = spark.range(start=1, end=10, step=3).toDF("num")
df.show()

+---+
|num|
+---+
|  1|
|  4|
|  7|
+---+



In [57]:
# Apply the UDF to "num" and show the input beside the result.
squared_col = squaredWithPython("num").alias("num*num")
df.select("num", squared_col).show()

+---+-------+
|num|num*num|
+---+-------+
|  1|      1|
|  4|     16|
|  7|     49|
+---+-------+



### Register to SQL

In [58]:
# Expose df to SQL under the temporary view name "num_tab".
df.createOrReplaceTempView("num_tab")

In [60]:
# Query the temporary view through SQL.
spark.sql("select * from num_tab").show()

+---+
|num|
+---+
|  1|
|  4|
|  7|
+---+



In [61]:
# Register the Python function under the SQL name "squaredWithPython" with a long return type.
spark.udf.register("squaredWithPython", square, LongType())

<function __main__.square(s)>

In [72]:
# Call the registered UDF from SQL.
squared_query = "select num, squaredWithPython(num) as num_sq from num_tab"
spark.sql(squared_query).show()

+---+------+
|num|num_sq|
+---+------+
|  1|     1|
|  4|    16|
|  7|    49|
+---+------+



In [73]:
# Get the temporary view back as a DataFrame by name.
tab_df = spark.table("num_tab")
tab_df.show()

+---+
|num|
+---+
|  1|
|  4|
|  7|
+---+



### Resources

- [7 steps to Spark](https://pages.databricks.com/rs/094-YMS-629/images/7-steps-for-a-developer-to-learn-apache-spark.pdf?mkt_tok=eyJpIjoiWWprMFpHVXlNV0ppWkRsaiIsInQiOiJ1cVBXVm1hbDE5VFlVbWdyMENveDRSYWJ1WHozUDFEVDBCanFzVzZMWTdkSUxVZlp2RElkQWlWaFZjeEE2ZDU3bU5kMk0wbUVSbVFOVG43K2RSN0JPcUVxbm52UzAwMFRWSStoWXViZGpXTmFcL2R3SGpEUnFQSEU4OThBK3V5ZnAifQ%3D%3D)
- [Spark Certification Study Guide - Part 1 (Core)](https://www.rakirahman.me/spark-certification-study-guide-part-1/)
- [Spark Certification Study Guide - Part 2 (Code)](https://www.rakirahman.me/spark-certification-study-guide-part-2)
- [Spark in pictures](https://github.com/wgong/py4kids/blob/master/lesson-17-pyspark/databrick/spark-in-pictures.ipynb)
- [Spark study notes](https://docs.google.com/spreadsheets/d/1AuvsTCTxzx1wBMkG-rvfwNpuoQoG9ISuXqQZwj-GGgQ/edit#gid=257341626)