In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# conf = SparkConf().setAppName("NewApp").setMaster("local")
# sc= SparkContext(conf = conf)

In [3]:
#spark = SparkSession.builder.getOrCreate()
spark = SparkSession.builder.appName("NewApp").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/22 08:24:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/09/22 08:24:50 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
sc=spark.sparkContext

In [5]:
purdf = spark.read.format("csv").option("inferSchema",True).option("header",True).load("/home/labuser/Downloads/purchases.csv")

                                                                                

In [6]:
purdf.show()

+------+------+-------+
|  Name|apples|oranges|
+------+------+-------+
|  June|     3|      0|
|Robert|     2|      3|
|  Lily|     0|      7|
| David|     1|      2|
+------+------+-------+



In [7]:
purdf = purdf.withColumnRenamed("_c0", "Name")
purdf.show()

+------+------+-------+
|  Name|apples|oranges|
+------+------+-------+
|  June|     3|      0|
|Robert|     2|      3|
|  Lily|     0|      7|
| David|     1|      2|
+------+------+-------+



# Spark Partitions

## Write Operation

In [8]:
#getting the number of partitions fpr purdf dataframe
purdf.rdd.getNumPartitions()

1

In [9]:
#setting the number of partitions to 6 for the dataframe purdf
purdf = purdf.repartition(6)

In [10]:
purdf.write.format("csv").mode("overwrite").save("/home/labuser/Downloads/writeoperation/purch.csv")



As there are only 4 records in the dataframe shown above, a higher number of partitions will still result in a total of 4 partitions as there will be one record in each partition

## Spark UI

used to get a user friendly detail of the entire backend process when the Spark Engine runs

In [11]:
# to run the link directly from the python code
 
import webbrowser

url = sc.uiWebUrl

webbrowser.open_new_tab(url)

True

## Spark SQL

SQL can be used with Dataframes for all kind of transformations (select, filter, groupBy etc.)
DSL can also be used with DataFrames for all kinds of transformations (select, filter, groupBy etc.)
For RDDs, only lambda operations are used

In [12]:
#select operation
purdf.createOrReplaceTempView("pur")

df1 = spark.sql("select * from pur").show()

+------+------+-------+
|  Name|apples|oranges|
+------+------+-------+
|Robert|     2|      3|
| David|     1|      2|
|  June|     3|      0|
|  Lily|     0|      7|
+------+------+-------+



In [13]:
#filter operation
df1 = spark.sql("select * from pur where apples>0").show()

+------+------+-------+
|  Name|apples|oranges|
+------+------+-------+
| David|     1|      2|
|  June|     3|      0|
|Robert|     2|      3|
+------+------+-------+



In [14]:
#

## Spark DSL

In [15]:
#select operation
purdf.select('*').show()

+------+------+-------+
|  Name|apples|oranges|
+------+------+-------+
|Robert|     2|      3|
| David|     1|      2|
|  June|     3|      0|
|  Lily|     0|      7|
+------+------+-------+



In [16]:
#filter operation
purdf.filter(col("apples")>0).show()

+------+------+-------+
|  Name|apples|oranges|
+------+------+-------+
| David|     1|      2|
|  June|     3|      0|
|Robert|     2|      3|
+------+------+-------+



In [17]:
#select except few





In [18]:
#sort operation
purdf.sort(col("apples").desc()).show()

+------+------+-------+
|  Name|apples|oranges|
+------+------+-------+
|  June|     3|      0|
|Robert|     2|      3|
| David|     1|      2|
|  Lily|     0|      7|
+------+------+-------+



In [19]:
#select multiple
purdf.select(col("apples"), col("Name")).show()

+------+------+
|apples|  Name|
+------+------+
|     2|Robert|
|     0|  Lily|
|     3|  June|
|     1| David|
+------+------+



## When and Otherwise in PySpark

There are two ways to provide conditional statements and get the output

-> First one is using Expressions: utilises the SQL code to achieve the ouput

-> Second one is using DSL when and otherwise concept.

-> DSL is more efficient than SQL and therefore is instead used.


In [20]:
purdf.withColumn("New", expr("CASE WHEN apples>=2 and oranges>=2 THEN 'Big Purchase' ELSE 'Small Purchase' END")).show()

+------+------+-------+--------------+
|  Name|apples|oranges|           New|
+------+------+-------+--------------+
|Robert|     2|      3|  Big Purchase|
| David|     1|      2|Small Purchase|
|  June|     3|      0|Small Purchase|
|  Lily|     0|      7|Small Purchase|
+------+------+-------+--------------+



In [21]:
purdf.select("apples", when(col("apples")>=2, "Big Purchase").otherwise("Small Purchase").alias("Purchase")).show()

+------+--------------+
|apples|      Purchase|
+------+--------------+
|     1|Small Purchase|
|     3|  Big Purchase|
|     0|Small Purchase|
|     2|  Big Purchase|
+------+--------------+



## User-defined Functions in PySpark

In [22]:
def concatShell(purCol):
    return purCol + "_Shell"

In [23]:
concatShell("test")

'test_Shell'

In [24]:
udf_shell = udf(concatShell, StringType())

In [26]:
purdf.show()

+------+------+-------+
|  Name|apples|oranges|
+------+------+-------+
|Robert|     2|      3|
| David|     1|      2|
|  June|     3|      0|
|  Lily|     0|      7|
+------+------+-------+



In [30]:
test = purdf.withColumn("new_col", udf_shell(col("Name")))
test.show()

23/09/22 08:38:44 ERROR Executor: Exception in task 0.0 in stage 40.0 (TID 75)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 683, in main
    raise RuntimeError(
RuntimeError: Python in worker has different version 3.8 than that in driver 3.11, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:561)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:514)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIte

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 683, in main
    raise RuntimeError(
RuntimeError: Python in worker has different version 3.8 than that in driver 3.11, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.


In [None]:
purdf.printSchema()

## Caching in PySpark

In [29]:
purdf.cache()

23/09/22 08:30:05 WARN CacheManager: Asked to cache already cached data.


DataFrame[Name: string, apples: int, oranges: int]