In [1]:
import numpy as np
import pandas as pd
import pyspark.pandas as ps



### Set Spark Cluster

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("v3_sparkSession").master("spark://spark-master:7077") \
        .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/06 09:18:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Create pandas series...
psdf = ps.DataFrame({
    "year" : [1990,1997,2003,2009,2014],
    "rabbit" : [20,18,489,675,1776],
    "horse" : [4,25,281,600,1900]
})

pdf = pd.DataFrame({
    "year" : [1990,1997,2003,2009,2014],
    "sheep" : [22,50,121,445,791],
    "chicken": [250,326,589,1241,2118]
    })

In [4]:
temp_ppdf = pdf

In [5]:
print("psdf : ",type(psdf))

psdf :  <class 'pyspark.pandas.frame.DataFrame'>


### Apply SQL on Pandas on SparkAPI

In [6]:
temp_df = ps.sql(" SELECT * FROM {psdf} WHERE rabbit > 100", psdf=psdf)
temp_df

                                                                                

Unnamed: 0,year,rabbit,horse
0,2003,489,281
1,2009,675,600
2,2014,1776,1900


In [16]:
#temp_df.describe()

### Spark API and PandasOnSpark API Converstion

In [7]:
spark_df = temp_df.to_spark()
print("spark_df : ",type(spark_df))
spark_df.describe()

spark_df :  <class 'pyspark.sql.dataframe.DataFrame'>




DataFrame[summary: string, year: string, rabbit: string, horse: string]

In [8]:
print(spark_df)
spark_df.show()

DataFrame[year: bigint, rabbit: bigint, horse: bigint]
+----+------+-----+
|year|rabbit|horse|
+----+------+-----+
|2003|   489|  281|
|2009|   675|  600|
|2014|  1776| 1900|
+----+------+-----+



In [9]:
spark_df.head()

Row(year=2003, rabbit=489, horse=281)

In [10]:
pandas_on_spark =spark_df.to_pandas_on_spark()
print("pandas_on_spark : ",type(pandas_on_spark))

pandas_on_spark :  <class 'pyspark.pandas.frame.DataFrame'>




### Tracking Spark execution in detail

In [11]:
# check SQL filtered dataframe
temp_df.spark.explain()

== Physical Plan ==
*(2) Project [distributed_sequence_id#24L AS __index_level_0__#23L, year#1L, rabbit#2L, horse#3L]
+- AttachDistributedSequence[distributed_sequence_id#24L, year#1L, rabbit#2L, horse#3L] Index: distributed_sequence_id#24L
   +- *(1) Project [year#1L, rabbit#2L, horse#3L]
      +- *(1) Filter (rabbit#2L > 100)
         +- *(1) Scan ExistingRDD[__index_level_0__#0L,year#1L,rabbit#2L,horse#3L]




In [12]:
from pyspark.pandas import option_context

with option_context(
    "compute.ops_on_diff_frames",False,
    "compute.default_index_type","distributed"
):
    df = ps.range(10)
    df = df + df
    df.spark.explain()

== Physical Plan ==
*(1) Project [__index_level_0__#209L, (id#207L + id#207L) AS id#221L]
+- *(1) Project [distributed_index() AS __index_level_0__#209L, id#207L]
   +- *(1) Range (0, 10, step=1, splits=4)




In [13]:
df = ps.range(10)
df

Unnamed: 0,id
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [14]:
with option_context(
    "compute.ops_on_diff_frames",False,
    "compute.default_index_type","distributed"
):
    df = ps.range(10)
    df = (df + df).spark.cache()
    df.spark.explain()


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- InMemoryTableScan [__index_level_0__#250L, id#262L]
      +- InMemoryRelation [__index_level_0__#250L, id#262L, __natural_order__#253L], StorageLevel(disk, memory, deserialized, 1 replicas)
            +- *(1) Project [__index_level_0__#250L, (id#248L + id#248L) AS id#262L, __natural_order__#253L]
               +- *(1) Project [__index_level_0__#250L, id#248L, monotonically_increasing_id() AS __natural_order__#253L]
                  +- *(1) Project [distributed_index() AS __index_level_0__#250L, id#248L]
                     +- *(1) Range (0, 10, step=1, splits=4)




In [15]:
import pyspark
print(f"PySpark version: {pyspark.__version__}")

PySpark version: 3.5.1


In [16]:
df

Unnamed: 0,id
0,0
1,2
8589934592,4
8589934593,6
8589934594,8
17179869184,10
17179869185,12
25769803776,14
25769803777,16
25769803778,18


In [17]:
with (df + df ).spark.cache() as df : 
    new_df = df+df
    df.spark.explain()
    print(new_df)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- InMemoryTableScan [__index_level_0__#250L, id#402L]
      +- InMemoryRelation [__index_level_0__#250L, id#402L, __natural_order__#253L], StorageLevel(disk, memory, deserialized, 1 replicas)
            +- AdaptiveSparkPlan isFinalPlan=false
               +- Project [__index_level_0__#250L, (id#262L + id#262L) AS id#402L, __natural_order__#253L]
                  +- InMemoryTableScan [__index_level_0__#250L, __natural_order__#253L, id#262L]
                        +- InMemoryRelation [__index_level_0__#250L, id#262L, __natural_order__#253L], StorageLevel(disk, memory, deserialized, 1 replicas)
                              +- *(1) Project [__index_level_0__#250L, (id#248L + id#248L) AS id#262L, __natural_order__#253L]
                                 +- *(1) Project [__index_level_0__#250L, id#248L, monotonically_increasing_id() AS __natural_order__#253L]
                                    +- *(1) Project [distributed_index() 

25/06/06 11:01:04 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
25/06/06 11:01:04 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:981)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce