In [1]:
import numpy as np
import pandas as pd
import pyspark.pandas as ps



### Set Spark Cluster

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("v3_sparkSession").master("spark://spark-master:7077") \
        .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/03 03:44:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/03 03:44:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/03 03:44:51 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Create pandas series...
psdf = ps.DataFrame({
    "year" : [1990,1997,2003,2009,2014],
    "rabbit" : [20,18,489,675,1776],
    "horse" : [4,25,281,600,1900]
})

pdf = pd.DataFrame({
    "year" : [1990,1997,2003,2009,2014],
    "sheep" : [22,50,121,445,791],
    "chicken": [250,326,589,1241,2118]
    })

In [20]:
temp_ppdf = pdf

In [8]:
print("psdf : ",type(psdf))

psdf :  <class 'pyspark.pandas.frame.DataFrame'>


### Apply SQL on Pandas on SparkAPI

In [9]:
temp_df = ps.sql(" SELECT * FROM {psdf} WHERE rabbit > 100", psdf=psdf)
temp_df

                                                                                

Unnamed: 0,year,rabbit,horse
0,2003,489,281
1,2009,675,600
2,2014,1776,1900


In [16]:
#temp_df.describe()

### Spark API and PandasOnSpark API Converstion

In [11]:
spark_df = temp_df.to_spark()
print("spark_df : ",type(spark_df))
spark_df.describe()

spark_df :  <class 'pyspark.sql.dataframe.DataFrame'>




DataFrame[summary: string, year: string, rabbit: string, horse: string]

In [12]:
print(spark_df)
spark_df.show()

DataFrame[year: bigint, rabbit: bigint, horse: bigint]
+----+------+-----+
|year|rabbit|horse|
+----+------+-----+
|2003|   489|  281|
|2009|   675|  600|
|2014|  1776| 1900|
+----+------+-----+



In [13]:
spark_df.head()

Row(year=2003, rabbit=489, horse=281)

In [14]:
pandas_on_spark =spark_df.to_pandas_on_spark()
print("pandas_on_spark : ",type(pandas_on_spark))

pandas_on_spark :  <class 'pyspark.pandas.frame.DataFrame'>




### Tracking Spark execution in detail

In [15]:
# check SQL filtered dataframe
temp_df.spark.explain()

== Physical Plan ==
*(2) Project [distributed_sequence_id#596L AS __index_level_0__#595L, year#1L, rabbit#2L, horse#3L]
+- AttachDistributedSequence[distributed_sequence_id#596L, year#1L, rabbit#2L, horse#3L] Index: distributed_sequence_id#596L
   +- *(1) Project [year#1L, rabbit#2L, horse#3L]
      +- *(1) Filter (rabbit#2L > 100)
         +- *(1) Scan ExistingRDD[__index_level_0__#0L,year#1L,rabbit#2L,horse#3L]




In [17]:
from pyspark.pandas import option_context

with option_context(
    "compute.ops_on_diff_frames",False,
    "compute.default_index_type","distributed"
):
    df = ps.range(10)
    df = df + df
    df.spark.explain()

== Physical Plan ==
*(1) Project [__index_level_0__#1261L, (id#1259L + id#1259L) AS id#1273L]
+- *(1) Project [distributed_index() AS __index_level_0__#1261L, id#1259L]
   +- *(1) Range (0, 10, step=1, splits=4)




In [19]:
df = ps.range(10)
df

Unnamed: 0,id
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [64]:
with option_context(
    "compute.ops_on_diff_frames",False,
    "compute.default_index_type","distributed"
):
    df = ps.range(10)
    df = (df + df).spark.cache()
    df.spark.explain()
    print(df)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- InMemoryTableScan [__index_level_0__#2982L, id#2994L]
      +- InMemoryRelation [__index_level_0__#2982L, id#2994L, __natural_order__#2985L], StorageLevel(disk, memory, deserialized, 1 replicas)
            +- *(1) Project [__index_level_0__#1721L, (id#1719L + id#1719L) AS id#1733L, __natural_order__#1724L]
               +- *(1) Project [__index_level_0__#1721L, id#1719L, monotonically_increasing_id() AS __natural_order__#1724L]
                  +- *(1) Project [distributed_index() AS __index_level_0__#1721L, id#1719L]
                     +- *(1) Range (0, 10, step=1, splits=4)


             id
0             0
1             2
8589934592    4
8589934593    6
8589934594    8
17179869184  10
17179869185  12
25769803776  14
25769803777  16
25769803778  18


24/10/31 14:07:38 WARN CacheManager: Asked to cache already cached data.


In [65]:
with (df + df ).spark.cache() as df : 
    new_df = df+df
    df.spark.explain()
    print(new_df)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- InMemoryTableScan [__index_level_0__#2982L, id#3119L]
      +- InMemoryRelation [__index_level_0__#2982L, id#3119L, __natural_order__#2985L], StorageLevel(disk, memory, deserialized, 1 replicas)
            +- AdaptiveSparkPlan isFinalPlan=false
               +- Project [__index_level_0__#2982L, (id#2994L + id#2994L) AS id#3119L, __natural_order__#2985L]
                  +- InMemoryTableScan [__index_level_0__#2982L, __natural_order__#2985L, id#2994L]
                        +- InMemoryRelation [__index_level_0__#2982L, id#2994L, __natural_order__#2985L], StorageLevel(disk, memory, deserialized, 1 replicas)
                              +- *(1) Project [__index_level_0__#1721L, (id#1719L + id#1719L) AS id#1733L, __natural_order__#1724L]
                                 +- *(1) Project [__index_level_0__#1721L, id#1719L, monotonically_increasing_id() AS __natural_order__#1724L]
                                    +- *(1) Proje