In [0]:
import pandas as pd
import pyspark.pandas as ps
from pyspark.pandas import option_context

In [0]:
# Two small frames that share the same "year" column: one pandas-on-Spark
# frame (psdf) and one plain pandas frame (pdf).
years = [1990, 1997, 2003, 2009, 2014]

psdf = ps.DataFrame(
    {
        "year": years,
        "rabbit": [20, 18, 489, 675, 1776],
        "horse": [4, 25, 281, 600, 1900],
    }
)

pdf = pd.DataFrame(
    {
        "year": years,
        "sheep": [22, 50, 121, 445, 791],
        "chicken": [250, 326, 589, 1241, 2118],
    }
)

In [0]:
# Bare expression: the notebook renders the pandas-on-Spark frame as the cell output.
psdf

In [0]:
# Run SQL against the pandas-on-Spark frame; the {psdf} placeholder is bound
# through the keyword argument of the same name.
rabbit_query = " SELECT * FROM {psdf} WHERE rabbit > 100"
ps.sql(rabbit_query, psdf=psdf)

In [0]:
# Join the pandas-on-Spark frame with the plain pandas frame on "year".
# Inside the SQL text, `ps` and `pd` are table aliases, not the Python modules.
join_query = """
SELECT ps.rabbit, pd.chicken
       FROM {psdf} ps INNER JOIN {pdf} pd
       ON ps.year = pd.year
       ORDER BY ps.rabbit, pd.chicken

"""
ps.sql(join_query, psdf=psdf, pdf=pdf)

### Pandas API on Spark

In [0]:
# Build a small two-column pandas-on-Spark frame, report its Python type,
# and preview the first rows as the cell output.
psdf = ps.DataFrame(dict(A=[1, 2, 3, 4, 5], B=[10, 20, 30, 40, 50]))
print(type(psdf))
psdf.head()

In [0]:
# Convert the pandas-on-Spark DataFrame to a Spark DataFrame, print the
# resulting type, and show its rows with Spark's own table printer.
sdf = psdf.to_spark()
print(type(sdf))
sdf.show()

In [0]:
# Convert the Spark DataFrame back to a pandas-on-Spark DataFrame.
# DataFrame.to_pandas_on_spark() is a deprecated alias; pandas_api() is the
# supported method and returns the same kind of object.
psdf_2 = sdf.pandas_api()
print(type(psdf_2))
psdf_2.head()

In [0]:
# Convert the Spark DataFrame to a pandas-on-Spark DataFrame via pandas_api(),
# print the resulting type, and display the frame as the cell output.
psdf_3 = sdf.pandas_api()
print(type(psdf_3))
psdf_3

### Checking Spark Execution Plans

In [0]:
# Add two independently created range frames and print the Spark plan.
# Within this block only, operations across different frames are enabled and
# the default index type is set to "distributed"; both options revert when the
# block exits. (`option_context` is imported in the notebook's import cell.)
with option_context(
    "compute.ops_on_diff_frames", True,
    "compute.default_index_type", "distributed",
):
    df = ps.range(10) + ps.range(10)
    df.spark.explain()

In [0]:
# Add a single range frame to itself, with operations across different frames
# disabled and a "distributed" default index, then print the Spark plan.
same_frame_options = (
    "compute.ops_on_diff_frames", False,
    "compute.default_index_type", "distributed",
)
with option_context(*same_frame_options):
    base = ps.range(10)
    df = base + base
    df.spark.explain()

### Caching DataFrames

In [0]:
# Cache the self-added frame under a "distributed" default index, print its
# Spark plan, and print the frame itself.
with option_context("compute.default_index_type", "distributed"):
    df = ps.range(10)
    new_df = (df + df).spark.cache()
    try:
        new_df.spark.explain()
        print(new_df)
    finally:
        # The name `new_df` is rebound later in the notebook, which would
        # otherwise leave this cached copy with no handle to release it.
        new_df.spark.unpersist()

In [0]:
# Cache the self-added frame using the default index settings and print its
# Spark plan. The cached frame stays alive until it is explicitly unpersisted.
df = ps.range(10)
new_df = (df + df).spark.cache()
new_df.spark.explain()
# Bare last expression so the notebook renders the frame with its rich
# display instead of a plain-text print.
new_df

In [0]:
# Release the cached data held by new_df.
new_df.spark.unpersist()

In [0]:
# Context-manager form of caching: the cached frame is released when the
# block exits. It is bound to its own name so `df` is not overwritten;
# rebinding `df` here would make the cell double the data again on every
# re-run and leave `df` pointing at an already-released cached frame.
with (df + df).spark.cache() as cached_df:
    cached_df.spark.explain()