In [None]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
# https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.SparkConf
# Memory sizing for the driver and executors, plus a reduced shuffle-partition count.
config = SparkConf().setAll(
    [
        ("spark.driver.memory", "2g"),
        ("spark.executor.memory", "1g"),
        # This session runs in local mode, so the default of 200 shuffle partitions is
        # far more than this machine can use; 5 is a better fit here.
        ("spark.sql.shuffle.partitions", "5"),
    ]
)

# Build (or reuse, if one already exists) the local-mode session used by the whole notebook.
spark = (
    SparkSession.builder
    .config(conf=config)
    .master("local")
    .appName("Analyzing Real Estate Sales")
    .getOrCreate()
)

In [None]:
# Load the monthly CSV as a Spark DataFrame, taking column names from the header row.
# No schema and no inferSchema option are supplied to the reader here.
df = (
    spark.read
    .format('csv')
    .option("header", "true")
    .load('../monthly_data.csv')
)

# Spark dataframe to pandas

In [None]:
# https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html
import numpy as np
import pandas as pd

# Enable Arrow-based columnar data transfers for the Spark -> pandas conversion.
# The older "spark.sql.execution.arrow.enabled" key has been deprecated since Spark 3.0;
# the PySpark-specific key below replaces it, and a value set explicitly on this key
# takes precedence over whatever the legacy key holds.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# toPandas() collects the entire DataFrame into driver memory as a pandas DataFrame.
pandas_df = df.toPandas()
pandas_df

# Pandas to spark df

In [None]:
# Turn Arrow off for the pandas -> Spark direction.
# The PySpark-specific key is used because "spark.sql.execution.arrow.enabled" has been
# deprecated since Spark 3.0; an explicit value on this key wins over the legacy key, so
# Arrow is disabled here regardless of how it was enabled earlier in the session.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")

# Not every conversion will go fine, especially if data types don't match up.
# See the load_various_formats file.
df = spark.createDataFrame(pandas_df)
df.show()

# Starting Spark 3.2
 - The Koalas project has been merged into PySpark as `pyspark.pandas`
 - Pandas-on-Spark is a new data structure: a distributed version of the pandas DataFrame (so pandas syntax can be used for the most part)

In [None]:
# import Pandas-on-Spark (If your Spark Context / Spark Session already exists, it will be picked up by default)
import pyspark.pandas as ps
# Re-read the monthly CSV as a plain Spark DataFrame (header row supplies the column names).
df = (
    spark.read
    .format('csv')
    .option("header", "true")
    .load('../monthly_data.csv')
)

# Wrap the Spark DataFrame in a pandas-on-Spark DataFrame.
ps_df = ps.DataFrame(data=df)

# Materialise the pandas-on-Spark DataFrame as an ordinary pandas DataFrame and display it.
pd_df = ps_df.to_pandas()
pd_df

In [None]:
# Go the other way: build a pandas-on-Spark DataFrame from the pandas DataFrame `pd_df`,
# rebinding `ps_df` to the result, then display it.
ps_df = ps.from_pandas(pd_df)
ps_df

In [None]:
# Optional cleanup, left disabled: none of the cells shown call cache()/persist().
# spark.catalog.clearCache()

# Stop the SparkSession now that the notebook is finished with it.
spark.stop()