In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
# SparkConf reference:
# https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.SparkConf
config = SparkConf().setAll(
    [
        ("spark.driver.memory", "2g"),
        ("spark.executor.memory", "1g"),
        # This notebook runs in local mode, so size the shuffle to match it.
        # The default is 200 shuffle partitions; with so few executors on a
        # single machine it's worth reducing that to 5.
        ("spark.sql.shuffle.partitions", "5"),
    ]
)

# Create the session for this notebook (or reuse one that already exists).
# The conf is applied first, then master and app name, as in the original chain.
spark = (
    SparkSession.builder
    .config(conf=config)
    .master("local")
    .appName("Analyzing Real Estate Sales")
    .getOrCreate()
)

In [2]:
# Load the monthly CSV, using its first row as the column names. No schema is
# given and schema inference is not enabled, so the CSV reader's defaults apply.
df = (
    spark.read
    .format('csv')
    .option("header", "true")
    .load('../monthly_data.csv')
)

# Spark DataFrame to pandas DataFrame

In [3]:
# Guide to Arrow-backed pandas conversion in PySpark:
# https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html
import numpy as np
import pandas as pd

# Switch on Arrow-based columnar transfers for the Spark -> pandas conversion.
# NOTE(review): this key appears to be deprecated since Spark 3.0 in favour of
# "spark.sql.execution.arrow.pyspark.enabled" -- confirm against the Spark SQL
# configuration docs. An explicitly set new key takes precedence over this
# legacy one, so every Arrow toggle in the notebook should be renamed together.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# toPandas() brings the selected rows back as a local pandas DataFrame.
pandas_df = (
    df
    .select("*")
    .toPandas()
)
pandas_df

Unnamed: 0,YYYY,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,YEAR
0,2008,10140,10239,10050,10111,10159,10159,10141,10117,10178,10148,10125,10182,10146
1,2009,10137,10140,10140,10141,10188,10168,10128,10165,10208,10166,10041,10068,10141
2,2010,10151,10034,10168,10194,10158,10166,10158,10129,10147,10135,10057,10133,10136
3,2011,10182,10161,10227,10192,10182,10154,10123,10130,10149,10182,10194,10099,10165
4,2012,10194,10286,10271,10053,10159,10127,10139,10155,10149,10109,10108,10085,10153
5,2013,10142,10169,10099,10155,10113,10180,10201,10176,10151,10129,10155,10170,10153
6,2014,10055,10031,10164,10148,10154,10184,10143,10117,10189,10142,10103,10172,10134
7,2015,10135,10164,10198,10214,10152,10195,10142,10152,10171,10186,10150,10217,10173
8,2016,10100,10099,10144,10122,10140,10137,10168,10183,10177,10214,10144,10283,10159
9,2017,10228,10151,10154,10211,10170,10134,10141,10162,10135,10176,10141,10120,10160


# pandas DataFrame to Spark DataFrame

In [4]:
# Turn Arrow off for the pandas -> Spark direction.
# The current key is used instead of the legacy
# "spark.sql.execution.arrow.enabled" alias (deprecated since Spark 3.0).
# Once set explicitly, this key takes precedence over the legacy one, so Arrow
# is reliably disabled here however the legacy key was set earlier.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")

# Not every conversion will go smoothly, especially when the pandas dtypes
# don't map cleanly onto Spark types. See the load_various_formats file.
# Note: this rebinds `df` to the DataFrame built from pandas_df.
df = spark.createDataFrame(pandas_df)
df.show()

+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|YYYY|  JAN|  FEB|  MAR|  APR|  MAY|  JUN|  JUL|  AUG|  SEP|  OCT|  NOV|  DEC| YEAR|
+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|2008|10140|10239|10050|10111|10159|10159|10141|10117|10178|10148|10125|10182|10146|
|2009|10137|10140|10140|10141|10188|10168|10128|10165|10208|10166|10041|10068|10141|
|2010|10151|10034|10168|10194|10158|10166|10158|10129|10147|10135|10057|10133|10136|
|2011|10182|10161|10227|10192|10182|10154|10123|10130|10149|10182|10194|10099|10165|
|2012|10194|10286|10271|10053|10159|10127|10139|10155|10149|10109|10108|10085|10153|
|2013|10142|10169|10099|10155|10113|10180|10201|10176|10151|10129|10155|10170|10153|
|2014|10055|10031|10164|10148|10154|10184|10143|10117|10189|10142|10103|10172|10134|
|2015|10135|10164|10198|10214|10152|10195|10142|10152|10171|10186|10150|10217|10173|
|2016|10100|10099|10144|10122|10140|10137|10168|10183|10177|10214

# Starting with Spark 3.2
 - The Koalas project has been merged into PySpark as the pandas API on Spark (`pyspark.pandas`).
 - A pandas-on-Spark DataFrame is a distributed counterpart of the pandas DataFrame, so most pandas syntax can be used on it.

In [6]:
# pandas API on Spark. If a SparkContext / SparkSession already exists, it is
# picked up by default.
import pyspark.pandas as ps

# Re-read the CSV into a Spark DataFrame, with the first row as the header.
df = (
    spark.read
    .format('csv')
    .option("header", "true")
    .load('../monthly_data.csv')
)

# Wrap the Spark DataFrame as a pandas-on-Spark DataFrame.
ps_df = ps.DataFrame(df)

# Convert the pandas-on-Spark DataFrame into a plain pandas DataFrame.
pd_df = ps_df.to_pandas()
pd_df



Unnamed: 0,YYYY,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,YEAR
0,2008,10140,10239,10050,10111,10159,10159,10141,10117,10178,10148,10125,10182,10146
1,2009,10137,10140,10140,10141,10188,10168,10128,10165,10208,10166,10041,10068,10141
2,2010,10151,10034,10168,10194,10158,10166,10158,10129,10147,10135,10057,10133,10136
3,2011,10182,10161,10227,10192,10182,10154,10123,10130,10149,10182,10194,10099,10165
4,2012,10194,10286,10271,10053,10159,10127,10139,10155,10149,10109,10108,10085,10153
5,2013,10142,10169,10099,10155,10113,10180,10201,10176,10151,10129,10155,10170,10153
6,2014,10055,10031,10164,10148,10154,10184,10143,10117,10189,10142,10103,10172,10134
7,2015,10135,10164,10198,10214,10152,10195,10142,10152,10171,10186,10150,10217,10173
8,2016,10100,10099,10144,10122,10140,10137,10168,10183,10177,10214,10144,10283,10159
9,2017,10228,10151,10154,10211,10170,10134,10141,10162,10135,10176,10141,10120,10160


In [7]:
# Convert a pandas DataFrame back into a pandas-on-Spark DataFrame.
# This rebinds ps_df, which the previous cell created from the Spark DataFrame.
ps_df = ps.from_pandas(pd_df)
# Bare expression so the notebook renders the resulting frame.
ps_df

Unnamed: 0,YYYY,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,YEAR
0,2008,10140,10239,10050,10111,10159,10159,10141,10117,10178,10148,10125,10182,10146
1,2009,10137,10140,10140,10141,10188,10168,10128,10165,10208,10166,10041,10068,10141
2,2010,10151,10034,10168,10194,10158,10166,10158,10129,10147,10135,10057,10133,10136
3,2011,10182,10161,10227,10192,10182,10154,10123,10130,10149,10182,10194,10099,10165
4,2012,10194,10286,10271,10053,10159,10127,10139,10155,10149,10109,10108,10085,10153
5,2013,10142,10169,10099,10155,10113,10180,10201,10176,10151,10129,10155,10170,10153
6,2014,10055,10031,10164,10148,10154,10184,10143,10117,10189,10142,10103,10172,10134
7,2015,10135,10164,10198,10214,10152,10195,10142,10152,10171,10186,10150,10217,10173
8,2016,10100,10099,10144,10122,10140,10137,10168,10183,10177,10214,10144,10283,10159
9,2017,10228,10151,10154,10211,10170,10134,10141,10162,10135,10176,10141,10120,10160


In [5]:
# Optional cleanup before shutdown, left disabled:
# spark.catalog.clearCache()
# Stop the Spark session. Every cell that uses `spark` must run before this one.
# NOTE(review): this cell's recorded execution count (5) is lower than the
# cells above it (6, 7), so it was run out of order. It must stay last in file
# order for a Restart & Run All to succeed.
spark.stop()