# Start Spark Cluster
- From `$SPARK_HOME`, run `./sbin/start-all.sh`. See the [standalone-mode documentation](https://spark.apache.org/docs/latest/spark-standalone.html) for more options for passing parameters.

In [1]:
# The whole folder for apache-spark is downloaded to site-packages folder , if you are interested to know 
! pip install pyspark==3.0.0

You should consider upgrading via the '/Users/pmacharl/git-projects/personal/github.com/data_analysis_pandas_spark_koalas/venv/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
# The variables below must be set in the shell profile
# export SPARK_HOME=/Users/pmacharl/spark-3.0.0-bin-hadoop2.7
# export PATH=$PATH:$SPARK_HOME/bin
# export PYSPARK_SUBMIT_ARGS="pyspark-shell"
# export PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3
# export PYSPARK_PYTHON=/usr/local/bin/python3

In [3]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

In [4]:
# SparkConf API: https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.SparkConf
config = SparkConf()

# set() and setMaster() each return the SparkConf, so the settings are applied
# as one chain. The chain is the cell's last expression, so the notebook
# displays the resulting SparkConf.
(
    config
    # Memory for the driver process and for each executor.
    .set("spark.driver.memory", "2g")
    .set("spark.executor.memory", "1g")
    # Shuffle partitions default to 200. A single machine has only a few
    # executors, so a small number such as 5 is a better fit.
    .set("spark.sql.shuffle.partitions", "5")
    # Master of the standalone cluster started on this machine; see
    # https://spark.apache.org/docs/latest/submitting-applications.html
    .setMaster("spark://192.168.0.4:7077")
)

<pyspark.conf.SparkConf at 0x107174910>

In [5]:
# SparkSession API: https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.SparkSession
#
# The master URL is deliberately not repeated here: `config` already carries it
# (config.setMaster(...) in the configuration cell), and
# builder.config(conf=config) copies every entry of the SparkConf, spark.master
# included, into the session options. Keeping the URL in one place prevents the
# two copies from drifting apart.
#
# To run without a cluster, append .master("local") after .config(conf=config);
# the later setting overrides the URL taken from `config`.
spark = (
    SparkSession.builder
    .config(conf=config)
    .appName("Analyzing Real Estate Sales")
    .getOrCreate()
)

In [6]:
# Evaluate the session as the cell's last expression so the notebook renders it.
spark

In [7]:
# Read the CSV through the generic load() entry point. The "header" option
# makes Spark take the column names from the first line of the file.
df = spark.read.load(
    '../Real_Estate_Sales_2001-2017.csv',
    format='csv',
    header='true',
)

In [8]:
# Print the column names and types. No schema was supplied or inferred when the
# CSV was loaded, so every column comes back as a nullable string.
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- SerialNumber: string (nullable = true)
 |-- ListYear: string (nullable = true)
 |-- DateRecorded: string (nullable = true)
 |-- Town: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- AssessedValue: string (nullable = true)
 |-- SaleAmount: string (nullable = true)
 |-- SalesRatio: string (nullable = true)
 |-- PropertyType: string (nullable = true)
 |-- ResidentialType: string (nullable = true)
 |-- NonUseCode: string (nullable = true)
 |-- Remarks: string (nullable = true)



In [None]:
# Summary statistics (count, mean, stddev, min, max) for every column, printed
# as a table.
df.describe().show()

In [None]:
# The same summary statistics, restricted to the Town column.
df.select("Town").describe().show()

In [None]:
# Column names of the DataFrame, as a Python list.
df.columns

In [None]:
# Print the physical plan Spark would execute to produce df.
df.explain()

In [None]:
# Number of rows. count() is an action, so Spark runs a job to compute it.
df.count()

In [None]:
# Print the first rows as a text table (show() defaults to 20 rows).
df.show()

In [None]:
# Bring the first 5 rows back to the driver as a list of Row objects.
df.head(5) # OR df.take(5)

In [None]:
# Keep only the Town column and print its first rows.
df.select("Town").show()

In [None]:
# limit(10) returns a new DataFrame capped at 10 rows; show() prints it.
df.limit(10).show()

In [None]:
# dropna() returns a new DataFrame and leaves df untouched, so the result is
# kept under its own name and a sample of it is printed. With the default
# how='any', a row is dropped as soon as one of its columns is null.
df_without_nulls = df.dropna()
df_without_nulls.show()

In [None]:
# drop() returns a new DataFrame without the ID column; df itself is unchanged.
df1 = df.drop("ID")
# Print the first rows of the reduced DataFrame.
df1.show()

In [None]:
# One row per distinct value of the PropertyType column.
distinct_property_type = df1.select("PropertyType").distinct()
distinct_property_type.show()

In [None]:
# How many distinct PropertyType values there are.
distinct_property_type.count()

In [None]:
# Keep only the rows whose Town equals 'Andover'. where() is an alias of
# filter(), and df.Town refers to the same column as df['Town'].
df_andover_town = df.where(df.Town == 'Andover')

In [None]:
# Print up to 30 of the Andover rows.
df_andover_town.show(30)

In [None]:
# Write df as Parquet. Spark creates the folder "blahfolder" and saves the data
# inside it as files named like part-00xx-....snappy.parquet.
# https://spark.apache.org/docs/latest/sql-data-sources-parquet.html
#
# mode="overwrite" replaces the folder left behind by a previous run. The
# default mode raises an error when the path already exists, which made this
# cell fail every time the notebook was re-run.
# Equivalent shorthand: df.write.parquet("blahfolder", mode="overwrite")
df.write.save("blahfolder", format="parquet", mode="overwrite")

In [None]:
# Optional: uncomment to remove all cached tables from the in-memory cache first.
# spark.catalog.clearCache()
# Stop the SparkContext behind this session. Cells that use `spark` will fail
# after this until a new session is created.
spark.stop()