# Data Analysis

In [None]:
# The variables below must be set in your shell profile
# export SPARK_HOME=/Users/pmacharl/spark-2.4.4-bin-hadoop2.7
# export PATH=$PATH:$SPARK_HOME/bin
# export PYSPARK_SUBMIT_ARGS="pyspark-shell"
# export PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3
# export PYSPARK_PYTHON=/usr/local/bin/python3

# Start cluster manually

In [None]:
# https://spark.apache.org/docs/latest/spark-standalone.html
# ./sbin/start-master.sh 
# Start your Spark cluster by navigating to $SPARK_HOME/sbin and executing ./start-all.sh
# By default the Spark web UI is served on port 8080 in cluster mode. See the documentation for all options (host, IP, etc.)

In [4]:
import os

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Master URL of the Spark cluster to connect to. It can be overridden through
# the SPARK_MASTER_URL environment variable (a name chosen for this notebook)
# so the notebook is not tied to one machine's address; when the variable is
# unset, the originally hard-coded master is used.
master_url = os.environ.get("SPARK_MASTER_URL", "spark://192.168.0.6:7077")

# https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.SparkConf
config = SparkConf()
config.set("spark.driver.memory", "2g")
config.set("spark.executor.memory", "1g")

# Build the session, or reuse an already-running one (getOrCreate).
spark = (
    SparkSession.builder
    .config(conf=config)
    .master(master_url)
    .appName("Analyzing Real Estate Sales")
    .getOrCreate()
)

In [5]:
# Read the sales CSV, treating its first row as the column names. No schema
# is supplied, so every column is loaded as a string.
sales_csv_path = '../Real_Estate_Sales_2001-2017.csv'
df = spark.read.option("header", "true").csv(sales_csv_path)

In [6]:
# Print the column names and types of the loaded DataFrame as a tree.
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- SerialNumber: string (nullable = true)
 |-- ListYear: string (nullable = true)
 |-- DateRecorded: string (nullable = true)
 |-- Town: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- AssessedValue: string (nullable = true)
 |-- SaleAmount: string (nullable = true)
 |-- SalesRatio: string (nullable = true)
 |-- PropertyType: string (nullable = true)
 |-- ResidentialType: string (nullable = true)
 |-- NonUseCode: string (nullable = true)
 |-- Remarks: string (nullable = true)



# Get total counts of properties transacted by ListYear

In [7]:
# Group the sales by ListYear and count the ID values in each group.
property_by_year = (
    df
    .groupBy("ListYear")
    .agg({"ID": "count"})
)
property_by_year.show()

+--------+---------+
|ListYear|count(ID)|
+--------+---------+
|    2016|    49773|
|    2012|    35973|
|    2017|    45692|
|    2014|    49563|
|    2013|    39943|
|    2005|    61602|
|    2002|   106068|
|    2009|    42508|
|    2006|    48775|
|    2004|    84056|
|    2011|    31065|
|    2008|    32734|
|    2007|    35616|
|    2015|    46651|
|    2001|    59584|
|    2010|    27755|
|    2003|    64239|
+--------+---------+



# Total money transacted by PropertyType

In [16]:
# Sum SaleAmount within each PropertyType group and print the result.
sale_totals_by_type = df.groupBy("PropertyType").agg({"SaleAmount": "sum"})
sale_totals_by_type.show()

+--------------+------------------+
|  PropertyType|   sum(SaleAmount)|
+--------------+------------------+
|    Apartments|     6.679886858E9|
|   Vacant Land|     8.648986096E9|
|            NA|     2.237281744E9|
|   Residential|2.0461496626929E11|
|          null|   1.7968785803E10|
|    Industrial|     3.698454639E9|
|         Condo|    2.530141781E10|
|Public Utility|       2.9632347E7|
|10 Mill Forest|         1830160.0|
|    Commercial|   3.0602284248E10|
+--------------+------------------+

0:00:01.052141


# Rename column

In [18]:
# The dict-style aggregation names its result column "sum(SaleAmount)";
# rename it to something readable before printing.
(
    df.groupBy("PropertyType")
    .agg({"SaleAmount": "sum"})
    .withColumnRenamed("sum(SaleAmount)", "TotalMoneyTransacted")
    .show()
)

+--------------+--------------------+
|  PropertyType|TotalMoneyTransacted|
+--------------+--------------------+
|    Apartments|       6.679886858E9|
|   Vacant Land|       8.648986096E9|
|            NA|       2.237281744E9|
|   Residential|  2.0461496626929E11|
|          null|     1.7968785803E10|
|    Industrial|       3.698454639E9|
|         Condo|      2.530141781E10|
|Public Utility|         2.9632347E7|
|10 Mill Forest|           1830160.0|
|    Commercial|     3.0602284248E10|
+--------------+--------------------+



# Time your code to check efficiency

In [20]:
# %%timeit Use this cell magic if you want the mean execution time, but beware it runs the code multiple times
import time
from datetime import datetime, timedelta  # `datetime` stays imported in case other cells rely on it

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; unlike datetime.now() it is unaffected by system clock changes.
start = time.perf_counter()

df.groupBy("PropertyType").agg({"SaleAmount":"sum"}).show()

# Wrap the elapsed seconds in a timedelta so the printed value keeps the
# H:MM:SS.ffffff format.
elapsed = timedelta(seconds=time.perf_counter() - start)
print("{0}".format(elapsed))

+--------------+------------------+
|  PropertyType|   sum(SaleAmount)|
+--------------+------------------+
|    Apartments|     6.679886858E9|
|   Vacant Land|     8.648986096E9|
|            NA|     2.237281744E9|
|   Residential|2.0461496626929E11|
|          null|   1.7968785803E10|
|    Industrial|     3.698454639E9|
|         Condo|    2.530141781E10|
|Public Utility|       2.9632347E7|
|10 Mill Forest|         1830160.0|
|    Commercial|   3.0602284248E10|
+--------------+------------------+

+--------------+------------------+
|  PropertyType|   sum(SaleAmount)|
+--------------+------------------+
|    Apartments|     6.679886858E9|
|   Vacant Land|     8.648986096E9|
|            NA|     2.237281744E9|
|   Residential|2.0461496626929E11|
|          null|   1.7968785803E10|
|    Industrial|     3.698454639E9|
|         Condo|    2.530141781E10|
|Public Utility|       2.9632347E7|
|10 Mill Forest|         1830160.0|
|    Commercial|   3.0602284248E10|
+--------------+------------------+

# Stop the Spark application

In [21]:
# Stop the SparkSession held in `spark`.
spark.stop()