In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook (named "ExampleApp"); an already
# running session is reused instead of creating a second one.
spark: SparkSession = (
    SparkSession.builder
    .appName("ExampleApp")
    .getOrCreate()
)

In [3]:
# Load the 2017 St. Paul, MN real-estate dataset from its Parquet file.
df = spark.read.format("parquet").load("data/2017_StPaul_MN_Real_Estate.parquet")

In [11]:
# Narrow the frame to the five columns of interest, then preview the result.
df = df.select(
    "StreetNumberNumeric",
    "FirePlaces",
    "LotSizeDimensions",
    "ListType",
    "Acres",
)
df.show()

+-------------------+----------+--------------------+---------------+-----+
|StreetNumberNumeric|FirePlaces|   LotSizeDimensions|       ListType|Acres|
+-------------------+----------+--------------------+---------------+-----+
|              11511|         0|             279X200|Exclusive Right| 1.28|
|              11200|         0|             100x140|Exclusive Right| 0.32|
|               8583|         0|             120x296|Exclusive Right|0.822|
|               9350|         1|             208X208|Exclusive Right| 0.94|
|               2915|         1|             116x200|Exclusive Right|  0.0|
|               3604|         1|              50x150|Exclusive Right|0.172|
|               9957|         0|              common|Exclusive Right| 0.05|
|               9934|         0|              common|Exclusive Right| 0.05|
|               9926|         0|              common|Exclusive Right| 0.05|
|               9928|         0|              common|Exclusive Right| 0.05|
|           

In [12]:
# Remove the street-number and lot-dimension columns from the working frame.
# DataFrame.drop() silently ignores any name it cannot match, so the columns
# are spelled exactly as they were when selected earlier in the notebook
# rather than relying on Spark's case-insensitive name resolution to pair
# differently-cased names.
cols_to_drop = ['StreetNumberNumeric', 'LotSizeDimensions']
df = df.drop(*cols_to_drop)

In [14]:
# Preview the first five rows to confirm which columns remain.
df.show(n=5)

+----------+---------------+-----+
|FirePlaces|       ListType|Acres|
+----------+---------------+-----+
|         0|Exclusive Right| 1.28|
|         0|Exclusive Right| 0.32|
|         0|Exclusive Right|0.822|
|         1|Exclusive Right| 0.94|
|         1|Exclusive Right|  0.0|
+----------+---------------+-----+
only showing top 5 rows



In [18]:
# Reload the complete Parquet dataset (df was narrowed to a few columns
# earlier) and report how many rows it contains.
df = spark.read.format("parquet").load("data/2017_StPaul_MN_Real_Estate.parquet")
print(df.count())

5000


In [19]:
# Show every distinct value that appears in ASSUMABLEMORTGAGE.
(
    df.select('ASSUMABLEMORTGAGE')
    .distinct()
    .show()
)

+-------------------+
|  ASSUMABLEMORTGAGE|
+-------------------+
|  Yes w/ Qualifying|
| Information Coming|
|Yes w/No Qualifying|
|      Not Assumable|
|               NULL|
+-------------------+



In [20]:
# Drop listings whose mortgage is assumable (either "Yes ..." value).
# The isNull() branch keeps rows with a NULL ASSUMABLEMORTGAGE: for those rows
# the isin() comparison evaluates to NULL, which where() would otherwise treat
# as "not matched" and discard.
yes_values = ['Yes w/ Qualifying', 'Yes w/No Qualifying']
text_filter = (
    df['ASSUMABLEMORTGAGE'].isNull()
    | (~df['ASSUMABLEMORTGAGE'].isin(yes_values))
)
df = df.where(text_filter)
print(df.count())

4976


In [27]:
from pyspark.sql.functions import log

# Append a column holding the natural logarithm of the sale closing price.
log_price = log(df['SalesClosePrice'])
df = df.withColumn('log_SalesClosePrice', log_price)

In [33]:
from pyspark.sql.functions import mean, stddev


# Compute the mean and the sample standard deviation of the log price in a
# single aggregation, so Spark runs one job over the data instead of one job
# per statistic. agg() without a groupBy yields exactly one row, which is
# unpacked positionally.
mean_val, stddev_val = df.agg(
    mean('log_SalesClosePrice'),
    stddev('log_SalesClosePrice'),
).first()

# Outlier bounds: three standard deviations on either side of the mean.
low_bound = mean_val - (3 * stddev_val)
hi_bound = mean_val + (3 * stddev_val)

# Filter the data to fit between the lower and upper bounds (both exclusive)
df = df.where((df['log_SalesClosePrice'] > low_bound) & (df['log_SalesClosePrice'] < hi_bound))

In [34]:
# Report how many rows remain after the three-standard-deviation outlier filter.
print(df.count())

4946
