# The full example is in Spark

# Get the data

In [60]:
from pyspark.sql import SparkSession
import os
from pyspark.sql.types import StructType, StructField, FloatType, StringType

# Obtain (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Housing-PySpark")
    .getOrCreate()
)

# Explicit schema: nine nullable float columns followed by one nullable
# string column, in the order they are listed here.
customSchema = StructType(
    [
        StructField(column_name, FloatType(), True)
        for column_name in (
            "longitude",
            "latitude",
            "housing_median_age",
            "total_rooms",
            "total_bedrooms",
            "population",
            "households",
            "median_income",
            "median_house_value",
        )
    ]
    + [StructField("ocean_proximity", StringType(), True)]
)

# Read the comma-separated housing data, treating the first line as a header
# and applying the schema above.
df = spark.read.csv(
    "../datasets/housing",
    schema=customSchema,
    header=True,
    sep=",",
)

# Keep only the rows whose ocean_proximity value is present.
housing = df.where(df["ocean_proximity"].isNotNull())

# Last expression of the cell: the number of remaining rows.
housing.count()

20640

In [67]:
# Print per-column summary statistics (the recorded output shows count, mean
# and stddev rows). In the recorded run total_bedrooms has a count of 20433
# against 20640 for the other columns, i.e. it contains missing values.
housing.describe().show()

+-------+-------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+---------------+
|summary|          longitude|         latitude|housing_median_age|       total_rooms|    total_bedrooms|        population|       households|     median_income|median_house_value|ocean_proximity|
+-------+-------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+---------------+
|  count|              20640|            20640|             20640|             20640|             20433|             20640|            20640|             20640|             20640|          20640|
|   mean|-119.56970444871473|35.63186143109965|28.639486434108527|2635.7630813953488| 537.8705525375618|1425.4767441860465|499.5396802325581|3.8706710030346416|206855.81690891474|           null|
| stddev|  2.0035317

In [62]:
# Print column names, types and nullability to confirm the schema applied to
# the loaded DataFrame.
housing.printSchema()

root
 |-- longitude: float (nullable = true)
 |-- latitude: float (nullable = true)
 |-- housing_median_age: float (nullable = true)
 |-- total_rooms: float (nullable = true)
 |-- total_bedrooms: float (nullable = true)
 |-- population: float (nullable = true)
 |-- households: float (nullable = true)
 |-- median_income: float (nullable = true)
 |-- median_house_value: float (nullable = true)
 |-- ocean_proximity: string (nullable = true)



In [63]:
from pyspark.sql.functions import *

# Row count per ocean_proximity category, largest group first.
(
    housing
    .groupBy("ocean_proximity")
    .count()
    .orderBy(desc("count"))
    .show()
)

+---------------+-----+
|ocean_proximity|count|
+---------------+-----+
|      <1H OCEAN| 9136|
|         INLAND| 6551|
|     NEAR OCEAN| 2658|
|       NEAR BAY| 2290|
|         ISLAND|    5|
+---------------+-----+



In [71]:
# Randomly split the rows into train/test sets with weights 0.8/0.2; the
# fixed seed is passed so the split can be repeated.
random_train_df, random_test_df = housing.randomSplit(weights=[0.8, 0.2], seed=42)

# Report the size of each split.
print(
    "Random Selection: %d, %d"
    % (random_train_df.count(), random_test_df.count())
)

Random Selection: 16549, 4091
