In [1]:
import pyspark
import sys

import pyspark.sql.functions as fn

from pyspark.sql import SparkSession

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [7]:
# Lightweight PySpark local mode
spark = SparkSession \
        .builder \
        .appName("LocalMode") \
        .getOrCreate()

In [8]:
spark.stop()

In [2]:
spark.sparkContext.appName

'PySparkShell'

In [3]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [4]:
sys.version

'3.8.5 (default, Jul 28 2020, 12:59:40) \n[GCC 9.3.0]'

In [5]:
spark.version

'3.0.1'

# Load Data

In [6]:
%%time
# Local Mode讀取本機檔案
df = spark.read.csv('HouseVarCoFinal.csv', header=True, inferSchema=True)
# df = spark.read.csv('HouseVarCoFinal.csv', inferSchema=True)
df

CPU times: user 572 µs, sys: 2.24 ms, total: 2.81 ms
Wall time: 6.46 s


DataFrame[Address: string, Area: string, St: string, 交易年月日: int, year: int, 交易標的: string, 交易筆棟數: string, 建物型態: string, 建物現況格局.廳: int, 建物現況格局.房: int, 建物現況格局.衛: int, 建物現況格局.隔間: string, 有無管理組織: string, 總價元: double, 總坪數: double, 單價元坪: double, 車位數: int, floor: int, EightCount: int, ParkCount: int, FuneralCount: int, GasCount: int, CrimeCount: int, PoliceCount: int, busCount: int, subwayCount: int, govCount: int, clinicCount: int, hospitalCount: int, pharmacyCount: int, fireareaCount: int, firewayCount: int, martCount: int, mallCount: int, cinemaCount: int, 土地面積: double, 總人口數: int, 男性人數: int, 女性人數: int, 人口密度: int, 每戶人數: double, 每戶成年人數: double, 所得收入總計: int, 可支配所得: int, 消費支出: int, 儲蓄: int, 所得總額: int, Lontitude: double, Latitude: double]

In [7]:
%%time
df.describe().show()

+-------+-----------------------------+------+------+------------------+------------------+--------------------+---------------+--------+------------------+------------------+------------------+-----------------+------------+--------------------+-----------------+------------------+------------------+----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+-------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------------+
|summary|                      Address|  Area|    St|        交易年月日|              year|       

In [8]:
%%time
df.printSchema()

root
 |-- Address: string (nullable = true)
 |-- Area: string (nullable = true)
 |-- St: string (nullable = true)
 |-- 交易年月日: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- 交易標的: string (nullable = true)
 |-- 交易筆棟數: string (nullable = true)
 |-- 建物型態: string (nullable = true)
 |-- 建物現況格局.廳: integer (nullable = true)
 |-- 建物現況格局.房: integer (nullable = true)
 |-- 建物現況格局.衛: integer (nullable = true)
 |-- 建物現況格局.隔間: string (nullable = true)
 |-- 有無管理組織: string (nullable = true)
 |-- 總價元: double (nullable = true)
 |-- 總坪數: double (nullable = true)
 |-- 單價元坪: double (nullable = true)
 |-- 車位數: integer (nullable = true)
 |-- floor: integer (nullable = true)
 |-- EightCount: integer (nullable = true)
 |-- ParkCount: integer (nullable = true)
 |-- FuneralCount: integer (nullable = true)
 |-- GasCount: integer (nullable = true)
 |-- CrimeCount: integer (nullable = true)
 |-- PoliceCount: integer (nullable = true)
 |-- busCount: integer (nullable = true)
 |-- subwayCount: int

In [9]:
# 句點會造成AnalysisException，所以要修改欄位名稱
# table.columns，如果欄位名稱是中文也會導致AnalysisException
dfDrop = df
dfDrop = dfDrop.withColumnRenamed("建物現況格局.廳","廳數")
dfDrop = dfDrop.withColumnRenamed("建物現況格局.房","房數")
dfDrop = dfDrop.withColumnRenamed("建物現況格局.衛","衛數")
dfDrop = dfDrop.withColumnRenamed("建物現況格局.隔間","隔間數")
dfDrop = dfDrop.withColumnRenamed("單價元坪","unitPrice")

dfDrop

DataFrame[Address: string, Area: string, St: string, 交易年月日: int, year: int, 交易標的: string, 交易筆棟數: string, 建物型態: string, 廳數: int, 房數: int, 衛數: int, 隔間數: string, 有無管理組織: string, 總價元: double, 總坪數: double, unitPrice: double, 車位數: int, floor: int, EightCount: int, ParkCount: int, FuneralCount: int, GasCount: int, CrimeCount: int, PoliceCount: int, busCount: int, subwayCount: int, govCount: int, clinicCount: int, hospitalCount: int, pharmacyCount: int, fireareaCount: int, firewayCount: int, martCount: int, mallCount: int, cinemaCount: int, 土地面積: double, 總人口數: int, 男性人數: int, 女性人數: int, 人口密度: int, 每戶人數: double, 每戶成年人數: double, 所得收入總計: int, 可支配所得: int, 消費支出: int, 儲蓄: int, 所得總額: int, Lontitude: double, Latitude: double]

In [10]:
dfDrop2 = dfDrop.select("廳數","房數","衛數","unitPrice","floor","ParkCount","GasCount","govCount","hospitalCount","firewayCount","martCount","每戶人數","所得總額")
dfDrop2.show()

+----+----+----+----------------+-----+---------+--------+--------+-------------+------------+---------+--------+--------+
|廳數|房數|衛數|       unitPrice|floor|ParkCount|GasCount|govCount|hospitalCount|firewayCount|martCount|每戶人數|所得總額|
+----+----+----+----------------+-----+---------+--------+--------+-------------+------------+---------+--------+--------+
|   2|   5|   3|246580.260178484|    7|        8|       7|      33|            4|          18|       11|    2.89| 1720988|
|   2|   2|   2|713658.438145298|    7|        8|       7|      33|            4|          18|       11|    2.89| 1720988|
|   0|   0|   0|262043.416225031|    7|        8|       7|      33|            4|          18|       11|    2.89| 1720988|
|   1|   1|   1|743902.890773758|    7|        8|       7|      33|            4|          18|       11|    2.89| 1720988|
|   0|   0|   0| 1343112.5819135|    5|        8|       7|      33|            4|          18|       11|    2.89| 1720988|
|   1|   1|   1|709845.7857740

In [11]:
yDf = dfDrop2.select("unitPrice")
xDf = dfDrop2.select("廳數","房數","衛數","floor","ParkCount","GasCount","govCount","hospitalCount","firewayCount","martCount","每戶人數","所得總額")

# Train and Test Data

In [12]:
dfDrop2.columns
xDf.columns

['廳數',
 '房數',
 '衛數',
 'floor',
 'ParkCount',
 'GasCount',
 'govCount',
 'hospitalCount',
 'firewayCount',
 'martCount',
 '每戶人數',
 '所得總額']

In [13]:
# vectorize all numerical columns into a single feature column
feature_cols = xDf.columns
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
dfDrop2 = assembler.transform(dfDrop2)

# convert text labels into indices
data = dfDrop2.select(['features', '單價元坪'])
label_indexer = StringIndexer(inputCol='單價元坪', outputCol='label').fit(data)
data = label_indexer.transform(data)

In [14]:
# validate the presence of dense vectors 
dfDrop2.printSchema()

root
 |-- 廳數: integer (nullable = true)
 |-- 房數: integer (nullable = true)
 |-- 衛數: integer (nullable = true)
 |-- unitPrice: double (nullable = true)
 |-- floor: integer (nullable = true)
 |-- ParkCount: integer (nullable = true)
 |-- GasCount: integer (nullable = true)
 |-- govCount: integer (nullable = true)
 |-- hospitalCount: integer (nullable = true)
 |-- firewayCount: integer (nullable = true)
 |-- martCount: integer (nullable = true)
 |-- 每戶人數: double (nullable = true)
 |-- 所得總額: integer (nullable = true)
 |-- features: vector (nullable = true)



In [15]:
# view the details of dense vector
dfDrop2.select('features').show(5,False)

+-----------------------------------------------------------+
|features                                                   |
+-----------------------------------------------------------+
|[2.0,5.0,3.0,7.0,8.0,7.0,33.0,4.0,18.0,11.0,2.89,1720988.0]|
|[2.0,2.0,2.0,7.0,8.0,7.0,33.0,4.0,18.0,11.0,2.89,1720988.0]|
|[0.0,0.0,0.0,7.0,8.0,7.0,33.0,4.0,18.0,11.0,2.89,1720988.0]|
|[1.0,1.0,1.0,7.0,8.0,7.0,33.0,4.0,18.0,11.0,2.89,1720988.0]|
|[0.0,0.0,0.0,5.0,8.0,7.0,33.0,4.0,18.0,11.0,2.89,1720988.0]|
+-----------------------------------------------------------+
only showing top 5 rows



In [16]:
# only select the features and label column
data = dfDrop2.select(['features', 'unitPrice'])
data.show(10)

+--------------------+----------------+
|            features|       unitPrice|
+--------------------+----------------+
|[2.0,5.0,3.0,7.0,...|246580.260178484|
|[2.0,2.0,2.0,7.0,...|713658.438145298|
|[0.0,0.0,0.0,7.0,...|262043.416225031|
|[1.0,1.0,1.0,7.0,...|743902.890773758|
|[0.0,0.0,0.0,5.0,...| 1343112.5819135|
|[1.0,1.0,1.0,7.0,...|709845.785774059|
|[2.0,4.0,2.0,11.0...|849122.942206655|
|[2.0,4.0,2.0,11.0...|190912.323582579|
|[2.0,3.0,2.0,7.0,...|636806.518723994|
|[0.0,0.0,0.0,7.0,...|1843904.60526316|
+--------------------+----------------+
only showing top 10 rows



In [17]:
# size of model df
data.count(), len(data.columns)

(121820, 2)

In [18]:
# use Logistic Regression to train on the training set
train, test = data.randomSplit([0.80, 0.20], seed=40)

In [19]:
train.count(), len(train.columns)

(97325, 2)

In [20]:
test.count(), len(test.columns)

(24495, 2)

# Regression Model

## Linear Regression

In [21]:
# Build Linear Regression model 
from pyspark.ml.regression import LinearRegression
lin_Reg=LinearRegression(labelCol='unitPrice')

In [22]:
%%time
# fit the linear regression model on training data set 
lr_model=lin_Reg.fit(train)
lr_model

CPU times: user 0 ns, sys: 9.13 ms, total: 9.13 ms
Wall time: 2.95 s


LinearRegressionModel: uid=LinearRegression_124adc01ff7a, numFeatures=12

In [23]:
lr_model.intercept

624414.5378985564

In [24]:
lr_model.coefficients

DenseVector([-17688.0852, -28887.8065, 27288.1996, 5439.9844, 5116.7124, -4057.657, -1530.4889, 2652.8833, 28.4652, 579.8327, -400740.8756, 0.6911])

In [25]:
training_predictions=lr_model.evaluate(train)
training_predictions

<pyspark.ml.regression.LinearRegressionSummary at 0x7f54c80f60d0>

In [26]:
print('MSE:\t',training_predictions.meanSquaredError)
print('RMSE:\t',training_predictions.meanSquaredError ** 0.5)
print('R2:\t',training_predictions.r2)

MSE:	 513629190518.81384
RMSE:	 716679.2800959253
R2:	 0.03170599571459565


In [27]:
data

DataFrame[features: vector, unitPrice: double]

In [28]:
data.select('features')

DataFrame[features: vector]

In [29]:
lr_predictions = lr_model.transform(test)

In [30]:
test.show()
test.count()

+--------------------+------------------+
|            features|         unitPrice|
+--------------------+------------------+
|[0.0,0.0,0.0,0.0,...|  621318.992307692|
|[0.0,0.0,0.0,0.0,...|          826447.5|
|[0.0,0.0,0.0,0.0,...|  1845231.87272727|
|[0.0,0.0,0.0,0.0,...|  1890255.12763354|
|[0.0,0.0,0.0,0.0,...|  2003509.09090909|
|[0.0,0.0,0.0,0.0,...|  3138093.67718639|
|[0.0,0.0,0.0,0.0,...|  1233039.17762504|
|[0.0,0.0,0.0,0.0,...|         4671225.0|
|[0.0,0.0,0.0,1.0,...|  1289.66622222222|
|[0.0,0.0,0.0,1.0,...|  4048094.80770616|
|[0.0,0.0,0.0,1.0,...|  4247479.18326693|
|[0.0,0.0,0.0,1.0,...|  2310873.73189401|
|[0.0,0.0,0.0,1.0,...|  3392120.06651885|
|[0.0,0.0,0.0,1.0,...|1.61050667481989E7|
|[0.0,0.0,0.0,2.0,...|  1800113.59223301|
|[0.0,0.0,0.0,2.0,...|  263147.462686567|
|[0.0,0.0,0.0,2.0,...|  633293.103448276|
|[0.0,0.0,0.0,2.0,...|  748763.306908267|
|[0.0,0.0,0.0,2.0,...|  948738.438072471|
|[0.0,0.0,0.0,2.0,...|  282235.164802475|
+--------------------+------------

24495

In [31]:
lr_predictions.select("prediction", "unitPrice", "features").show()
lr_predictions.select("prediction", "unitPrice", "features").count()

+------------------+------------------+--------------------+
|        prediction|         unitPrice|            features|
+------------------+------------------+--------------------+
|478902.89242850046|  621318.992307692|[0.0,0.0,0.0,0.0,...|
|478902.89242850046|          826447.5|[0.0,0.0,0.0,0.0,...|
|478902.89242850046|  1845231.87272727|[0.0,0.0,0.0,0.0,...|
|478902.89242850046|  1890255.12763354|[0.0,0.0,0.0,0.0,...|
|478902.89242850046|  2003509.09090909|[0.0,0.0,0.0,0.0,...|
|478902.89242850046|  3138093.67718639|[0.0,0.0,0.0,0.0,...|
| 580648.2882496977|  1233039.17762504|[0.0,0.0,0.0,0.0,...|
| 580648.2882496977|         4671225.0|[0.0,0.0,0.0,0.0,...|
|  752175.547722879|  1289.66622222222|[0.0,0.0,0.0,1.0,...|
|  484342.876838517|  4048094.80770616|[0.0,0.0,0.0,1.0,...|
|  484342.876838517|  4247479.18326693|[0.0,0.0,0.0,1.0,...|
| 640619.6161194312|  2310873.73189401|[0.0,0.0,0.0,1.0,...|
| 640619.6161194312|  3392120.06651885|[0.0,0.0,0.0,1.0,...|
| 640619.6161194312|1.61

24495

# Predict

In [32]:
data.show()

+--------------------+----------------+
|            features|       unitPrice|
+--------------------+----------------+
|[2.0,5.0,3.0,7.0,...|246580.260178484|
|[2.0,2.0,2.0,7.0,...|713658.438145298|
|[0.0,0.0,0.0,7.0,...|262043.416225031|
|[1.0,1.0,1.0,7.0,...|743902.890773758|
|[0.0,0.0,0.0,5.0,...| 1343112.5819135|
|[1.0,1.0,1.0,7.0,...|709845.785774059|
|[2.0,4.0,2.0,11.0...|849122.942206655|
|[2.0,4.0,2.0,11.0...|190912.323582579|
|[2.0,3.0,2.0,7.0,...|636806.518723994|
|[0.0,0.0,0.0,7.0,...|1843904.60526316|
|[2.0,4.0,2.0,11.0...|782834.297812279|
|[2.0,3.0,1.0,7.0,...|700241.231527094|
|[0.0,0.0,0.0,7.0,...|490958.910891089|
|[1.0,1.0,1.0,7.0,...|437336.138084633|
|[2.0,2.0,2.0,5.0,...|1599057.38940204|
|[1.0,1.0,1.0,7.0,...|483610.646002317|
|[1.0,1.0,1.0,7.0,...|479778.371681416|
|[2.0,4.0,3.0,7.0,...| 573314.55026455|
|[1.0,1.0,1.0,7.0,...|557896.440625959|
|[1.0,1.0,1.0,7.0,...|407746.557437408|
+--------------------+----------------+
only showing top 20 rows



In [33]:
# yPred = stack.predict(xDf)
yPred = lr_model.transform(data)
yPred.show()
type(yPred)

+--------------------+----------------+-----------------+
|            features|       unitPrice|       prediction|
+--------------------+----------------+-----------------+
|[2.0,5.0,3.0,7.0,...|246580.260178484|575308.9187140829|
|[2.0,2.0,2.0,7.0,...|713658.438145298|634684.1384578639|
|[0.0,0.0,0.0,7.0,...|262043.416225031|673259.5225795308|
|[1.0,1.0,1.0,7.0,...|743902.890773758|653971.8305186973|
|[0.0,0.0,0.0,5.0,...| 1343112.5819135|662379.5537594975|
|[1.0,1.0,1.0,7.0,...|709845.785774059|653971.8305186973|
|[2.0,4.0,2.0,11.0...|849122.942206655| 598668.463186538|
|[2.0,4.0,2.0,11.0...|190912.323582579| 598668.463186538|
|[2.0,3.0,2.0,7.0,...|636806.518723994|605796.3320021677|
|[0.0,0.0,0.0,7.0,...|1843904.60526316|673259.5225795308|
|[2.0,4.0,2.0,11.0...|782834.297812279| 598668.463186538|
|[2.0,3.0,1.0,7.0,...|700241.231527094|578508.1323788603|
|[0.0,0.0,0.0,7.0,...|490958.910891089|673259.5225795308|
|[1.0,1.0,1.0,7.0,...|437336.138084633|653971.8305186973|
|[2.0,2.0,2.0,

pyspark.sql.dataframe.DataFrame

In [34]:
dfNew = dfDrop.withColumnRenamed('unitPrice','unitPriceOrigin')

In [35]:
from pyspark.sql import SQLContext

In [36]:
result = df

SQLContext.registerDataFrameAsTable(dfNew,dfNew,"X")
SQLContext.registerDataFrameAsTable(yPred, yPred, "y")

result = spark.sql("select * from X join y on X.unitPriceOrigin=y.unitPrice")
result.show()

+-------------------------+------+------+----------+----+--------------------+---------------+--------+----+----+----+------+------------+-----------+---------+----------------+------+-----+----------+---------+------------+--------+----------+-----------+--------+-----------+--------+-----------+-------------+-------------+-------------+------------+---------+---------+-----------+--------+--------+--------+--------+--------+--------+------------+------------+----------+--------+------+--------+-----------+----------+--------------------+----------------+------------------+
|                  Address|  Area|    St|交易年月日|year|            交易標的|     交易筆棟數|建物型態|廳數|房數|衛數|隔間數|有無管理組織|     總價元|   總坪數| unitPriceOrigin|車位數|floor|EightCount|ParkCount|FuneralCount|GasCount|CrimeCount|PoliceCount|busCount|subwayCount|govCount|clinicCount|hospitalCount|pharmacyCount|fireareaCount|firewayCount|martCount|mallCount|cinemaCount|土地面積|總人口數|男性人數|女性人數|人口密度|每戶人數|每戶成年人數|所得收入總計|可支配所得|消費支出|  儲蓄|所得總額|  Lontitud

In [37]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

def updown(unit,pred):
    return round(unit-pred, 2)
    
#convert to a UDF Function by passing in the function and return type of function
udfud = F.udf(updown, DoubleType())

result = result.withColumn("漲跌", udfud("unitPrice","prediction"))
result.show()

+-------------------------+------+------+----------+----+--------------------+---------------+--------+----+----+----+------+------------+-----------+---------+----------------+------+-----+----------+---------+------------+--------+----------+-----------+--------+-----------+--------+-----------+-------------+-------------+-------------+------------+---------+---------+-----------+--------+--------+--------+--------+--------+--------+------------+------------+----------+--------+------+--------+-----------+----------+--------------------+----------------+------------------+----------+
|                  Address|  Area|    St|交易年月日|year|            交易標的|     交易筆棟數|建物型態|廳數|房數|衛數|隔間數|有無管理組織|     總價元|   總坪數| unitPriceOrigin|車位數|floor|EightCount|ParkCount|FuneralCount|GasCount|CrimeCount|PoliceCount|busCount|subwayCount|govCount|clinicCount|hospitalCount|pharmacyCount|fireareaCount|firewayCount|martCount|mallCount|cinemaCount|土地面積|總人口數|男性人數|女性人數|人口密度|每戶人數|每戶成年人數|所得收入總計|可支配所得|消費支出|  儲蓄|所得總額

In [38]:
result.printSchema()

root
 |-- Address: string (nullable = true)
 |-- Area: string (nullable = true)
 |-- St: string (nullable = true)
 |-- 交易年月日: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- 交易標的: string (nullable = true)
 |-- 交易筆棟數: string (nullable = true)
 |-- 建物型態: string (nullable = true)
 |-- 廳數: integer (nullable = true)
 |-- 房數: integer (nullable = true)
 |-- 衛數: integer (nullable = true)
 |-- 隔間數: string (nullable = true)
 |-- 有無管理組織: string (nullable = true)
 |-- 總價元: double (nullable = true)
 |-- 總坪數: double (nullable = true)
 |-- unitPriceOrigin: double (nullable = true)
 |-- 車位數: integer (nullable = true)
 |-- floor: integer (nullable = true)
 |-- EightCount: integer (nullable = true)
 |-- ParkCount: integer (nullable = true)
 |-- FuneralCount: integer (nullable = true)
 |-- GasCount: integer (nullable = true)
 |-- CrimeCount: integer (nullable = true)
 |-- PoliceCount: integer (nullable = true)
 |-- busCount: integer (nullable = true)
 |-- subwayCount: integer (nullabl

In [39]:
result.head(2)

[Row(Address='臺北市中山區一江街1~30號', Area='中山區', St='一江街', 交易年月日=1041208, year=2015, 交易標的='房地(土地+建物)', 交易筆棟數='土地1建物1車位0', 建物型態='華廈', 廳數=2, 房數=5, 衛數=3, 隔間數='有', 有無管理組織='無', 總價元=16800000.0, 總坪數=68.132075, unitPriceOrigin=246580.260178484, 車位數=0, floor=7, EightCount=77, ParkCount=8, FuneralCount=132, GasCount=7, CrimeCount=904, PoliceCount=13, busCount=280, subwayCount=8, govCount=33, clinicCount=486, hospitalCount=4, pharmacyCount=98, fireareaCount=80, firewayCount=18, martCount=11, mallCount=6, cinemaCount=5, 土地面積=13.6821, 總人口數=224707, 男性人數=104138, 女性人數=120569, 人口密度=16423, 每戶人數=2.89, 每戶成年人數=2.51, 所得收入總計=1614178, 可支配所得=1339702, 消費支出=1071429, 儲蓄=268273, 所得總額=1720988, Lontitude=121.5315679, Latitude=25.0507038, features=DenseVector([2.0, 5.0, 3.0, 7.0, 8.0, 7.0, 33.0, 4.0, 18.0, 11.0, 2.89, 1720988.0]), unitPrice=246580.260178484, prediction=575308.9187140829, 漲跌=-328728.66),
 Row(Address='臺北市中山區一江街1~30號', Area='中山區', St='一江街', 交易年月日=1080827, year=2019, 交易標的='房地(土地+建物)', 交易筆棟數='土地2建物1車位0', 建物型態=

In [40]:
updown = result.groupby(['Area']).mean()
updown.printSchema()

root
 |-- Area: string (nullable = true)
 |-- avg(交易年月日): double (nullable = true)
 |-- avg(year): double (nullable = true)
 |-- avg(廳數): double (nullable = true)
 |-- avg(房數): double (nullable = true)
 |-- avg(衛數): double (nullable = true)
 |-- avg(總價元): double (nullable = true)
 |-- avg(總坪數): double (nullable = true)
 |-- avg(unitPriceOrigin): double (nullable = true)
 |-- avg(車位數): double (nullable = true)
 |-- avg(floor): double (nullable = true)
 |-- avg(EightCount): double (nullable = true)
 |-- avg(ParkCount): double (nullable = true)
 |-- avg(FuneralCount): double (nullable = true)
 |-- avg(GasCount): double (nullable = true)
 |-- avg(CrimeCount): double (nullable = true)
 |-- avg(PoliceCount): double (nullable = true)
 |-- avg(busCount): double (nullable = true)
 |-- avg(subwayCount): double (nullable = true)
 |-- avg(govCount): double (nullable = true)
 |-- avg(clinicCount): double (nullable = true)
 |-- avg(hospitalCount): double (nullable = true)
 |-- avg(pharmacyCount): do

In [41]:
updown = updown.select('Area', 'avg(unitPrice)','avg(prediction)','avg(漲跌)')
updown.show()

+------+------------------+-----------------+-------------------+
|  Area|    avg(unitPrice)|  avg(prediction)|          avg(漲跌)|
+------+------------------+-----------------+-------------------+
|南港區| 526077.3259428595|555030.1649174213|-28952.838981944456|
|北投區|472527.72861826356|493762.2551277943|-21234.526502513792|
|內湖區| 523306.6956624151|528032.3343385347| -4725.638680766488|
|萬華區|470096.72368203645|480465.0293231422|-10368.305678571442|
|文山區|470894.20013887255|537407.0058019717| -66512.80569517576|
|松山區| 688876.4291906742|717199.8266238418|-28323.397467711307|
|信義區| 686091.2646814411|676752.0335638351|   9339.23110389612|
|大同區|  552593.240185469|525934.3036345702| 26658.936528421895|
|中正區| 727578.6470990784|728087.6818519549|  -509.034730946867|
|中山區| 636868.5271113252|638104.9373317077|-1236.4102416614692|
|大安區| 816875.8412153312|807816.6971096632|  9059.144127488886|
|士林區| 564129.9154914792|566750.4101115301| -2620.494688226875|
+------+------------------+-----------------+---

In [44]:
%%time
#Local Mode
updown.orderBy('avg(漲跌)').show()

+------+------------------+-----------------+-------------------+
|  Area|    avg(unitPrice)|  avg(prediction)|          avg(漲跌)|
+------+------------------+-----------------+-------------------+
|文山區|470894.20013887255|537407.0058019717| -66512.80569517576|
|南港區| 526077.3259428595|555030.1649174213|-28952.838981944456|
|松山區| 688876.4291906742|717199.8266238418|-28323.397467711307|
|北投區|472527.72861826356|493762.2551277943|-21234.526502513792|
|萬華區|470096.72368203645|480465.0293231422|-10368.305678571442|
|內湖區| 523306.6956624151|528032.3343385347| -4725.638680766488|
|士林區| 564129.9154914792|566750.4101115301| -2620.494688226875|
|中山區| 636868.5271113252|638104.9373317077|-1236.4102416614692|
|中正區| 727578.6470990784|728087.6818519549|  -509.034730946867|
|大安區| 816875.8412153312|807816.6971096632|  9059.144127488886|
|信義區| 686091.2646814411|676752.0335638351|   9339.23110389612|
|大同區|  552593.240185469|525934.3036345702| 26658.936528421895|
+------+------------------+-----------------+---

In [None]:
spark.stop()