In [1]:
# Session setup. findspark.init() is called before pyspark is imported
# (presumably so the local Spark installation becomes importable — confirm
# for this environment), so keep this statement order.
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
# Get the active session if one exists, otherwise create one named '3y_GBT'.
spark = SparkSession.builder.appName('3y_GBT').getOrCreate()
# Wildcard import: later cells rely on col() and log10() coming from here.
# NOTE(review): this likely also shadows Python builtins such as sum/min/max
# with Spark column functions — consider importing only the names needed.
from pyspark.sql.functions import *

In [2]:
# Load the pre-processed London dataset. The CSV carries a header row, and the
# column types are inferred by Spark instead of being declared by hand.
df_join = spark.read.csv(
    'Datasets/London_3y.csv',
    header=True,
    inferSchema=True,
)

In [32]:
# Add the derived columns used for modelling:
#   lg_day_length / lg_windspeed - base-10 logs of the raw weather columns
#   reuse_rate                   - the per-date count divided by 16000 (the label)
# NOTE(review): 16000 is not explained in this notebook — presumably a
# fleet/capacity size; confirm and consider naming it as a constant.
df_join = (
    df_join
    .withColumn("lg_day_length", log10(col("day_length")))
    .withColumn("lg_windspeed", log10(col("windspeed")))
    .withColumn("reuse_rate", col("count") / 16000)
)

In [33]:
# Preview the data, including the derived lg_* and reuse_rate columns.
df_join.show()

+----------+------------+-----+---------+------+-----------+---------+----------+------------------+------------------+----------+
|      date|avg_duration|count|feelslike|precip|precipcover|windspeed|day_length|     lg_day_length|      lg_windspeed|reuse_rate|
+----------+------------+-----+---------+------+-----------+---------+----------+------------------+------------------+----------+
|2017-01-23|     817.726|23031|      0.6| 0.199|       8.33|      6.4|     8.721|0.9405662864900902|0.8061799739838872| 1.4394375|
|2017-01-24|     851.122|26299|      3.3| 0.001|       4.17|     11.5|      8.77|0.9429995933660404|1.0606978403536116| 1.6436875|
|2017-01-25|     832.233|24937|      0.0| 0.001|       4.17|     11.3|      8.82|0.9454685851318197|1.0530784434834197| 1.5585625|
|2017-01-26|     803.408|23607|     -3.4|   0.0|        0.0|     13.0|     8.871|  0.94797257924578|1.1139433523068367| 1.4754375|
|2017-01-27|      848.22|23138|      0.8|   0.0|        0.0|     17.1|     8.923|0.

**GBT regression**

In [6]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [34]:
# Pack the five predictor columns into one vector column called "features";
# inputCols lists the source columns and outputCol names the new column.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        "feelslike",
        "precip",
        "precipcover",
        "lg_day_length",
        "lg_windspeed",
    ],
)
output = assembler.transform(df_join)
# Keep only what the model needs: the feature vector and the reuse_rate label.
all_data = output.select("features", "reuse_rate")

In [35]:
# Preview the assembled feature vectors next to the reuse_rate label.
all_data.show()

+--------------------+----------+
|            features|reuse_rate|
+--------------------+----------+
|[0.6,0.199,8.33,0...| 1.4394375|
|[3.3,0.001,4.17,0...| 1.6436875|
|[0.0,0.001,4.17,0...| 1.5585625|
|[-3.4,0.0,0.0,0.9...| 1.4754375|
|[0.8,0.0,0.0,0.95...|  1.446125|
|[6.3,9.0,4.17,0.9...|   1.54575|
|[4.3,0.601,8.33,0...| 1.3989375|
|[10.1,7.99,8.33,0...|  1.429375|
|[10.8,0.211,12.5,...|     1.581|
|[6.9,0.816,12.5,0...|  1.428625|
|[3.4,0.0,0.0,0.97...|   1.49775|
|[6.4,6.204,8.33,0...|   1.67275|
|[2.8,0.0,0.0,0.98...|    1.5865|
|[-0.9,0.0,0.0,0.9...| 1.4136875|
|[-1.4,0.401,16.67...| 1.1536875|
|[2.4,0.0,0.0,0.99...| 1.4888125|
|[5.5,0.0,0.0,0.99...| 1.6415625|
|[8.4,1.01,4.17,1....| 1.5025625|
|[7.5,0.2,8.33,1.0...|  1.642375|
|[8.4,0.199,4.17,1...| 1.5849375|
+--------------------+----------+
only showing top 20 rows



In [12]:
# Number of rows available for modelling, before the train/test split.
all_data.count()

714

In [36]:
# Randomised 80/20 train/test split.
# The seed is fixed so that the split — and therefore every metric computed
# from it — is reproducible across kernel restarts. 42 matches the seed already
# used for the GBT model and the cross-validator in this notebook.
train_data, test_data = all_data.randomSplit([0.8, 0.2], seed=42)

# Sanity-check the label distribution in each partition.
train_data.describe().show()
test_data.describe().show()

+-------+-------------------+
|summary|         reuse_rate|
+-------+-------------------+
|  count|                577|
|   mean|  1.937683817157713|
| stddev|0.42791477130391914|
|    min|          0.3890625|
|    max|          2.8114375|
+-------+-------------------+

+-------+------------------+
|summary|        reuse_rate|
+-------+------------------+
|  count|               137|
|   mean|1.9831500912408766|
| stddev|0.4189698680944552|
|    min|         0.4859375|
|    max|          2.815375|
+-------+------------------+



In [37]:
# Baseline gradient-boosted trees regressor: only the column names and the
# seed are set explicitly, every other hyperparameter is left at its default.
gbt = GBTRegressor(
    labelCol='reuse_rate',
    featuresCol='features',
    seed=42,
)

# Fit on the training partition only; test_data stays held out for evaluation.
gbtModel = gbt.fit(train_data)


In [39]:
# Score the held-out test partition with the baseline model.
predictions = gbtModel.transform(test_data)

# One evaluator per metric, both comparing against the 'reuse_rate' label.
# mae_eval is reused later for the cross-validated model.
rmse_eval, mae_eval = (
    RegressionEvaluator(labelCol='reuse_rate', metricName=metric_name)
    for metric_name in ('rmse', 'mae')
)

rmse, mae = rmse_eval.evaluate(predictions), mae_eval.evaluate(predictions)

print("RMSE: {:.2f}".format(rmse))
print("MAE: {:.2f}".format(mae))

RMSE: 0.20
MAE: 0.15


In [42]:
# Inspect test-set labels side by side with the baseline model's predictions.
predictions.show()

+--------------------+----------+------------------+
|            features|reuse_rate|        prediction|
+--------------------+----------+------------------+
|[-3.2,0.605,8.33,...|   1.26225|1.5808476229678567|
|[-2.6,6.38,16.67,...|  0.794375| 1.405889499265029|
|[-1.9,0.199,4.17,...|   1.44975|1.4629250287033624|
|[-1.3,0.0,0.0,0.9...| 1.5033125|1.4954840198405017|
|[-1.1,0.0,0.0,1.0...| 1.4120625| 1.394880881621903|
|[-0.9,0.0,0.0,0.9...| 1.4136875|1.5161508976800702|
|[-0.3,0.0,0.0,0.9...|   1.59775| 1.472921068803218|
|[-0.3,0.0,0.0,0.9...|  1.515375|1.4580459388465605|
|[-0.2,1.995,12.5,...| 1.5729375| 1.444946460142404|
|[1.0,0.796,8.33,0...| 1.5058125|1.3278126445019733|
|[2.2,0.199,4.17,0...| 1.7339375|1.6185928898117032|
|[2.8,0.0,0.0,0.98...|    1.5865|1.6145273001827798|
|[3.1,3.997,8.33,0...| 1.6931875| 1.443325872088565|
|[3.1,5.006,16.67,...| 1.2370625|1.4157654429894178|
|[3.2,1.996,4.17,0...|   1.69125|1.3454466583566262|
|[3.3,0.001,4.17,0...|  1.814875|1.62180807392

In [17]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import *
from pyspark.sql.types import *

**Hyperparameter Tuning and Model Selection**

In Gradient Boosting, the main hyperparameters are the number of trees, the learning rate, and the maximum depth of each tree. We can use cross-validation and grid search to find the best hyperparameters.

In [40]:
# Metric used to rank candidates during cross-validation (and to score the
# final model on the test set): RMSE against the 'reuse_rate' label.
evaluator = RegressionEvaluator(metricName='rmse', labelCol='reuse_rate')

# Hyperparameter grid: 3 depths x 3 iteration counts x 2 step sizes
# = 18 candidate settings for the `gbt` estimator.
param_grid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [2, 4, 6])
    .addGrid(gbt.maxIter, [10, 50, 100])
    .addGrid(gbt.stepSize, [0.1, 0.01])
    .build()
)

# 5-fold cross-validation over the grid, seeded for repeatable fold assignment.
crossval = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)

# Search on the training partition only.
cv_model = crossval.fit(train_data)

# Score the held-out test partition with the cross-validated model.
cv_predictions = cv_model.transform(test_data)

cv_rmse = evaluator.evaluate(cv_predictions)
print("CV RMSE: {:.2f}".format(cv_rmse))

CV RMSE: 0.18


In [41]:
# MAE of the cross-validated model on the test set, using the same mae_eval
# evaluator that scored the baseline model so the two numbers are comparable.
cv_mae = mae_eval.evaluate(cv_predictions)
print("CV MAE: {:.2f}".format(cv_mae))

CV MAE: 0.15


In [29]:
# Inspect test-set labels side by side with the cross-validated predictions.
# NOTE(review): the saved output for this cell shows a raw `count` label rather
# than `reuse_rate`, so it appears to come from an earlier run than the model
# fitted above — re-run the notebook top to bottom to refresh it.
cv_predictions.show()

+--------------------+-----+------------------+
|            features|count|        prediction|
+--------------------+-----+------------------+
|[-3.4,0.0,0.0,0.9...|23607| 23410.06621950035|
|[-1.6,0.0,0.0,0.9...|25600|22309.128984633393|
|[-1.1,0.0,0.0,1.0...|22593|19447.933487223792|
|[0.0,0.001,4.17,0...|24937|25755.448353089992|
|[0.8,0.002,4.17,1...|25686|24400.436385610596|
|[1.5,5.984,8.33,0...|20721|22541.583977819195|
|[1.8,0.0,0.0,0.89...|23763|26542.366886791933|
|[2.8,0.0,0.0,0.98...|25384| 27521.75800419014|
|[2.9,0.0,0.0,0.93...|28392| 27455.81374323029|
|[2.9,6.013,8.33,0...|19501|22161.254822247927|
|[3.4,2.401,8.33,0...|23812| 22279.15576935453|
|[3.6,1.0,8.33,1.1...|28083|23425.577601192206|
|[3.8,0.0,0.0,1.02...|26383| 28157.80480433735|
|[3.8,0.051,4.17,0...|29152|27196.161209085585|
|[4.2,0.202,4.17,0...|17772| 26364.18870716653|
|[4.6,0.8,8.33,0.9...|26776|21934.347327636067|
|[4.6,0.804,8.33,1...|26775| 23026.56925040686|
|[4.9,0.199,4.17,0...|25362|26550.408719

In [28]:
# Feature importance scores of the baseline model (gbtModel, fitted with
# default hyperparameters). Note this is not the cross-validated cv_model
# that is used for the Auckland predictions.
importances = gbtModel.featureImportances

# Take the feature names from the assembler that built the vectors instead of
# re-typing the list, so names and vector positions cannot drift apart.
features = assembler.getInputCols()

# zip() would silently truncate on a mismatch; fail loudly instead.
assert len(features) == len(importances), (
    "feature names and importance vector differ in length"
)

# Print the feature importance scores, one "name score" pair per line.
for position, feature in enumerate(features):
    importance = importances[position]
    print(feature, "{:.4f}".format(importance))

feelslike 0.4033
precip 0.2416
precipcover 0.0529
lg_day_length 0.1623
lg_windspeed 0.1398


# Step 8: Predict the reuse rate for Auckland weather data

In [43]:
# Load the Auckland weather dataset (header row present, column types inferred).
df_akl = spark.read.csv(
    'Datasets/Auckland_weather.csv',
    header=True,
    inferSchema=True,
)

In [51]:
# Apply the same log10 transforms that were applied to the London data, so the
# Auckland rows expose every column the shared `assembler` expects.
df_akl = (
    df_akl
    .withColumn("lg_day_length", log10(col("day_length")))
    .withColumn("lg_windspeed", log10(col("windspeed")))
)

# Reuse the London assembler and keep only the resulting feature vector;
# no label column is selected here.
akl_data = (
    assembler
    .transform(df_akl)
    .select("features")
)


In [52]:
# Predict the reuse rate for the Auckland weather data with the
# cross-validated model, then preview the predictions.
akl_predict = cv_model.transform(akl_data)
akl_predict.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[19.9,0.0,0.0,1.1...|2.2462594796465902|
|[21.7,0.0,0.0,1.1...|2.3676553698113074|
|[21.0,0.0,0.0,1.1...| 2.385366416668374|
|[22.2,0.0,0.0,1.1...|2.4118573012833533|
|[21.2,0.008,4.17,...| 2.334764741124993|
|[21.5,0.0,0.0,1.1...|2.3676553698113074|
|[21.0,0.116,12.5,...|2.2446821026870216|
|[21.4,0.025,8.33,...| 2.168558368074246|
|[22.2,0.0,0.0,1.1...|2.2467896566980707|
|[22.2,0.0,0.0,1.1...|2.3628263485947874|
|[22.4,0.0,0.0,1.1...|2.3186244171227415|
|[21.6,0.616,8.33,...|2.0084027611228725|
|[21.5,0.0,0.0,1.1...|2.1910802755355094|
|[20.2,0.041,8.33,...|2.3672699320101174|
|[20.6,0.108,12.5,...|2.3672699320101174|
|[20.5,0.0,0.0,1.1...|2.2529379081205856|
|[20.5,0.0,0.0,1.1...|2.1972285269580243|
|[21.5,0.0,0.0,1.1...|2.3186244171227415|
|[21.9,0.0,0.0,1.1...|2.3186244171227415|
|[20.0,0.192,4.17,...|2.3605558461823337|
+--------------------+------------

In [111]:
# Not used in this cell; kept in case later cells rely on `plt`.
import matplotlib.pyplot as plt

# Summary statistics of the predicted Auckland reuse rate and of the observed
# London reuse rate. Both describe() frames key their rows by a `summary`
# column (count / mean / stddev / min / max).
df_AKL_stat = akl_predict.describe()
df_LND_stat = all_data.describe()

# Join on the shared `summary` key so it appears once in the result (no
# rename/drop needed), drop the row-count line because the two datasets have
# different sizes, and label each statistic column by city.
# NOTE(review): describe() reports its values as strings in PySpark, so cast
# to numeric before plotting — confirm.
df_compare = (
    df_AKL_stat
    .join(df_LND_stat, on="summary", how="inner")
    .where(col("summary") != "count")
    .withColumnRenamed("prediction", "Auckland")
    .withColumnRenamed("reuse_rate", "London")
)
df_compare.toPandas()

Unnamed: 0,summary,Auckland,London
0,mean,1.978110427860641,1.9464077380952371
1,stddev,0.2706933482941951,0.4262983491379373
2,min,1.3563069523671416,0.3890625
3,max,2.4734609160919554,2.815375
