In [1]:
# Notebook setup: make PySpark importable, then create the shared SparkSession.
# findspark.init() is called before `import pyspark` on purpose; keep this order.
# (findspark is expected to put the local Spark installation on sys.path — confirm for your environment.)
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
# getOrCreate() hands back an already-running session if one exists, otherwise starts one named '3y_Linear'.
spark = SparkSession.builder.appName('3y_Linear').getOrCreate()
# NOTE(review): wildcard import; the cells visible here only need `col` and `log10`.
# Prefer an explicit import list once every cell that relies on it has been checked.
from pyspark.sql.functions import *

In [13]:
# Load the pre-processed London dataset. The first row of the file is used as
# the header, and column types are inferred by Spark instead of being declared.
london_csv_path = 'Datasets/London_3y.csv'
df_join = spark.read.csv(london_csv_path, header=True, inferSchema=True)

In [14]:
# Derive the model inputs: log10-transformed day length and wind speed, plus the
# regression target reuse_rate = count / REUSE_RATE_DIVISOR.
# NOTE(review): 16000 is presumably the number of units in service (fleet size) — confirm and document.
# NOTE(review): log10 of a non-positive value is expected to come back as null; verify that
# windspeed and day_length are never 0 before these columns are fed to a model.
REUSE_RATE_DIVISOR = 16000
df_join = (
    df_join
    .withColumn("lg_day_length", log10(col("day_length")))
    .withColumn("lg_windspeed", log10(col("windspeed")))
    .withColumn("reuse_rate", col("count") / REUSE_RATE_DIVISOR)
)

In [15]:
# Preview the enriched frame: first 20 rows, long cell values truncated (the show() defaults, spelled out).
df_join.show(n=20, truncate=True)

+----------+------------+-----+---------+------+-----------+---------+----------+------------------+------------------+----------+
|      date|avg_duration|count|feelslike|precip|precipcover|windspeed|day_length|     lg_day_length|      lg_windspeed|reuse_rate|
+----------+------------+-----+---------+------+-----------+---------+----------+------------------+------------------+----------+
|2017-01-23|     817.726|23031|      0.6| 0.199|       8.33|      6.4|     8.721|0.9405662864900902|0.8061799739838872| 1.4394375|
|2017-01-24|     851.122|26299|      3.3| 0.001|       4.17|     11.5|      8.77|0.9429995933660404|1.0606978403536116| 1.6436875|
|2017-01-25|     832.233|24937|      0.0| 0.001|       4.17|     11.3|      8.82|0.9454685851318197|1.0530784434834197| 1.5585625|
|2017-01-26|     803.408|23607|     -3.4|   0.0|        0.0|     13.0|     8.871|  0.94797257924578|1.1139433523068367| 1.4754375|
|2017-01-27|      848.22|23138|      0.8|   0.0|        0.0|     17.1|     8.923|0.

## Linear regression

In [5]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [16]:
# Spark ML estimators expect all predictors packed into one vector column.
# VectorAssembler combines the listed input columns into the column named by outputCol.
feature_columns = ["feelslike", "precip", "precipcover", "lg_day_length", "lg_windspeed"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
output = assembler.transform(df_join)
# Keep only what the regression needs: the feature vector and the target.
all_data = output.select("features", "reuse_rate")

In [17]:
# Preview the modelling frame: first 20 rows, long feature vectors truncated (the show() defaults, spelled out).
all_data.show(n=20, truncate=True)

+--------------------+----------+
|            features|reuse_rate|
+--------------------+----------+
|[0.6,0.199,8.33,0...| 1.4394375|
|[3.3,0.001,4.17,0...| 1.6436875|
|[0.0,0.001,4.17,0...| 1.5585625|
|[-3.4,0.0,0.0,0.9...| 1.4754375|
|[0.8,0.0,0.0,0.95...|  1.446125|
|[6.3,9.0,4.17,0.9...|   1.54575|
|[4.3,0.601,8.33,0...| 1.3989375|
|[10.1,7.99,8.33,0...|  1.429375|
|[10.8,0.211,12.5,...|     1.581|
|[6.9,0.816,12.5,0...|  1.428625|
|[3.4,0.0,0.0,0.97...|   1.49775|
|[6.4,6.204,8.33,0...|   1.67275|
|[2.8,0.0,0.0,0.98...|    1.5865|
|[-0.9,0.0,0.0,0.9...| 1.4136875|
|[-1.4,0.401,16.67...| 1.1536875|
|[2.4,0.0,0.0,0.99...| 1.4888125|
|[5.5,0.0,0.0,0.99...| 1.6415625|
|[8.4,1.01,4.17,1....| 1.5025625|
|[7.5,0.2,8.33,1.0...|  1.642375|
|[8.4,0.199,4.17,1...| 1.5849375|
+--------------------+----------+
only showing top 20 rows



In [8]:
# Total number of rows available for modelling, i.e. the size of the data before the train/test split.
all_data.count()

714

In [18]:
# Randomised 80/20 train/test split.
# A fixed seed is passed so that re-running the notebook yields the same
# partition; without it the split, the fitted coefficients and the reported
# RMSE / R2 all change from one run to the next.
# NOTE(review): reproducibility also assumes the input is read and partitioned
# the same way on every run — confirm if the data source or cluster changes.
SPLIT_SEED = 42
train_data, test_data = all_data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
# Sanity-check both partitions: row counts and summary statistics of the target
# should be broadly similar between train and test.
train_data.describe().show()
test_data.describe().show()

+-------+------------------+
|summary|        reuse_rate|
+-------+------------------+
|  count|               561|
|   mean|1.9549998885917994|
| stddev|0.4333416585300395|
|    min|         0.3890625|
|    max|          2.815375|
+-------+------------------+

+-------+------------------+
|summary|        reuse_rate|
+-------+------------------+
|  count|               153|
|   mean|1.9149031862745103|
| stddev|0.3991835527499089|
|    min|             0.802|
|    max|         2.8114375|
+-------+------------------+



In [19]:
# Fit a linear regression of reuse_rate on the assembled feature vector.
# featuresCol is written out for readability; 'features' is also the estimator's default.
lr = LinearRegression(featuresCol='features', labelCol='reuse_rate')
lrModel = lr.fit(train_data)
# Report the fitted weights and the intercept of the trained model.
fitted_coefficients = lrModel.coefficients
fitted_intercept = lrModel.intercept
print("Coefficients: {} Intercept: {}".format(fitted_coefficients, fitted_intercept))

Coefficients: [0.04654357616540764,-0.02278282081292886,-0.02281701069015454,0.08129121073762202,-0.22586333913577125] Intercept: 1.725049520484063


In [20]:
# Evaluate the fitted model on the held-out test data.
test_results = lrModel.evaluate(test_data)
# Residuals are the per-row differences between the actual reuse_rate and the
# model's prediction; values near zero mean the row was predicted well.
test_results.residuals.show()

# Root mean squared error on the test set, in the same units as reuse_rate.
# (The label previously read "RSME", a misspelling of RMSE.)
print("RMSE: {}".format(test_results.rootMeanSquaredError))

+--------------------+
|           residuals|
+--------------------+
| 0.03398630168192285|
| 0.15249169151798436|
|-0.07426509284068605|
| 0.07435018162793772|
| 0.06153508956758991|
|   0.033190459209957|
|0.024898315165056806|
| 0.19686892429529879|
|-0.04563680938633...|
| -0.1127554871551959|
|  0.2139923073555099|
|-0.18533183237467643|
| 0.05052249356359484|
| 0.10850114280122858|
|  0.2905318251276885|
| 0.10438152813320056|
|  0.2233525661976048|
|   0.285680709517641|
|-0.29085545854661954|
| 0.09746395149772069|
+--------------------+
only showing top 20 rows

RSME: 0.17669776129965517


In [21]:
# Coefficient of determination (R2) of the model on the test-set evaluation.
test_r2 = test_results.r2
print("R2: {}".format(test_r2))

R2: 0.8027737764287031
