In [8]:
# import pyspark
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('S2P_Linear').getOrCreate()
from pyspark.sql.functions import *

In [9]:
# Importing processed data which has a header. Schema is automatically configured.
df_join = spark.read.csv('Datasets/London_join.csv', header=True, inferSchema=True)

In [10]:
df_join = df_join.withColumn("lg_day_length", log10(col("day_length"))) \
                .withColumn("lg_windspeed", log10(col("windspeed")))

In [7]:
df_join.show()

+----------+------------+-----+---------+------+-----------+---------+----------+------------------+------------------+
|      date|avg_duration|count|feelslike|precip|precipcover|windspeed|day_length|     lg_day_length|      lg_windspeed|
+----------+------------+-----+---------+------+-----------+---------+----------+------------------+------------------+
|2017-01-23|     206.667|    9|      0.6| 0.199|       8.33|      6.4|     8.721|0.9405662864900902|0.8061799739838872|
|2017-01-24|     201.818|   11|      3.3| 0.001|       4.17|     11.5|      8.77|0.9429995933660404|1.0606978403536116|
|2017-01-25|     185.455|   11|      0.0| 0.001|       4.17|     11.3|      8.82|0.9454685851318197|1.0530784434834197|
|2017-01-26|       200.0|   12|     -3.4|   0.0|        0.0|     13.0|     8.871|  0.94797257924578|1.1139433523068367|
|2017-01-27|     189.231|   13|      0.8|   0.0|        0.0|     17.1|     8.923|0.9505108929859966|1.2329961103921538|
|2017-01-30|       195.0|    8|      6.3

**linear regression

In [12]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [25]:
# The input columns are the feature column names, and the output column is what you'd like the new column to be named. 
assembler = VectorAssembler(
    inputCols=["feelslike", "precip", "precipcover",'lg_day_length','lg_windspeed'],
    outputCol="features")
output = assembler.transform(df_join)
all_data = output.select("features",'count')

In [14]:
all_data.show()

+--------------------+-----+
|            features|count|
+--------------------+-----+
|[0.6,0.199,8.33,0...|    9|
|[3.3,0.001,4.17,0...|   11|
|[0.0,0.001,4.17,0...|   11|
|[-3.4,0.0,0.0,0.9...|   12|
|[0.8,0.0,0.0,0.95...|   13|
|[6.3,9.0,4.17,0.9...|    8|
|[4.3,0.601,8.33,0...|    7|
|[10.1,7.99,8.33,0...|    9|
|[10.8,0.211,12.5,...|   12|
|[6.9,0.816,12.5,0...|   12|
|[3.4,0.0,0.0,0.97...|    7|
|[6.4,6.204,8.33,0...|    7|
|[2.8,0.0,0.0,0.98...|   10|
|[-0.9,0.0,0.0,0.9...|   10|
|[-1.4,0.401,16.67...|    7|
|[2.4,0.0,0.0,0.99...|    6|
|[5.5,0.0,0.0,0.99...|    4|
|[8.4,1.01,4.17,1....|    7|
|[7.5,0.2,8.33,1.0...|    9|
|[8.4,0.199,4.17,1...|    7|
+--------------------+-----+
only showing top 20 rows



In [15]:
all_data.count()

709

In [26]:
# randomised split 80/20, 
train_data,test_data = all_data.randomSplit([0.8,0.2])
# check data
train_data.describe().show()
test_data.describe().show()

+-------+------------------+
|summary|             count|
+-------+------------------+
|  count|               587|
|   mean|10.287904599659285|
| stddev|3.4887298124380184|
|    min|                 1|
|    max|                21|
+-------+------------------+

+-------+------------------+
|summary|             count|
+-------+------------------+
|  count|               122|
|   mean|10.024590163934427|
| stddev|3.5593274384520073|
|    min|                 1|
|    max|                21|
+-------+------------------+



In [27]:
# Create Linear model
lr = LinearRegression(labelCol='count')
lrModel = lr.fit(train_data)
# Print the coefficients and intercept for linear regression.
print("Coefficients: {} Intercept: {}".format(lrModel.coefficients,lrModel.intercept))

Coefficients: [0.10894935469332905,-0.1083807653252574,-0.0411606391083717,-0.7741726322925219,-0.5076995104096093] Intercept: 10.806223979015297


In [28]:
# Let's evaluate the model against the test data.
test_results = lrModel.evaluate(test_data)
# Interesting results! This shows the difference between the predicted value and the test data.
test_results.residuals.show()

# Let's get some evaluation metrics (as discussed in the previous linear regression notebook).
print("RSME: {}".format(test_results.rootMeanSquaredError))

+-------------------+
|          residuals|
+-------------------+
| 1.2124684282651401|
| 0.6372197173368015|
|-0.3269303698953223|
| 1.3863320259250997|
| 0.5584226217945485|
| 3.7347842914513674|
|0.43144357330467464|
| 1.7103261637851475|
|-2.5069097879731626|
|-3.5716418678388706|
|0.22469750373898734|
| -2.791807178010705|
| 1.2105954526851939|
|  2.310505404596409|
| 1.2576607043093233|
| -1.767439134890724|
| -2.760176326702947|
|-1.4151242338182435|
| -4.113534054077174|
|-2.5311501638258864|
+-------------------+
only showing top 20 rows

RSME: 3.5145961285233907


In [20]:
# We can also get the R2 value. 
print("R2: {}".format(test_results.r2))

R2: 0.05580228994259573
