In [1]:
# findspark.init() has to run before pyspark is imported: it is what makes the
# local Spark installation importable from this kernel.
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
# getOrCreate() reuses an already-running session if there is one; otherwise a
# new session is started under the application name '8am_Linear'.
spark = SparkSession.builder.appName('8am_Linear').getOrCreate()
# NOTE(review): the wildcard import pulls every pyspark.sql.functions name into
# the notebook namespace and shadows Python builtins such as sum/min/max/round.
# Prefer `from pyspark.sql import functions as F` once the names actually used are known.
from pyspark.sql.functions import *

In [2]:
# Load the pre-processed London 8 AM dataset. The file carries a header row, and
# column types are inferred from the values rather than declared by hand.
df = spark.read.csv(
    'Datasets/London_8AM.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Preview the loaded rows to sanity-check the parsed columns and their values.
df.show()

+----------+---------+------+---------+------------+-----+
|      date|feelslike|precip|windspeed|avg_duration|count|
+----------+---------+------+---------+------------+-----+
|2018-11-01|     10.6|   0.0|      7.1|     827.339| 3431|
|2018-11-02|      0.3|   0.0|      6.0|     848.992| 4137|
|2018-11-05|      7.6|   0.0|      9.9|     904.694| 4678|
|2018-11-06|     11.1|   0.0|      6.9|     875.529| 4943|
|2018-11-07|     12.1|   0.0|     27.0|     852.818| 3637|
|2018-11-08|      7.6|   0.0|     13.1|     881.522| 4796|
|2018-11-09|      7.2|   0.0|     12.5|     865.716| 4272|
|2018-11-12|     11.0|   0.0|     11.2|     938.312| 4371|
|2018-11-13|      6.6|   0.0|     16.1|      890.45| 4894|
|2018-11-14|     11.0|   0.0|     13.5|     869.605| 4858|
|2018-11-15|     12.5|   0.0|      6.6|     881.563| 4927|
|2018-11-16|     10.5|   0.0|      6.7|     845.019| 4124|
|2018-11-19|      5.5|   0.0|     16.3|     873.782| 3403|
|2018-11-20|      1.3|   0.0|     19.8|      829.02| 332

**Linear regression**

In [4]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [6]:
# Spark ML estimators expect all predictors packed into one vector column.
# VectorAssembler combines the listed input columns into a new column named
# "features"; only that vector and the label column ('count') are kept.
feature_columns = ["feelslike", "precip", "windspeed"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
output = assembler.transform(df)
all_data = output.select("features", 'count')

In [7]:
# Inspect the assembled feature vectors alongside the label column.
all_data.show()

+---------------+-----+
|       features|count|
+---------------+-----+
| [10.6,0.0,7.1]| 3431|
|  [0.3,0.0,6.0]| 4137|
|  [7.6,0.0,9.9]| 4678|
| [11.1,0.0,6.9]| 4943|
|[12.1,0.0,27.0]| 3637|
| [7.6,0.0,13.1]| 4796|
| [7.2,0.0,12.5]| 4272|
|[11.0,0.0,11.2]| 4371|
| [6.6,0.0,16.1]| 4894|
|[11.0,0.0,13.5]| 4858|
| [12.5,0.0,6.6]| 4927|
| [10.5,0.0,6.7]| 4124|
| [5.5,0.0,16.3]| 3403|
| [1.3,0.0,19.8]| 3328|
|[-1.1,0.0,10.9]| 3888|
|  [3.1,0.0,4.6]| 3977|
|  [5.1,0.0,7.9]| 3678|
| [4.5,0.0,10.5]| 4236|
|  [2.1,0.0,8.0]| 4506|
|[12.1,0.0,17.7]| 3566|
+---------------+-----+
only showing top 20 rows



In [8]:
# Number of rows available for modelling (this is the whole dataset, before any split).
all_data.count()

22

In [42]:
# Randomised 50/50 split into training and test sets (the full data may be used
# for training again later). The seed is fixed so that re-running the notebook
# reproduces the same split; without it every run yields different rows in each
# half and therefore different coefficients and metrics downstream.
# The weights are sampling probabilities, so the halves are only approximately
# equal in size.
SPLIT_SEED = 42
train_data, test_data = all_data.randomSplit([0.5, 0.5], seed=SPLIT_SEED)

# Summary statistics of the label in each half, to check the split is balanced.
train_data.describe().show()
test_data.describe().show()

+-------+-----------------+
|summary|            count|
+-------+-----------------+
|  count|               11|
|   mean|4147.363636363636|
| stddev|542.8068298625715|
|    min|             3431|
|    max|             4927|
+-------+-----------------+

+-------+-----------------+
|summary|            count|
+-------+-----------------+
|  count|               11|
|   mean|4115.818181818182|
| stddev|601.4555375390302|
|    min|             3328|
|    max|             4943|
+-------+-----------------+



In [43]:
# Configure a linear regression that predicts the 'count' column, then fit it
# on the training split only. The features column is not named here, so the
# estimator's default is used.
lr = LinearRegression(labelCol='count')
lrModel = lr.fit(train_data)

# Report the fitted parameters: one coefficient per assembled feature, plus the intercept.
print(
    "Coefficients: {} Intercept: {}".format(
        lrModel.coefficients,
        lrModel.intercept,
    )
)

Coefficients: [30.54134236103077,0.0,12.14332779657959] Intercept: 3776.702066044853


In [44]:
# Evaluate the train-split model on the held-out test data.
test_results = lrModel.evaluate(test_data)
# Per-row residuals: the gap between the observed label and the model's prediction.
test_results.residuals.show()

# Root mean squared error, in the same units as the label.
print("RMSE: {}".format(test_results.rootMeanSquaredError))
# R2 on held-out data can be negative: that means the model predicts worse than
# simply using the mean of the test labels.
print("R2: {}".format(test_results.r2))

+------------------+
|         residuals|
+------------------+
|278.27556446736025|
| -728.843701486469|
| 568.0144926243456|
|194.35695146642274|
|-739.6156921147694|
|123.60867149848036|
|  548.964786825175|
| 628.1061378761206|
| 743.5000719513064|
|-837.1221591209742|
|-1229.183337347432|
+------------------+

RSME: 673.6409345550331
R2: -0.37988396853450546


In [46]:
# Refit the same estimator configuration, this time on every available row
# (training and test rows together).
lrModel_all = lr.fit(all_data)

# Report the fitted parameters of the all-data model.
print(
    "Coefficients: {} Intercept: {}".format(
        lrModel_all.coefficients,
        lrModel_all.intercept,
    )
)

Coefficients: [38.63790485633872,0.0,-43.57538636865492] Intercept: 4404.454922790508


In [48]:
# Score the all-data model on test_data.
# NOTE: test_data is a subset of all_data, which lrModel_all was fitted on, so
# these are in-sample figures rather than a true held-out evaluation. They will
# look optimistic and are not directly comparable to the metrics of the model
# trained on train_data only.
test_results_all = lrModel_all.evaluate(test_data)
# Per-row residuals: the gap between the observed label and the model's prediction.
test_results_all.residuals.show()

# Root mean squared error, in the same units as the label.
print("RMSE: {}".format(test_results_all.rootMeanSquaredError))
# Coefficient of determination on the same rows.
print("R2: {}".format(test_results_all.r2))

+-------------------+
|          residuals|
+-------------------+
| -17.59397603548041|
|  -263.891549004381|
|  369.0085679604199|
| 115.21606222684386|
|-503.68460169129594|
| 134.04449185203976|
| 411.29332535100093|
|  668.7345617306974|
| 410.33449924785145|
|-58.438139598523776|
|-156.90516757849218|
+-------------------+

RSME: 344.2822874811397
R2: 0.6395749317566863
