In [1]:
import os
import sys

# Configure the environment PySpark reads at start-up: the JDK location and the
# Python interpreter (this kernel's) for both the driver and the workers.
# NOTE(review): JAVA_HOME is a relative path here — confirm it resolves on the target machine.
os.environ.update(
    {
        "JAVA_HOME": "JDK 8/Contents/Home",
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

**Step 1:** Load Data and Initialize Spark Session

In [3]:
# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("MultipleLinearRegression")
    .getOrCreate()
)

25/03/22 13:49:33 WARN Utils: Your hostname, Toms-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.180.174.193 instead (on interface en0)
25/03/22 13:49:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/22 13:49:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


**Step 2:** Data Preparation

+ **Select Relevant Columns:** We will filter only the relevant columns for the regression model.
+ **Handle Missing Values:** We will drop rows with missing values, or fill them in, for the predictor (input) columns.
+ **Assemble Features:** We'll combine predictors into a single features vector using VectorAssembler.

In [None]:
# Locate the fuel-consumption CSV. Set the FUEL_DATA_PATH environment variable
# to point at the file on other machines; when it is unset, the original local
# path is used so existing behaviour is unchanged.
path = os.environ.get(
    "FUEL_DATA_PATH",
    "/Users/tomdursley/Downloads/FuelConsumption.csv",
)

# Read the CSV into a Spark DataFrame: the first row supplies the column names
# and Spark infers each column's type from the data.
Fuel_sparkdf = spark.read.csv(path, header=True, inferSchema=True)

# Preview the first five rows.
Fuel_sparkdf.show(5)

                                                                                

+---------+-----+----------+------------+----------+---------+------------+--------+--------------------+-------------------+--------------------+------------------------+------------+
|MODELYEAR| MAKE|     MODEL|VEHICLECLASS|ENGINESIZE|CYLINDERS|TRANSMISSION|FUELTYPE|FUELCONSUMPTION_CITY|FUELCONSUMPTION_HWY|FUELCONSUMPTION_COMB|FUELCONSUMPTION_COMB_MPG|CO2EMISSIONS|
+---------+-----+----------+------------+----------+---------+------------+--------+--------------------+-------------------+--------------------+------------------------+------------+
|     2014|ACURA|       ILX|     COMPACT|       2.0|        4|         AS5|       Z|                 9.9|                6.7|                 8.5|                      33|         196|
|     2014|ACURA|       ILX|     COMPACT|       2.4|        4|          M6|       Z|                11.2|                7.7|                 9.6|                      29|         221|
|     2014|ACURA|ILX HYBRID|     COMPACT|       1.5|        4|         AV7|

In [5]:
# Keep only the columns used by the regression: the three predictors
# (FUELCONSUMPTION_COMB, ENGINESIZE, CYLINDERS) and the label (CO2EMISSIONS).
Fuel_dfMLR = Fuel_sparkdf.select(
    [
        "FUELCONSUMPTION_COMB",
        "ENGINESIZE",
        "CO2EMISSIONS",
        "CYLINDERS",
    ]
)

# Preview the reduced table.
Fuel_dfMLR.show(5)

+--------------------+----------+------------+---------+
|FUELCONSUMPTION_COMB|ENGINESIZE|CO2EMISSIONS|CYLINDERS|
+--------------------+----------+------------+---------+
|                 8.5|       2.0|         196|        4|
|                 9.6|       2.4|         221|        4|
|                 5.9|       1.5|         136|        4|
|                11.1|       3.5|         255|        6|
|                10.6|       3.5|         244|        6|
+--------------------+----------+------------+---------+
only showing top 5 rows



In [6]:
# Count the missing values in each selected column: flag every null as 1,
# every non-null as 0, then sum the flags per column.
null_count = Fuel_dfMLR.select(
    *(
        F.sum(F.col(column_name).isNull().cast("int")).alias(column_name)
        for column_name in Fuel_dfMLR.columns
    )
)

print("the number of null values is: ")
null_count.show()

the number of null values is: 
+--------------------+----------+------------+---------+
|FUELCONSUMPTION_COMB|ENGINESIZE|CO2EMISSIONS|CYLINDERS|
+--------------------+----------+------------+---------+
|                   0|         0|           0|        0|
+--------------------+----------+------------+---------+



In [None]:
# Pack the three predictor columns into a single vector column named
# "features"; the original columns are kept alongside it.
assembler = VectorAssembler(
    inputCols=["FUELCONSUMPTION_COMB", "ENGINESIZE", "CYLINDERS"],
    outputCol="features",
)
assembled_data = assembler.transform(Fuel_dfMLR)

# Show the new column in full (no truncation of the vector values).
assembled_data.show(5, truncate=False)

+--------------------+----------+------------+---------+--------------+
|FUELCONSUMPTION_COMB|ENGINESIZE|CO2EMISSIONS|CYLINDERS|features      |
+--------------------+----------+------------+---------+--------------+
|8.5                 |2.0       |196         |4        |[8.5,2.0,4.0] |
|9.6                 |2.4       |221         |4        |[9.6,2.4,4.0] |
|5.9                 |1.5       |136         |4        |[5.9,1.5,4.0] |
|11.1                |3.5       |255         |6        |[11.1,3.5,6.0]|
|10.6                |3.5       |244         |6        |[10.6,3.5,6.0]|
+--------------------+----------+------------+---------+--------------+
only showing top 5 rows



**Step 3:** Train-Test Split

In [None]:
# Randomly split the assembled rows into ~80% training and ~20% test data;
# the fixed seed keeps the split repeatable between runs.
train_data, test_data = assembled_data.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

**Step 4:** Train the Multiple Linear Regression Model

In [9]:
# Configure a linear regression that reads the assembled "features" vector
# and predicts the CO2EMISSIONS column.
MLR = LinearRegression(
    featuresCol="features",
    labelCol="CO2EMISSIONS",
)

# Fit the model on the training split only.
MLR_MODEL = MLR.fit(train_data)

25/03/22 13:52:38 WARN Instrumentation: [bf4cfd3c] regParam is zero, which might cause numerical instability and overfitting.
25/03/22 13:52:39 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/03/22 13:52:39 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [10]:
# Report the fitted parameters. The coefficients follow the order of the
# assembler's inputCols: FUELCONSUMPTION_COMB, ENGINESIZE, CYLINDERS.
coefficients = MLR_MODEL.coefficients
intercept = MLR_MODEL.intercept

print("Coefficients: ", coefficients)
print("Intercept: ", intercept)

Coefficients:  [9.583022473094907,11.457425902199605,7.164466396524107]
Intercept:  65.61437182897156


CO2EMISSIONS = 65.6144 + (9.5830 × FUELCONSUMPTION_COMB) + (11.4574 × ENGINESIZE) + (7.1645 × CYLINDERS)

**Step 5:** Make Predictions

In [11]:
# Score the held-out test split; transform() appends a "prediction" column.
MLR_predictions = MLR_MODEL.transform(test_data)

# Compare actual CO2EMISSIONS with the predictions for the first five rows.
MLR_predictions.show(n=5)

+--------------------+----------+------------+---------+-------------+------------------+
|FUELCONSUMPTION_COMB|ENGINESIZE|CO2EMISSIONS|CYLINDERS|     features|        prediction|
+--------------------+----------+------------+---------+-------------+------------------+
|                 4.8|       2.0|         110|        4|[4.8,2.0,4.0]|163.18559709032274|
|                 5.6|       1.8|         129|        4|[5.6,1.8,4.0]|168.56052988835876|
|                 5.9|       1.5|         136|        4|[5.9,1.5,4.0]|167.99820885962737|
|                 6.2|       1.3|         143|        4|[6.2,1.3,4.0]| 168.5816304211159|
|                 6.4|       1.0|         147|        3|[6.4,1.0,3.0]|159.89654074855088|
+--------------------+----------+------------+---------+-------------+------------------+
only showing top 5 rows



**Step 6:** Evaluate the Model

In [12]:
# R² (coefficient of determination) of the predictions on the test split.
MLR_evaluator_r2 = RegressionEvaluator(
    labelCol="CO2EMISSIONS",
    predictionCol="prediction",
    metricName="r2",
)
MLR_r2 = MLR_evaluator_r2.evaluate(MLR_predictions)

print("The Coefficient of Determination is:", MLR_r2)

The Coefficient of Determination is: 0.83913128544792


In [13]:
# Root mean squared error (RMSE) of the predictions on the test split,
# expressed in the same units as CO2EMISSIONS.
MLR_evaluator_rmse = RegressionEvaluator(
    labelCol="CO2EMISSIONS",
    predictionCol="prediction",
    metricName="rmse",
)

MLR_rmse = MLR_evaluator_rmse.evaluate(MLR_predictions)

# Label the value as RMSE; the message previously reused the R² wording.
print("The Root Mean Squared Error is:", MLR_rmse)

The Coefficient of Determination is: 26.853368115055336
