# 1. Import necessary libraries #

In [1]:
from pyspark.sql import SparkSession

# Import Spark ML classes for feature assembly and linear regression

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Import the Spark ML evaluator used to compute regression metrics
from pyspark.ml.evaluation import RegressionEvaluator



# 2. Create spark session and load data into dataframe #

In [2]:
# Obtain the SparkSession for this notebook. getOrCreate() returns the
# already-running session when one exists instead of building a new one.
spark = (
    SparkSession.builder
    .appName("LinearRegressionwithSpark")
    .getOrCreate()
)

24/02/15 01:21:17 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Load the MPG dataset from CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df_mpg = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../dataset/mpg.csv")
)

# Peek at the first five rows to sanity-check the parsed columns.
df_mpg.head(5)

[Row(MPG=15.0, Cylinders=8, Engine Disp=390.0, Horsepower=190, Weight=3850, Accelerate=8.5, Year=70, Origin='American'),
 Row(MPG=21.0, Cylinders=6, Engine Disp=199.0, Horsepower=90, Weight=2648, Accelerate=15.0, Year=70, Origin='American'),
 Row(MPG=18.0, Cylinders=6, Engine Disp=199.0, Horsepower=97, Weight=2774, Accelerate=15.5, Year=70, Origin='American'),
 Row(MPG=16.0, Cylinders=8, Engine Disp=304.0, Horsepower=150, Weight=3433, Accelerate=12.0, Year=70, Origin='American'),
 Row(MPG=14.0, Cylinders=8, Engine Disp=455.0, Horsepower=225, Weight=3086, Accelerate=10.0, Year=70, Origin='American')]

# 3. Identify the label column and the input columns #

In [4]:
# Pack the six predictor columns into a single vector column named "features".
#   Predictors: "Cylinders", "Engine Disp", "Horsepower", "Weight", "Accelerate", "Year"
#   Target:     "MPG" (kept as its own column)
# "Origin" is not part of the input columns, so it is not used as a predictor.
assembler = VectorAssembler(
    inputCols=[
        "Cylinders",
        "Engine Disp",
        "Horsepower",
        "Weight",
        "Accelerate",
        "Year",
    ],
    outputCol="features",
)

# transform() returns a new DataFrame with the "features" column appended.
mpg_transformed_data = assembler.transform(df_mpg)

In [5]:
# Preview the assembled feature vectors next to the target column.
(
    mpg_transformed_data
    .select("features", "MPG")
    .show()
)

+--------------------+----+
|            features| MPG|
+--------------------+----+
|[8.0,390.0,190.0,...|15.0|
|[6.0,199.0,90.0,2...|21.0|
|[6.0,199.0,97.0,2...|18.0|
|[8.0,304.0,150.0,...|16.0|
|[8.0,455.0,225.0,...|14.0|
|[8.0,350.0,165.0,...|15.0|
|[8.0,307.0,130.0,...|18.0|
|[8.0,454.0,220.0,...|14.0|
|[8.0,400.0,150.0,...|15.0|
|[8.0,307.0,200.0,...|10.0|
|[8.0,383.0,170.0,...|15.0|
|[8.0,318.0,210.0,...|11.0|
|[8.0,360.0,215.0,...|10.0|
|[8.0,429.0,198.0,...|15.0|
|[6.0,200.0,85.0,2...|21.0|
|[8.0,302.0,140.0,...|17.0|
|[8.0,304.0,193.0,...| 9.0|
|[8.0,340.0,160.0,...|14.0|
|[6.0,198.0,95.0,2...|22.0|
|[8.0,440.0,215.0,...|14.0|
+--------------------+----+
only showing top 20 rows



# 4. Split dataset #

In [6]:
# Hold out roughly 30% of the rows for testing and train on the remaining ~70%.
# The seed is fixed so the same split is requested on every run.
training_data, testing_data = mpg_transformed_data.randomSplit(
    weights=[0.7, 0.3],
    seed=42,
)

# 5. Build and train Linear Regression model #

In [7]:
# Configure the linear regression estimator: predictors come from "features",
# the label is "MPG". No regularisation parameter is set, so Spark uses its default.
lr = LinearRegression(
    featuresCol="features",
    labelCol="MPG",
)

# Fit the estimator on the training split only; the test split stays unseen.
model = lr.fit(dataset=training_data)

24/02/15 01:21:21 WARN Instrumentation: [3b66a703] regParam is zero, which might cause numerical instability and overfitting.


# 6. Evaluate model # 

In [8]:
# Score the held-out test rows with the fitted model; the predicted values are
# read from the "prediction" column by the evaluators below.
predictions = model.transform(dataset=testing_data)

In [9]:
# Root Mean Squared Error (RMSE): the square root of the average of the squared
# differences between the predicted and actual values (lower is better).
evaluator = RegressionEvaluator(
    labelCol="MPG",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("RMSE =", rmse)

RMSE = 3.4531049690792326


In [10]:
# R-squared (R2): a statistical measure of the proportion of variance in the
# label ("MPG") that is explained by the model's predictions (closer to 1 is better).
evaluator = RegressionEvaluator(
    labelCol="MPG",
    predictionCol="prediction",
    metricName="r2",
)
r2 = evaluator.evaluate(predictions)
print("R Squared =", r2)

R Squared = 0.8046190375720306


In [11]:
# Mean Absolute Error (MAE): the average of the absolute differences between the
# predicted and actual values (lower is better).
evaluator = RegressionEvaluator(
    labelCol="MPG",
    predictionCol="prediction",
    metricName="mae",
)
mae = evaluator.evaluate(predictions)
print("MAE =", mae)

MAE = 2.8423911791950265


# 7. Stop Spark Session #

In [12]:
# Shut down the Spark session now that all metrics have been computed.
spark.stop()