In [23]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor


# Create a Spark session
spark = SparkSession.builder \
    .appName("Modeling") \
    .getOrCreate()

24/08/26 07:49:31 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/08/26 07:49:31 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [24]:
# Path to the CSV file
csv_file_path = "../data/curated/merged2.csv"

# Read the CSV file
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)
df.show(5)

+----------+----+-----+---+----+----+---------+------------+----------+-----------+
|      date|hour|count|wnd| tmp| dew|bus_count|subway_count|is_weekday|day_of_week|
+----------+----+-----+---+----+----+---------+------------+----------+-----------+
|2023-07-01|   0|33809|0.0|23.9|13.3|        1|           5|     false|          7|
|2023-07-01|   1|26914|0.0|23.3|13.3|        1|           7|     false|          7|
|2023-07-01|   2|21115|0.0|23.3|12.8|        2|           2|     false|          7|
|2023-07-01|   3|17051|3.1|22.8|12.8|        0|           1|     false|          7|
|2023-07-01|   4|14159|1.5|22.8|11.7|        0|          11|     false|          7|
+----------+----+-----+---+----+----+---------+------------+----------+-----------+
only showing top 5 rows



In [25]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

# Assemble the features into a single vector column named "features"
assembler = VectorAssembler(
    inputCols=["hour", "wnd", "tmp", "bus_count", "subway_count","is_weekday", "day_of_week"],
    outputCol="features"
)

# Define the Lasso regression model
lasso = LinearRegression(featuresCol="features", labelCol="count", elasticNetParam=1.0)

# Create a pipeline with the assembler and the Lasso model
pipeline = Pipeline(stages=[assembler, lasso])

# Fit the model to the data
model = pipeline.fit(df)

# Make predictions on the same DataFrame
predictions = model.transform(df)

# Show the predictions alongside the original data
predictions.select("date", "hour", "count", "prediction").show()

# Optionally, view the model coefficients and intercept
linear_model = model.stages[-1]  # The last stage is the linear regression model
print("Coefficients: " + str(linear_model.coefficients))
print("Intercept: " + str(linear_model.intercept))


24/08/26 07:49:37 WARN Instrumentation: [81ca52af] regParam is zero, which might cause numerical instability and overfitting.


+----------+----+-----+------------------+
|      date|hour|count|        prediction|
+----------+----+-----+------------------+
|2023-07-01|   0|33809|17069.311853624724|
|2023-07-01|   1|26914|17873.481476657846|
|2023-07-01|   2|21115| 19698.05298052082|
|2023-07-01|   3|17051|20960.259688865117|
|2023-07-01|   4|14159|20509.410333931002|
|2023-07-01|   5|11827|21793.663967435095|
|2023-07-01|   6|13210| 23741.98332639597|
|2023-07-01|   7|15708|24245.959629329038|
|2023-07-01|   8|19051| 26316.11233073243|
|2023-07-01|   9|22786|26625.430049987554|
|2023-07-01|  10|24856|27055.266163720684|
|2023-07-01|  11|27103| 28092.24309857514|
|2023-07-01|  12|28176|  29811.1973865615|
|2023-07-01|  13|28834| 30342.02369699465|
|2023-07-01|  14|30289|31188.321943614326|
|2023-07-01|  15|31437| 32633.86798500876|
|2023-07-01|  16|31861| 33484.85818479698|
|2023-07-01|  17|33489| 34598.93521436062|
|2023-07-01|  18|35206| 36026.73445585126|
|2023-07-01|  19|35360|36772.491279080365|
+----------

In [26]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluate the model using different metrics
evaluator = RegressionEvaluator(labelCol="count", predictionCol="prediction")

# Mean Absolute Error (MAE)
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
print(f"Mean Absolute Error (MAE): {mae}")

# Mean Squared Error (MSE)
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
print(f"Mean Squared Error (MSE): {mse}")

# Root Mean Squared Error (RMSE)
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
print(f"Root Mean Squared Error (RMSE): {rmse}")

# R-squared (R²)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
print(f"R-squared (R²): {r2}")


Mean Absolute Error (MAE): 5588.473711190105
Mean Squared Error (MSE): 53945480.08331003
Root Mean Squared Error (RMSE): 7344.7586810806815
R-squared (R²): 0.5141583721799545


In [20]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Assemble the features into a single vector
feature_columns = ['wnd', 'tmp', 'dew', 'atm', 'bus_count', 'subway_count', 'is_weekday', 'day_of_week']
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
df_with_features = assembler.transform(df)

# Initialize Lasso linear regression model
lasso = LinearRegression(labelCol='count', featuresCol='features', regParam=0.1, elasticNetParam=1.0)

# Fit the model
lasso_model = lasso.fit(df_with_features)

# Make predictions
predictions = lasso_model.transform(df_with_features)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol='count', predictionCol='prediction', metricName='r2')
r2 = evaluator.evaluate(predictions)
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})

# Print evaluation metrics
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")


Mean Absolute Error (MAE): 7863.158818957534
Mean Squared Error (MSE): 94467597.39571145
Root Mean Squared Error (RMSE): 9719.444294593774
R-squared (R²): 0.14920969793758843


In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder



# Feature Engineering
# Assemble features into a vector
feature_columns = ['wnd', 'tmp', 'dew', 'atm', 'bus_count', 'subway_count', 'is_weekday', 'day_of_week']
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
df_with_features = assembler.transform(df)

# Optional: Standardize features
scaler = StandardScaler(inputCol='features', outputCol='scaled_features', withStd=True, withMean=True)
scaler_model = scaler.fit(df_with_features)
df_scaled = scaler_model.transform(df_with_features)

# Define and train Lasso Linear Regression model with hyperparameter tuning
lasso = LinearRegression(labelCol='count', featuresCol='scaled_features', elasticNetParam=1.0)  # Lasso with elasticNetParam=1.0

# Create a parameter grid for tuning
paramGrid = ParamGridBuilder().addGrid(lasso.regParam, [0.01, 0.1, 1.0]).build()

# Create a CrossValidator
crossval = CrossValidator(estimator=lasso,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(labelCol='count', predictionCol='prediction'),
                           numFolds=5)

# Run cross-validation, and choose the best set of parameters
cv_model = crossval.fit(df_scaled)
cv_predictions = cv_model.transform(df_scaled)

# Evaluate the Lasso model
cv_evaluator = RegressionEvaluator(labelCol='count', predictionCol='prediction', metricName='r2')
r2 = cv_evaluator.evaluate(cv_predictions)
mae = cv_evaluator.evaluate(cv_predictions, {cv_evaluator.metricName: "mae"})
mse = cv_evaluator.evaluate(cv_predictions, {cv_evaluator.metricName: "mse"})
rmse = cv_evaluator.evaluate(cv_predictions, {cv_evaluator.metricName: "rmse"})

print(f"Lasso Mean Absolute Error (MAE): {mae}")
print(f"Lasso Mean Squared Error (MSE): {mse}")
print(f"Lasso Root Mean Squared Error (RMSE): {rmse}")
print(f"Lasso R-squared (R²): {r2}")

# Alternative Model: Random Forest Regressor
rf = RandomForestRegressor(labelCol='count', featuresCol='scaled_features')

# Train Random Forest model
rf_model = rf.fit(df_scaled)
rf_predictions = rf_model.transform(df_scaled)

# Evaluate Random Forest model
rf_evaluator = RegressionEvaluator(labelCol='count', predictionCol='prediction', metricName='r2')
rf_r2 = rf_evaluator.evaluate(rf_predictions)
rf_mae = rf_evaluator.evaluate(rf_predictions, {rf_evaluator.metricName: "mae"})
rf_mse = rf_evaluator.evaluate(rf_predictions, {rf_evaluator.metricName: "mse"})
rf_rmse = rf_evaluator.evaluate(rf_predictions, {rf_evaluator.metricName: "rmse"})

print(f"Random Forest Mean Absolute Error (MAE): {rf_mae}")
print(f"Random Forest Mean Squared Error (MSE): {rf_mse}")
print(f"Random Forest Root Mean Squared Error (RMSE): {rf_rmse}")
print(f"Random Forest R-squared (R²): {rf_r2}")

# Stop Spark session
spark.stop()


Lasso Mean Absolute Error (MAE): 7863.145888526261
Lasso Mean Squared Error (MSE): 94467597.03789735
Lasso Root Mean Squared Error (RMSE): 9719.444276186645
Lasso R-squared (R²): 0.14920970116011956
Random Forest Mean Absolute Error (MAE): 7454.456779797125
Random Forest Mean Squared Error (MSE): 85782247.53110453
Random Forest Root Mean Squared Error (RMSE): 9261.870628069933
Random Forest R-squared (R²): 0.22743134894320727
