In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor


# Create a Spark session
spark = SparkSession.builder \
    .appName("Modeling") \
    .getOrCreate()


24/08/26 14:31:16 WARN Utils: Your hostname, MacBook-Pro-7.local resolves to a loopback address: 127.0.0.1; using 10.95.1.206 instead (on interface en0)
24/08/26 14:31:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/26 14:31:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/26 14:31:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/08/26 14:31:17 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/08/26 14:31:17 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
24/08/26 14:31:17 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
24/08/26 14:31:17 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting po

In [2]:
# Path to the CSV file
csv_file_path = "../data/curated/merged.csv"

# Read the CSV file
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)
df.show(5)


                                                                                

+-----------+-----------+------------+-----+---+----+----+------+-------------+----------+-----------+--------+
|pickup_date|pickup_hour|pulocationid|count|wnd| tmp| dew|   atm|service_count|is_weekday|day_of_week| borough|
+-----------+-----------+------------+-----+---+----+----+------+-------------+----------+-----------+--------+
| 2023-11-13|         15|          26|  224|3.1| 6.7|-3.3|1027.2|           14|      true|          2|Brooklyn|
| 2023-11-12|          4|          29|   14|2.1| 5.6|-1.7|1030.9|           11|      true|          1|Brooklyn|
| 2023-11-09|          0|          29|   14|2.1| 7.8|-4.4|1018.9|           29|      true|          5|Brooklyn|
| 2023-11-12|          0|          29|   43|2.1| 7.8|-2.2|1029.7|            6|      true|          1|Brooklyn|
| 2023-11-10|         13|          29|   56|2.1|10.6| 0.0|1020.2|           12|     false|          6|Brooklyn|
+-----------+-----------+------------+-----+---+----+----+------+-------------+----------+-----------+--

## Linear Regression

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

# Convert categorical column to numerical if needed
if 'pulocationid_indexed' not in df.columns:
    indexer = StringIndexer(inputCol="pulocationid", outputCol="pulocationid_indexed")
    df = indexer.fit(df).transform(df)

# Define feature columns
feature_columns = ["pickup_hour", "pulocationid_indexed", "wnd", "tmp", "dew", "atm", "service_count"]

# Assemble features into a vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df)

# Rename 'count' column to 'label'
df = df.withColumnRenamed("count", "label")

# List of columns to check for missing values
columns_to_check = ["pickup_hour", "pulocationid", "wnd", "tmp", "dew", "atm", "service_count", "label"]

# Check for missing values in relevant columns
for column in columns_to_check:
    if column in df.columns:
        missing_count = df.filter(col(column).isNull()).count()
        if missing_count > 0:
            print(f"Column {column} has {missing_count} missing values.")
    else:
        print(f"Column {column} does not exist in the DataFrame.")

# List to store results
results = []

# Perform linear regression for each borough
for borough in df.select("borough").distinct().rdd.flatMap(lambda x: x).collect():
    print(f"Processing borough: {borough}")
    
    # Filter DataFrame for the current borough
    df_borough = df.filter(df.borough == borough)
    
    # Split the data into training and test sets
    train_df, test_df = df_borough.randomSplit([0.8, 0.2], seed=42)
    
    # Initialize the Linear Regression model
    lr = LinearRegression(featuresCol="features", labelCol="label")
    
    # Train the model
    try:
        lr_model = lr.fit(train_df)
        
        # Make predictions
        predictions = lr_model.transform(test_df)
        
        # Evaluate the model
        evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
        rmse = evaluator.evaluate(predictions)
        
        # Store results
        results.append((borough, rmse))
    except Exception as e:
        print(f"Error processing borough {borough}: {e}")

# Print results
for borough, rmse in results:
    print(f"Borough: {borough}, RMSE: {rmse}")


24/08/26 14:31:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
24/08/26 14:31:28 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


Processing borough: Queens


24/08/26 14:31:29 WARN Instrumentation: [69d2236b] regParam is zero, which might cause numerical instability and overfitting.
24/08/26 14:31:30 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/08/26 14:31:30 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/08/26 14:31:31 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

Processing borough: EWR


24/08/26 14:31:33 WARN Instrumentation: [98ac58ce] regParam is zero, which might cause numerical instability and overfitting.
24/08/26 14:31:33 WARN Instrumentation: [98ac58ce] The standard deviation of the label is zero, so the coefficients will be zeros and the intercept will be the mean of the label; as a result, training is not needed.


Processing borough: Brooklyn


24/08/26 14:31:34 WARN Instrumentation: [a7fba458] regParam is zero, which might cause numerical instability and overfitting.


Processing borough: Staten Island


24/08/26 14:31:36 WARN Instrumentation: [7b2730af] regParam is zero, which might cause numerical instability and overfitting.


Processing borough: Manhattan


24/08/26 14:31:38 WARN Instrumentation: [f654f5cc] regParam is zero, which might cause numerical instability and overfitting.


Processing borough: Bronx


24/08/26 14:31:39 WARN Instrumentation: [3e8502e7] regParam is zero, which might cause numerical instability and overfitting.


Processing borough: None
Error processing borough None: An error occurred while calling o872.fit.
: java.lang.AssertionError: assertion failed: Training dataset is empty.
	at scala.Predef$.assert(Predef.scala:223)
	at org.apache.spark.ml.optim.WeightedLeastSquares$Aggregator.validate(WeightedLeastSquares.scala:425)
	at org.apache.spark.ml.optim.WeightedLeastSquares.fit(WeightedLeastSquares.scala:108)
	at org.apache.spark.ml.regression.LinearRegression.trainWithNormal(LinearRegression.scala:456)
	at org.apache.spark.ml.regression.LinearRegression.$anonfun$train$1(LinearRegression.scala:354)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:329)
	at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:186)
	at org

24/08/26 14:31:40 WARN Instrumentation: [be083044] regParam is zero, which might cause numerical instability and overfitting.
24/08/26 14:31:40 ERROR Instrumentation: java.lang.AssertionError: assertion failed: Training dataset is empty.
	at scala.Predef$.assert(Predef.scala:223)
	at org.apache.spark.ml.optim.WeightedLeastSquares$Aggregator.validate(WeightedLeastSquares.scala:425)
	at org.apache.spark.ml.optim.WeightedLeastSquares.fit(WeightedLeastSquares.scala:108)
	at org.apache.spark.ml.regression.LinearRegression.trainWithNormal(LinearRegression.scala:456)
	at org.apache.spark.ml.regression.LinearRegression.$anonfun$train$1(LinearRegression.scala:354)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:329)
	at org.apache.spark.ml.re

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, lit


# Assume df is already initialized
# Define the new DataFrame as lr_results
lr_results = df

# Convert categorical column to numerical if needed
if 'pulocationid_indexed' not in lr_results.columns:
    indexer = StringIndexer(inputCol="pulocationid", outputCol="pulocationid_indexed")
    lr_results = indexer.fit(lr_results).transform(lr_results)

# Define feature columns
feature_columns = ["pickup_hour", "pulocationid_indexed", "wnd", "tmp", "dew", "atm", "service_count"]

# Check if 'features' column already exists and drop it if necessary
if 'features' in lr_results.columns:
    lr_results = lr_results.drop("features")

# Assemble features into a vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
lr_results = assembler.transform(lr_results)

# Rename 'count' column to 'label'
lr_results = lr_results.withColumnRenamed("count", "label")

# List of columns to check for missing values
columns_to_check = ["pickup_hour", "pulocationid", "wnd", "tmp", "dew", "atm", "service_count", "label"]

# Check for missing values in relevant columns
for column in columns_to_check:
    if column in lr_results.columns:
        missing_count = lr_results.filter(col(column).isNull()).count()
        if missing_count > 0:
            print(f"Column {column} has {missing_count} missing values.")
    else:
        print(f"Column {column} does not exist in the DataFrame.")

# List to store DataFrames with actual and predicted values
predictions_list = []

# Perform linear regression for each borough
for borough in lr_results.select("borough").distinct().rdd.flatMap(lambda x: x).collect():
    print(f"Processing borough: {borough}")
    
    # Filter DataFrame for the current borough
    df_borough = lr_results.filter(lr_results.borough == borough)
    
    # Split the data into training and test sets
    train_df, test_df = df_borough.randomSplit([0.8, 0.2], seed=42)
    
    # Initialize the Linear Regression model
    lr = LinearRegression(featuresCol="features", labelCol="label")
    
    # Train the model
    try:
        lr_model = lr.fit(train_df)
        
        # Make predictions
        predictions = lr_model.transform(test_df)
        
        # Evaluate the model
        evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
        rmse = evaluator.evaluate(predictions)
        
        # Collect actual and predicted values
        predictions_df = predictions.select(col("pickup_date"), col("label").alias("actual"), col("prediction").alias("predicted"))
        
        # Add a column for the borough
        predictions_df = predictions_df.withColumn("borough", lit(borough))
        
        # Append to the list
        predictions_list.append(predictions_df)
        
        # Store RMSE results
        results.append((borough, rmse))
        
    except Exception as e:
        print(f"Error processing borough {borough}: {e}")

# Combine all the DataFrames in the list into one DataFrame
if predictions_list:
    lr_results = predictions_list[0]
    for df in predictions_list[1:]:
        lr_results = lr_results.union(df)

    # Show the combined DataFrame
    lr_results.show()
else:
    print("No predictions were made.")

# Print RMSE results
for borough, rmse in results:
    print(f"Borough: {borough}, RMSE: {rmse}")


Processing borough: Queens


24/08/26 14:32:08 WARN Instrumentation: [5c3199b9] regParam is zero, which might cause numerical instability and overfitting.
24/08/26 14:32:10 WARN Instrumentation: [35a24622] regParam is zero, which might cause numerical instability and overfitting.


Processing borough: EWR


24/08/26 14:32:11 WARN Instrumentation: [35a24622] The standard deviation of the label is zero, so the coefficients will be zeros and the intercept will be the mean of the label; as a result, training is not needed.


Processing borough: Brooklyn


24/08/26 14:32:12 WARN Instrumentation: [aadcf887] regParam is zero, which might cause numerical instability and overfitting.


Processing borough: Staten Island


24/08/26 14:32:13 WARN Instrumentation: [bdeeb8af] regParam is zero, which might cause numerical instability and overfitting.


Processing borough: Manhattan


24/08/26 14:32:14 WARN Instrumentation: [945cd7a3] regParam is zero, which might cause numerical instability and overfitting.


Processing borough: Bronx


24/08/26 14:32:16 WARN Instrumentation: [5a2f0a69] regParam is zero, which might cause numerical instability and overfitting.


Processing borough: None
Error processing borough None: An error occurred while calling o1772.fit.
: java.lang.AssertionError: assertion failed: Training dataset is empty.
	at scala.Predef$.assert(Predef.scala:223)
	at org.apache.spark.ml.optim.WeightedLeastSquares$Aggregator.validate(WeightedLeastSquares.scala:425)
	at org.apache.spark.ml.optim.WeightedLeastSquares.fit(WeightedLeastSquares.scala:108)
	at org.apache.spark.ml.regression.LinearRegression.trainWithNormal(LinearRegression.scala:456)
	at org.apache.spark.ml.regression.LinearRegression.$anonfun$train$1(LinearRegression.scala:354)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:329)
	at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:186)
	at or

24/08/26 14:32:17 WARN Instrumentation: [a17fd765] regParam is zero, which might cause numerical instability and overfitting.
24/08/26 14:32:17 ERROR Instrumentation: java.lang.AssertionError: assertion failed: Training dataset is empty.
	at scala.Predef$.assert(Predef.scala:223)
	at org.apache.spark.ml.optim.WeightedLeastSquares$Aggregator.validate(WeightedLeastSquares.scala:425)
	at org.apache.spark.ml.optim.WeightedLeastSquares.fit(WeightedLeastSquares.scala:108)
	at org.apache.spark.ml.regression.LinearRegression.trainWithNormal(LinearRegression.scala:456)
	at org.apache.spark.ml.regression.LinearRegression.$anonfun$train$1(LinearRegression.scala:354)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:329)
	at org.apache.spark.ml.re

+-----------+------+------------------+-------+
|pickup_date|actual|         predicted|borough|
+-----------+------+------------------+-------+
| 2023-08-01|    23| 42.22920369072207| Queens|
| 2023-08-01|    25|  66.3013031269088| Queens|
| 2023-08-01|    33| 62.21764340112712| Queens|
| 2023-08-01|    37| 43.73370990548375| Queens|
| 2023-08-01|    20| 66.11517065502188| Queens|
| 2023-08-01|    21|  58.8075690404652| Queens|
| 2023-08-01|    58|  44.2088110489309| Queens|
| 2023-08-01|    50| 75.58851209967432| Queens|
| 2023-08-01|   102|47.547367060276294| Queens|
| 2023-08-01|    36|  83.0107278368014| Queens|
| 2023-08-01|     5|37.230753016196275| Queens|
| 2023-08-01|    38| 66.46115947442301| Queens|
| 2023-08-01|    10|30.997798697897917| Queens|
| 2023-08-01|    35| 50.72912793566094| Queens|
| 2023-08-01|    30| 72.22207386082766| Queens|
| 2023-08-01|    56| 89.36770795907557| Queens|
| 2023-08-01|    53| 92.03792317320514| Queens|
| 2023-08-01|    90| 99.73818781792677| 

In [13]:
# Get the coefficients and intercept
coefficients = lr_model.coefficients
intercept = lr_model.intercept

    # Print the coefficients and intercept
print(f"Coefficients: {coefficients}")
print(f"Intercept: {intercept}")

    # You can also print the coefficients for individual features if needed
feature_names = feature_columns
for feature, coef in zip(feature_names, coefficients):
    print(f"Feature: {feature}, Coefficient: {coef}")

Coefficients: [2.539705004182333,-0.3215634246083111,-0.06258068730401058,-0.625553071735154,0.32995540446423444,0.09363428064445994,-0.4618547604978454]
Intercept: 0.9504362842460911
Feature: pickup_hour, Coefficient: 2.539705004182333
Feature: pulocationid_indexed, Coefficient: -0.3215634246083111
Feature: wnd, Coefficient: -0.06258068730401058
Feature: tmp, Coefficient: -0.625553071735154
Feature: dew, Coefficient: 0.32995540446423444
Feature: atm, Coefficient: 0.09363428064445994
Feature: service_count, Coefficient: -0.4618547604978454


## Random Forest

In [9]:
# Path to the CSV file
csv_file_path = "../data/curated/merged.csv"

# Read the CSV file
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)
df.show(5)

+-----------+-----------+------------+-----+---+----+----+------+-------------+----------+-----------+--------+
|pickup_date|pickup_hour|pulocationid|count|wnd| tmp| dew|   atm|service_count|is_weekday|day_of_week| borough|
+-----------+-----------+------------+-----+---+----+----+------+-------------+----------+-----------+--------+
| 2023-11-13|         15|          26|  224|3.1| 6.7|-3.3|1027.2|           14|      true|          2|Brooklyn|
| 2023-11-12|          4|          29|   14|2.1| 5.6|-1.7|1030.9|           11|      true|          1|Brooklyn|
| 2023-11-09|          0|          29|   14|2.1| 7.8|-4.4|1018.9|           29|      true|          5|Brooklyn|
| 2023-11-12|          0|          29|   43|2.1| 7.8|-2.2|1029.7|            6|      true|          1|Brooklyn|
| 2023-11-10|         13|          29|   56|2.1|10.6| 0.0|1020.2|           12|     false|          6|Brooklyn|
+-----------+-----------+------------+-----+---+----+----+------+-------------+----------+-----------+--

In [10]:
from pyspark.ml.feature import StringIndexer

# Create `pulocationid_indexed` if it does not exist
if 'pulocationid_indexed' not in df.columns:
    indexer = StringIndexer(inputCol="pulocationid", outputCol="pulocationid_indexed")
    df = indexer.fit(df).transform(df)

# Verify the new column
df.printSchema()
df.select("pulocationid", "pulocationid_indexed").show(5)


root
 |-- pickup_date: date (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- pulocationid: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- wnd: double (nullable = true)
 |-- tmp: double (nullable = true)
 |-- dew: double (nullable = true)
 |-- atm: double (nullable = true)
 |-- service_count: integer (nullable = true)
 |-- is_weekday: boolean (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- borough: string (nullable = true)
 |-- pulocationid_indexed: double (nullable = false)

+------------+--------------------+
|pulocationid|pulocationid_indexed|
+------------+--------------------+
|          26|               131.0|
|          29|               137.0|
|          29|               137.0|
|          29|               137.0|
|          29|               137.0|
+------------+--------------------+
only showing top 5 rows



In [11]:
from pyspark.ml.feature import VectorAssembler

# Define the feature columns
feature_columns = ["pickup_hour", "pulocationid_indexed", "wnd", "tmp", "dew", "atm", "service_count"]

# Create the VectorAssembler
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Transform the DataFrame to include the 'features' column
df = assembler.transform(df)

# Verify the addition of the 'features' column
df.printSchema()
df.select("features").show(5, truncate=False)



root
 |-- pickup_date: date (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- pulocationid: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- wnd: double (nullable = true)
 |-- tmp: double (nullable = true)
 |-- dew: double (nullable = true)
 |-- atm: double (nullable = true)
 |-- service_count: integer (nullable = true)
 |-- is_weekday: boolean (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- borough: string (nullable = true)
 |-- pulocationid_indexed: double (nullable = false)
 |-- features: vector (nullable = true)

+-------------------------------------+
|features                             |
+-------------------------------------+
|[15.0,131.0,3.1,6.7,-3.3,1027.2,14.0]|
|[4.0,137.0,2.1,5.6,-1.7,1030.9,11.0] |
|[0.0,137.0,2.1,7.8,-4.4,1018.9,29.0] |
|[0.0,137.0,2.1,7.8,-2.2,1029.7,6.0]  |
|[13.0,137.0,2.1,10.6,0.0,1020.2,12.0]|
+-------------------------------------+
only showing top 5 rows



In [12]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, lit

# Rename the DataFrame to rf_df
rf_df = df

# Rename 'count' column to 'label'
rf_df = rf_df.withColumnRenamed("count", "label")

# List to store results
results_rf = []

# List to store DataFrames with actual and predicted values
predictions_list_rf = []

# Perform Random Forest Regression for each borough
for borough in rf_df.select("borough").distinct().rdd.flatMap(lambda x: x).collect():
    print(f"Processing borough: {borough}")

    # Filter DataFrame for the current borough
    df_borough = rf_df.filter(rf_df.borough == borough)

    # Check if 'features' and 'label' columns exist
    if 'features' not in df_borough.columns or 'label' not in df_borough.columns:
        print(f"Skipping borough {borough}: required columns are missing.")
        continue

    # Check if DataFrame is empty
    if df_borough.count() == 0:
        print(f"Skipping borough {borough}: DataFrame is empty.")
        continue

    # Split the data into training and test sets
    train_df, test_df = df_borough.randomSplit([0.8, 0.2], seed=42)

    # Initialize the Random Forest model with increased maxBins
    rf = RandomForestRegressor(featuresCol="features", labelCol="label", maxBins=500)

    # Train the model
    try:
        rf_model = rf.fit(train_df)

        # Make predictions
        predictions = rf_model.transform(test_df)

        # Evaluate the model
        evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
        rmse = evaluator.evaluate(predictions)

        # Collect actual and predicted values
        predictions_df = predictions.select(col("pickup_date"), col("label").alias("actual"), col("prediction").alias("predicted"))
        
        # Add a column for the borough
        predictions_df = predictions_df.withColumn("borough", lit(borough))
        
        # Append to the list
        predictions_list_rf.append(predictions_df)
        
        # Store RMSE results
        results_rf.append((borough, rmse))
        
    except Exception as e:
        print(f"Error processing borough {borough}: {e}")

# Combine all the DataFrames in the list into one DataFrame
if predictions_list_rf:
    rf_results = predictions_list_rf[0]
    for df in predictions_list_rf[1:]:
        rf_results = rf_results.union(df)

    # Show the combined DataFrame
    rf_results.show()
else:
    print("No predictions were made.")

# Print RMSE results
for borough, rmse in results_rf:
    print(f"Borough: {borough}, RMSE: {rmse}")


Processing borough: Queens


                                                                                

Processing borough: EWR


24/08/26 14:33:40 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 500 to 3 (= number of training instances)
24/08/26 14:33:40 ERROR Instrumentation: java.lang.IllegalArgumentException: requirement failed: DecisionTree requires maxBins (= 3) to be at least as large as the number of values in each categorical feature, but categorical feature 1 has 261 values. Consider removing this and other categorical features with a large number of values, or add more training examples.
	at scala.Predef$.require(Predef.scala:281)
	at org.apache.spark.ml.tree.impl.DecisionTreeMetadata$.buildMetadata(DecisionTreeMetadata.scala:151)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:274)
	at org.apache.spark.ml.regression.RandomForestRegressor.$anonfun$train$1(RandomForestRegressor.scala:158)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentatio

Error processing borough EWR: requirement failed: DecisionTree requires maxBins (= 3) to be at least as large as the number of values in each categorical feature, but categorical feature 1 has 261 values. Consider removing this and other categorical features with a large number of values, or add more training examples.
Processing borough: Brooklyn


                                                                                

Processing borough: Staten Island
Processing borough: Manhattan
Processing borough: Bronx
Processing borough: None
Skipping borough None: DataFrame is empty.
+-----------+------+------------------+-------+
|pickup_date|actual|         predicted|borough|
+-----------+------+------------------+-------+
| 2023-08-01|    23|  29.2140696770817| Queens|
| 2023-08-01|    25|29.487197402552585| Queens|
| 2023-08-01|    33|  64.4857960938257| Queens|
| 2023-08-01|    37| 40.16704303762327| Queens|
| 2023-08-01|    20| 55.59631751206391| Queens|
| 2023-08-01|    21| 66.53754740973802| Queens|
| 2023-08-01|    58|135.38097865829357| Queens|
| 2023-08-01|    50|109.59361367533113| Queens|
| 2023-08-01|   102|134.63425582230235| Queens|
| 2023-08-01|    36|33.607346881547436| Queens|
| 2023-08-01|     5|23.680657173408104| Queens|
| 2023-08-01|    38|113.54279190135932| Queens|
| 2023-08-01|    10|23.680657173408104| Queens|
| 2023-08-01|    35|32.134984630498586| Queens|
| 2023-08-01|    30|26.083