In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql.functions import col, lit

# Create a Spark session
spark = SparkSession.builder \
    .appName("Modeling") \
    .getOrCreate()


24/08/26 20:24:16 WARN Utils: Your hostname, MacBook-Pro-7.local resolves to a loopback address: 127.0.0.1; using 10.95.1.206 instead (on interface en0)
24/08/26 20:24:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/26 20:24:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/26 20:24:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/08/26 20:24:17 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/08/26 20:24:17 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 63987)
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.4/Frameworks/Python.framework/Versions/3.12/lib/python3.12/socketserver.py", line 318, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/homebrew/Cellar/python@3.12/3.12.4/Frameworks/Python.framework/Versions/3.12/lib/python3.12/socketserver.py", line 349, in process_request
    self.finish_request(request, client_address)
  File "/opt/homebrew/Cellar/python@3.12/3.12.4/Frameworks/Python.framework/Versions/3.12/lib/python3.12/socketserver.py", line 362, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/homebrew/Cellar/python@3.12/3.12.4/Frameworks/Python.framework/Versions/3.12/lib/python3.12/socketserver.py", line 761, in __init__
    self.handle()
  File "/Users/umabarnes/Desktop/UNI/project-1-individual-umabarn

In [8]:
# Path to the CSV file
csv_file_path = "../data/curated/final_merged.csv"

# Read the CSV file
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)
df.show(5)


[Stage 139:>                                                        (0 + 8) / 8]

+------------+-----------+-----------+-----+---------+---+----+----+----------+----+---------+------------+-----------+----------+
|pulocationid|pickup_date|pickup_hour|count|  borough|wnd| tmp| dew|      date|hour|bus_count|subway_count|day_of_week|is_weekday|
+------------+-----------+-----------+-----+---------+---+----+----+----------+----+---------+------------+-----------+----------+
|         243| 2023-07-03|         18|  129|Manhattan|0.0|30.6|20.0|2023-07-03|  18|        4|           5|          1|         1|
|         243| 2023-07-05|         11|  101|Manhattan|1.5|25.0|19.4|2023-07-05|  11|        5|          11|          3|         1|
|         243| 2023-07-05|          1|  128|Manhattan|0.0|25.0|21.7|2023-07-05|   1|        4|           0|          3|         1|
|         243| 2023-07-04|          5|   54|Manhattan|0.0|23.3|22.2|2023-07-04|   5|        0|           1|          2|         1|
|         243| 2023-07-07|         18|  204|Manhattan|0.0|30.0|21.1|2023-07-07|  18

                                                                                

## Linear Regression

In [9]:
# convert categorical column to numerical
if 'pulocationid_indexed' not in df.columns:
    indexer = StringIndexer(inputCol="pulocationid", outputCol="pulocationid_indexed")
    df = indexer.fit(df).transform(df)

# define feature columns
feature_columns = ["pickup_hour", "pulocationid_indexed", "wnd", "tmp", "dew", "subway_count", "bus_count", "day_of_week"]

# assemble features into a vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df)

# rename 'count' column to 'label'
df = df.withColumnRenamed("count", "label")

# list of columns to check for missing values
columns_to_check = ["pickup_hour", "pulocationid", "wnd", "tmp", "dew", "bus_count", "subway_count", "label", "day_of_week"]

# check for missing values
for column in columns_to_check:
    if column in df.columns:
        missing_count = df.filter(col(column).isNull()).count()
        if missing_count > 0:
            print(f"Column {column} has {missing_count} missing values.")
    else:
        print(f"Column {column} does not exist in the DataFrame.")

# list to store results
results = []

# perform linear regression for each borough
for borough in df.select("borough").distinct().rdd.flatMap(lambda x: x).collect():
    print(f"Processing borough: {borough}")
    
    # filter DataFrame for the current borough
    df_borough = df.filter(df.borough == borough)
    
    # split the data into training and test sets
    train_df, test_df = df_borough.randomSplit([0.8, 0.2], seed=42)
    
    # initialize the Linear Regression model
    lr = LinearRegression(featuresCol="features", labelCol="label")
    
    # train the model
    try:
        lr_model = lr.fit(train_df)
        
        # make predictions
        predictions = lr_model.transform(test_df)
        
        # evaluate the model
        evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
        rmse = evaluator.evaluate(predictions)
        
        # extract the coefficients and intercept
        coefficients = lr_model.coefficients
        intercept = lr_model.intercept
        
        # store results
        results.append((borough, rmse, coefficients, intercept))
    except Exception as e:
        print(f"Error processing borough {borough}: {e}")

# print results
for borough, rmse, coefficients, intercept in results:
    print(f"Borough: {borough}, RMSE: {rmse}")
    print(f"Coefficients: {coefficients}")
    print(f"Intercept: {intercept}\n")


Processing borough: Queens


24/08/26 20:28:10 WARN Instrumentation: [d1e94131] regParam is zero, which might cause numerical instability and overfitting.
24/08/26 20:28:16 WARN Instrumentation: [25806a53] regParam is zero, which might cause numerical instability and overfitting.


Processing borough: Brooklyn


24/08/26 20:28:20 WARN Instrumentation: [a8aec9db] regParam is zero, which might cause numerical instability and overfitting.


Processing borough: Staten Island


                                                                                

Processing borough: Manhattan


24/08/26 20:28:24 WARN Instrumentation: [073d4f68] regParam is zero, which might cause numerical instability and overfitting.
                                                                                

Processing borough: Bronx


24/08/26 20:28:26 WARN Instrumentation: [f06f71ca] regParam is zero, which might cause numerical instability and overfitting.


Borough: Queens, RMSE: 111.53662722588795
Coefficients: [2.908524244246942,-0.2501988723085781,0.5474573159817555,-0.5376337293956361,0.3559756575909106,-0.5602642802101562,0.3695690771221444,2.4030629586511782]
Intercept: 76.10226703669996

Borough: Brooklyn, RMSE: 102.34499011941666
Coefficients: [4.01858437682363,-0.25443805268081987,0.3918894809846149,-0.09786662965777526,-0.058129974943484986,-0.503473878011684,0.9565834220206447,7.254722939769958]
Intercept: 72.31134692786733

Borough: Staten Island, RMSE: 13.225604968015675
Coefficients: [0.6545391558088874,-0.1789343146504663,0.04539907583503766,-0.19269187045687455,0.09950344794396876,-0.14821732622515382,0.04953457332977601,0.19866690451210733]
Intercept: 49.34710755459661

Borough: Manhattan, RMSE: 124.77488070532574
Coefficients: [6.671202111638213,-0.42863814585533483,0.5909140998602761,0.5584497632602288,-0.7652861735046158,0.6396129243022665,2.9845865357699424,7.441290404280645]
Intercept: 84.29710540489812

Borough: Bro

In [10]:
# Get the coefficients and intercept
coefficients = lr_model.coefficients
intercept = lr_model.intercept

    # Print the coefficients and intercept
print(f"Coefficients: {coefficients}")
print(f"Intercept: {intercept}")

    # You can also print the coefficients for individual features if needed
feature_names = feature_columns
for feature, coef in zip(feature_names, coefficients):
    print(f"Feature: {feature}, Coefficient: {coef}")

Coefficients: [2.41472921463413,-0.32061243578503235,-0.11591507876299345,-0.6067426019734801,0.26540715538507387,-0.6336562691304639,0.0343052828197857,2.036580182243971]
Intercept: 87.55329821214609
Feature: pickup_hour, Coefficient: 2.41472921463413
Feature: pulocationid_indexed, Coefficient: -0.32061243578503235
Feature: wnd, Coefficient: -0.11591507876299345
Feature: tmp, Coefficient: -0.6067426019734801
Feature: dew, Coefficient: 0.26540715538507387
Feature: subway_count, Coefficient: -0.6336562691304639
Feature: bus_count, Coefficient: 0.0343052828197857
Feature: day_of_week, Coefficient: 2.036580182243971


## Random Forest

In [11]:
# Path to the CSV file
csv_file_path = "../data/curated/final_merged.csv"

# Read the CSV file
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)
df.show(5)

+------------+-----------+-----------+-----+---------+---+----+----+----------+----+---------+------------+-----------+----------+
|pulocationid|pickup_date|pickup_hour|count|  borough|wnd| tmp| dew|      date|hour|bus_count|subway_count|day_of_week|is_weekday|
+------------+-----------+-----------+-----+---------+---+----+----+----------+----+---------+------------+-----------+----------+
|         243| 2023-07-03|         18|  129|Manhattan|0.0|30.6|20.0|2023-07-03|  18|        4|           5|          1|         1|
|         243| 2023-07-05|         11|  101|Manhattan|1.5|25.0|19.4|2023-07-05|  11|        5|          11|          3|         1|
|         243| 2023-07-05|          1|  128|Manhattan|0.0|25.0|21.7|2023-07-05|   1|        4|           0|          3|         1|
|         243| 2023-07-04|          5|   54|Manhattan|0.0|23.3|22.2|2023-07-04|   5|        0|           1|          2|         1|
|         243| 2023-07-07|         18|  204|Manhattan|0.0|30.0|21.1|2023-07-07|  18

                                                                                

In [12]:

# create `pulocationid_indexed` if it does not exist
if 'pulocationid_indexed' not in df.columns:
    indexer = StringIndexer(inputCol="pulocationid", outputCol="pulocationid_indexed")
    df = indexer.fit(df).transform(df)

# verify the new column
df.printSchema()
df.select("pulocationid", "pulocationid_indexed").show(5)


root
 |-- pulocationid: integer (nullable = true)
 |-- pickup_date: date (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- borough: string (nullable = true)
 |-- wnd: double (nullable = true)
 |-- tmp: double (nullable = true)
 |-- dew: double (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- bus_count: integer (nullable = true)
 |-- subway_count: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- is_weekday: integer (nullable = true)
 |-- pulocationid_indexed: double (nullable = false)

+------------+--------------------+
|pulocationid|pulocationid_indexed|
+------------+--------------------+
|         243|               116.0|
|         243|               116.0|
|         243|               116.0|
|         243|               116.0|
|         243|               116.0|
+------------+--------------------+
only showing top 5 rows



In [13]:

# define the feature columns
feature_columns = ["pickup_hour", "pulocationid_indexed", "wnd", "tmp", "dew", "bus_count", "subway_count", "day_of_week"]

# create VectorAssembler
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# transform the df to include the 'features' column
df = assembler.transform(df)

# check column
df.printSchema()
df.select("features").show(5, truncate=False)



root
 |-- pulocationid: integer (nullable = true)
 |-- pickup_date: date (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- borough: string (nullable = true)
 |-- wnd: double (nullable = true)
 |-- tmp: double (nullable = true)
 |-- dew: double (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- bus_count: integer (nullable = true)
 |-- subway_count: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- is_weekday: integer (nullable = true)
 |-- pulocationid_indexed: double (nullable = false)
 |-- features: vector (nullable = true)

+---------------------------------------+
|features                               |
+---------------------------------------+
|[18.0,116.0,0.0,30.6,20.0,4.0,5.0,1.0] |
|[11.0,116.0,1.5,25.0,19.4,5.0,11.0,3.0]|
|[1.0,116.0,0.0,25.0,21.7,4.0,0.0,3.0]  |
|[5.0,116.0,0.0,23.3,22.2,0.0,1.0,2.0]  |
|[18.0,116.0,0.0,30.0,21.1,4.0,3.0,5.0] |
+-------

In [14]:
# rename df to rf_df
rf_df = df

# rename 'count' column to 'label'
rf_df = rf_df.withColumnRenamed("count", "label")

# list to store results
results_rf = []

# list to store DataFrames with actual and predicted values
predictions_list_rf = []

# perform RF regression for each borough
for borough in rf_df.select("borough").distinct().rdd.flatMap(lambda x: x).collect():
    print(f"Processing borough: {borough}")

    # filter df for the current borough
    df_borough = rf_df.filter(rf_df.borough == borough)

    # check if 'features' and 'label' columns exist
    if 'features' not in df_borough.columns or 'label' not in df_borough.columns:
        print(f"Skipping borough {borough}: required columns are missing.")
        continue

    # check if df is empty
    if df_borough.count() == 0:
        print(f"Skipping borough {borough}: DataFrame is empty.")
        continue

    # split data into training and test sets
    train_df, test_df = df_borough.randomSplit([0.8, 0.2], seed=42)

    # initialize RF model with increased maxBins
    rf = RandomForestRegressor(featuresCol="features", labelCol="label", maxBins=500)

    # train model
    try:
        rf_model = rf.fit(train_df)

        # make predictions
        predictions = rf_model.transform(test_df)

        # evaluate model
        evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
        rmse = evaluator.evaluate(predictions)

        # collect actual and predicted values
        predictions_df = predictions.select(col("pickup_date"), col("label").alias("actual"), col("prediction").alias("predicted"))
        
        # add a column for the borough
        predictions_df = predictions_df.withColumn("borough", lit(borough))
        
        # append to the list
        predictions_list_rf.append(predictions_df)
        
        # store RMSE results
        results_rf.append((borough, rmse))
        
    except Exception as e:
        print(f"Error processing borough {borough}: {e}")

# combine all df's in the list into one DataFrame
if predictions_list_rf:
    rf_results = predictions_list_rf[0]
    for df in predictions_list_rf[1:]:
        rf_results = rf_results.union(df)

    # show combined DataFrame
    rf_results.show()
else:
    print("No predictions were made.")

# print RMSE results
for borough, rmse in results_rf:
    print(f"Borough: {borough}, RMSE: {rmse}")


Processing borough: Queens


                                                                                

Processing borough: Brooklyn


                                                                                

Processing borough: Staten Island
Processing borough: Manhattan


                                                                                

Processing borough: Bronx


                                                                                

+-----------+------+------------------+-------+
|pickup_date|actual|         predicted|borough|
+-----------+------+------------------+-------+
| 2023-07-01|    84|  89.1592512778675| Queens|
| 2023-07-01|   112| 119.9734641373007| Queens|
| 2023-07-01|   123|165.04598611976076| Queens|
| 2023-07-01|   191|177.92128367376816| Queens|
| 2023-07-01|   197| 172.2768286109242| Queens|
| 2023-07-01|   199|155.67269804964403| Queens|
| 2023-07-02|    61|107.71012880707761| Queens|
| 2023-07-02|   157|175.34524150384104| Queens|
| 2023-07-02|   124|162.49339963869343| Queens|
| 2023-07-02|   142|162.60896675104397| Queens|
| 2023-07-02|   111| 142.3244661504924| Queens|
| 2023-07-03|    62| 72.36372730126955| Queens|
| 2023-07-03|    32| 72.78774300693553| Queens|
| 2023-07-03|   117|143.16267990638187| Queens|
| 2023-07-03|   169|149.76184234525056| Queens|
| 2023-07-03|   160|118.38475108486143| Queens|
| 2023-07-04|    52| 71.98102949135492| Queens|
| 2023-07-05|    66| 68.43828557307725| 

24/08/26 21:53:28 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1080071 ms exceeds timeout 120000 ms
24/08/26 21:53:28 WARN SparkContext: Killing executors is not supported by current scheduler.
24/08/26 21:53:31 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$