In [20]:
from pyspark.sql import SparkSession

# Path to your CSV file
csv_file_path = "/Users/pintoza/Desktop/dev/data-science/taxi-demand-forecast/data/interim/weekly_zone_aggregates.csv"

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Taxi Data Analysis") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

# Load the CSV file into a DataFrame
df = spark.read.option("header", "true").csv(csv_file_path)

# Show the first few rows of the DataFrame to confirm loading
df.show()

+----+----+------------+----------------+------------------+------------------+
|year|week|PULocationID|total_passengers|    total_distance|      total_amount|
+----+----+------------+----------------+------------------+------------------+
|2018|   1|           1|             296|            335.08|16448.950000000015|
|2018|   1|           2|               1|               7.2|              23.3|
|2018|   1|           3|              22|             99.02|349.44000000000005|
|2018|   1|           4|            7763|11798.969999999996| 65709.18999999891|
|2018|   1|           5|               2|                 0|            120.96|
|2018|   1|           6|               8| 4.779999999999999|            156.02|
|2018|   1|           7|            6632|10860.950000000006| 54907.89999999894|
|2018|   1|           8|              42|159.67000000000004|1168.2599999999998|
|2018|   1|           9|              27|            105.83|             414.2|
|2018|   1|          10|             776

In [21]:
from pyspark.sql import functions as F

# Step 1: Create a DataFrame of all year-week combinations
year_week_df = df.select("year", "week").distinct()

# Step 2: Create a DataFrame of all PULocationIDs
locations_df = spark.range(1, 266).withColumnRenamed("id", "PULocationID")  # Assuming IDs 1 through 265

# Step 3: Cross join to get all possible combinations
all_combinations_df = year_week_df.crossJoin(locations_df)

# Step 4: Left join with your original data
combined_df = all_combinations_df.join(df, ["year", "week", "PULocationID"], "left_outer")

# Step 5: Fill missing values with zeros
final_df = combined_df.fillna({"total_passengers": 0, "total_distance": 0, "total_amount": 0})

# Show the result
final_df.show()

+----+----+------------+----------------+------------------+------------------+
|year|week|PULocationID|total_passengers|    total_distance|      total_amount|
+----+----+------------+----------------+------------------+------------------+
|2019|  49|           1|             321|441.77000000000004|22832.130000000012|
|2019|  49|           2|               2|              20.7|              78.6|
|2019|  49|           3|              28| 363.5399999999999|1663.6400000000003|
|2019|  49|           4|            4372| 7507.689999999993|51820.319999999556|
|2019|  49|           5|               8|170.67000000000002|             516.9|
|2019|  49|           6|               1|              14.5|              39.8|
|2019|  49|           7|            2690|           5166.63|29075.559999999925|
|2019|  49|           8|              21|             59.82|            853.11|
|2019|  49|           9|              10|124.35999999999999|            723.96|
|2019|  49|          10|             898

In [22]:
# Count the number of rows in the DataFrame
row_count = final_df.count()
print(f"Row count: {row_count}")

Row count: 69430


In [23]:
from pyspark.sql import functions as F

# Group by year, week, and PULocationID and count each combination
unique_combinations_df = final_df.groupBy("year", "week", "PULocationID").agg(F.count("*").alias("count"))

# Check for any combination that occurs more than once
duplicate_combinations_count = unique_combinations_df.filter("count > 1").count()

print(f"Duplicate combinations count: {duplicate_combinations_count}")

Duplicate combinations count: 0


In [24]:
# Find number of missing values in each column
final_df.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in final_df.columns]).show()

+----+----+------------+----------------+--------------+------------+
|year|week|PULocationID|total_passengers|total_distance|total_amount|
+----+----+------------+----------------+--------------+------------+
|   0|   0|           0|               0|             0|           0|
+----+----+------------+----------------+--------------+------------+


In [25]:
final_df.dtypes

[('year', 'string'),
 ('week', 'string'),
 ('PULocationID', 'bigint'),
 ('total_passengers', 'string'),
 ('total_distance', 'string'),
 ('total_amount', 'string')]

In [29]:
# Convert to pandas DataFrame (note: be mindful of memory limitations)
pandas_df = final_df.toPandas()

# Save to CSV using pandas
pandas_df.to_csv("/Users/pintoza/Desktop/dev/data-science/taxi-demand-forecast/data/processed/weekly_zone_aggregates.csv", index=False)