### PREPROCESSING 

### Download files

In [5]:
# importing library for downloading data
import os
from urllib.request import urlretrieve

In [6]:
os.getcwd()

'/Users/tasneemzulaiqa/Documents/GitHub/project-1-individual-tasneemzulaiqa/code'

In [7]:
import os

output_relative_dir = 'data/'

# check if it file exists as makedir will raise an error if it does exist
if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)
    
# for each type of data set we will need, we will create the paths
for target_dir in ('tlc_data', 'taxi_zones'): 
    if not os.path.exists(output_relative_dir + target_dir):
        os.makedirs(output_relative_dir + target_dir)

In [8]:
os.getcwd()

'/Users/tasneemzulaiqa/Documents/GitHub/project-1-individual-tasneemzulaiqa/code'

In [9]:
vehicles_dir = 'data/tlc_data/'

for target_dir in ('yellow','hvfhv'): 
    if not os.path.exists(vehicles_dir + target_dir):
        os.makedirs(vehicles_dir + target_dir);

In [10]:
# year 2024 data
YEAR = "2024"
MONTHS = range(7,13)

In [11]:
# URL TEMPLATES
yellow_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_"
hvfhv_url =  "https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_"

In [13]:
tlc_output_dir = 'data/tlc_data/'

for file_dir, base_url in zip(
    ('yellow', 'hvfhv'),
    (yellow_url, hvfhv_url)
):
    final_path = f"{tlc_output_dir}{file_dir}"
    
    for month in MONTHS:
        month_str = str(month).zfill(2)  # zero pad month
        
        print(f"Begin month {month_str} for {file_dir}")
        
        current_url = f"{base_url}{YEAR}-{month_str}.parquet"
        output_dir = f"{final_path}/{YEAR}-{month_str}.parquet"
        
        # Download file
        urlretrieve(current_url, output_dir)
        
        print(f"Completed month {month_str} for {file_dir}")

Begin month 07 for yellow
Completed month 07 for yellow
Begin month 08 for yellow
Completed month 08 for yellow
Begin month 09 for yellow
Completed month 09 for yellow
Begin month 10 for yellow
Completed month 10 for yellow
Begin month 11 for yellow
Completed month 11 for yellow
Begin month 12 for yellow
Completed month 12 for yellow
Begin month 07 for hvfhv
Completed month 07 for hvfhv
Begin month 08 for hvfhv
Completed month 08 for hvfhv
Begin month 09 for hvfhv
Completed month 09 for hvfhv
Begin month 10 for hvfhv
Completed month 10 for hvfhv
Begin month 11 for hvfhv
Completed month 11 for hvfhv
Begin month 12 for hvfhv
Completed month 12 for hvfhv


### Initial fixes

In [16]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("TLC eda")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

### Yellow

In [17]:
from pyspark.sql import functions as F

In [19]:
import os
os.getcwd()

'/Users/tasneemzulaiqa/Documents/GitHub/project-1-individual-tasneemzulaiqa/code'

In [20]:
# load yellow taxis (july)

sdf_yellow_example = spark.read.parquet('../data/tlc_data/yellow/2024-07.parquet')
sdf_yellow_example.show(1, vertical=True, truncate=100)

                                                                                

-RECORD 0------------------------------------
 VendorID              | 1                   
 tpep_pickup_datetime  | 2024-07-01 00:34:56 
 tpep_dropoff_datetime | 2024-07-01 00:46:49 
 passenger_count       | 1                   
 trip_distance         | 3.2                 
 RatecodeID            | 1                   
 store_and_fwd_flag    | N                   
 PULocationID          | 140                 
 DOLocationID          | 79                  
 payment_type          | 1                   
 fare_amount           | 15.6                
 extra                 | 3.5                 
 mta_tax               | 0.5                 
 tip_amount            | 3.5                 
 tolls_amount          | 0.0                 
 improvement_surcharge | 1.0                 
 total_amount          | 24.1                
 congestion_surcharge  | 2.5                 
 Airport_fee           | 0.0                 
only showing top 1 row


In [None]:
# Data entry count: 3 million
sdf_yellow_example.count()

3076903

In [None]:
sdf_yellow_example.limit(5)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
1,2024-07-01 00:34:56,2024-07-01 00:46:49,1,3.2,1,N,140,79,1,15.6,3.5,0.5,3.5,0.0,1.0,24.1,2.5,0.0
2,2024-06-30 23:48:58,2024-07-01 00:28:04,1,19.48,2,N,132,113,2,70.0,0.0,0.5,0.0,0.0,1.0,75.75,2.5,1.75
2,2024-07-01 00:23:18,2024-07-01 00:29:51,1,1.18,1,N,237,145,1,8.6,1.0,0.5,2.72,0.0,1.0,16.32,2.5,0.0
1,2024-07-01 00:10:33,2024-07-01 00:27:31,0,9.1,1,N,138,164,1,36.6,10.25,0.5,12.05,0.0,1.0,60.4,2.5,1.75
1,2024-07-01 00:07:55,2024-07-01 00:34:34,1,17.7,2,N,132,263,1,70.0,1.75,0.5,10.0,6.94,1.0,90.19,0.0,1.75


In [None]:
sdf_yellow_example.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



In [21]:
sdf_yellow_example = (
    sdf_yellow_example
    .withColumn('passenger_count', F.col('passenger_count').cast('int'))
    .withColumn('RatecodeID', F.col('RatecodeID').cast('int'))
    .withColumn('payment_type', F.col('payment_type').cast('int'))
    .withColumn('store_and_fwd_flag', (F.col("store_and_fwd_flag") == 'Y').cast('boolean')))

In [22]:
consistent_col_casing_yellow = [F.col(col_name).alias(col_name.lower()) for col_name in sdf_yellow_example.columns]
sdf_yellow_example = sdf_yellow_example.select(*consistent_col_casing_yellow)

sdf_yellow_example = (sdf_yellow_example.withColumnRenamed("vendorid", "vendor_id").withColumnRenamed("ratecodeid", "ratecode_id").withColumnRenamed("pulocationid", "p_location_id").withColumnRenamed("dolocationid", "do_location_id").withColumnRenamed("trip_distance", "trip_miles").withColumnRenamed("dolocationid", "do_location_id").withColumnRenamed("trip_distance", "trip_miles").withColumnRenamed("tpep_pickup_datetime", "pickup_datetime").withColumnRenamed("tpep_dropoff_datetime", "dropoff_datetime"))


In [23]:
sdf_schema_yellow = sdf_yellow_example.schema
sdf_schema_yellow

StructType([StructField('vendor_id', IntegerType(), True), StructField('pickup_datetime', TimestampNTZType(), True), StructField('dropoff_datetime', TimestampNTZType(), True), StructField('passenger_count', IntegerType(), True), StructField('trip_miles', DoubleType(), True), StructField('ratecode_id', IntegerType(), True), StructField('store_and_fwd_flag', BooleanType(), True), StructField('p_location_id', IntegerType(), True), StructField('do_location_id', IntegerType(), True), StructField('payment_type', IntegerType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True), StructField('airport_fee', DoubleType(), True)])

In [24]:
def align_datatypes_yellow(path, schema, month):
    df = spark.read.parquet(path)
    # fix column names
    df = df.select(*consistent_col_casing_yellow)
    df = (df.withColumnRenamed("vendorid", "vendor_id").withColumnRenamed("ratecodeid", "ratecode_id").withColumnRenamed("pulocationid", "pu_location_id").withColumnRenamed("dolocationid", "do_location_id").withColumnRenamed("trip_distance", "trip_miles").withColumnRenamed("dolocationid", "do_location_id").withColumnRenamed("trip_distance", "trip_miles").withColumnRenamed("tpep_pickup_datetime", "pickup_datetime").withColumnRenamed("tpep_dropoff_datetime", "dropoff_datetime"))
    # Cast every column to match the saved schema's data types
    for field in schema:
        if field.name in df.columns:  
            df = df.withColumn(field.name, F.col(field.name).cast(field.dataType))
            
    
    df.coalesce(1).write.mode('overwrite').parquet(f'../data/tlc_data/raw/yellow/2024-{str(month).zfill(2)}')
    return None

In [25]:
for month in range(7, 13):
    input_path = f'../data/tlc_data/yellow/{2024}-{str(month).zfill(2)}.parquet'
    align_datatypes_yellow(input_path, sdf_schema_yellow, month)

                                                                                

In [26]:
sdf_yellow = spark.read.parquet('/Users/tasneemzulaiqa/Documents/GitHub/project-1-individual-tasneemzulaiqa/data/tlc_data/raw/yellow/2024-07')
sdf_yellow.printSchema()

root
 |-- vendor_id: integer (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- ratecode_id: integer (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = true)
 |-- pu_location_id: integer (nullable = true)
 |-- do_location_id: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



### HVFHV data

In [74]:
sdf_fhv_example = spark.read.parquet('../data/tlc_data/hvfhv/2024-07.parquet')
sdf_fhv_example.show(1, vertical=True)

-RECORD 0-----------------------------------
 hvfhs_license_num    | HV0003              
 dispatching_base_num | B03404              
 originating_base_num | B03404              
 request_datetime     | 2024-07-01 00:13:16 
 on_scene_datetime    | 2024-07-01 00:18:28 
 pickup_datetime      | 2024-07-01 00:19:43 
 dropoff_datetime     | 2024-07-01 00:40:35 
 PULocationID         | 138                 
 DOLocationID         | 141                 
 trip_miles           | 8.84                
 trip_time            | 1252                
 base_passenger_fare  | 50.49               
 tolls                | 0.0                 
 bcf                  | 1.46                
 sales_tax            | 4.7                 
 congestion_surcharge | 2.75                
 airport_fee          | 2.5                 
 tips                 | 9.28                
 driver_pay           | 24.19               
 shared_request_flag  | N                   
 shared_match_flag    | N                   
 access_a_

In [28]:
import pyarrow.parquet as pq

for month in range(7, 13):
    path = f'../data/tlc_data/hvfhv/2024-{str(month).zfill(2)}.parquet'
    parquet_file = pq.ParquetFile(path)
    count = parquet_file.metadata.num_rows
    print(f"Month {month}: {count} rows")

Month 7: 19182934 rows
Month 8: 19128392 rows
Month 9: 19209788 rows
Month 10: 20028282 rows
Month 11: 19987533 rows
Month 12: 21068851 rows


In [75]:
sdf_fhv_example.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp_ntz (nullable = true)
 |-- on_scene_datetime: timestamp_ntz (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_

In [86]:
sdf_fhv_example = (sdf_fhv_example.withColumn('trip_time', F.col('trip_time').cast('double')).withColumn('shared_request_flag', (F.col("shared_request_flag") == 'Y').cast('boolean')).withColumn('shared_match_flag', (F.col("shared_match_flag") == 'Y').cast('boolean')).withColumn('access_a_ride_flag', (F.col("access_a_ride_flag") == 'Y').cast('boolean'))
    .withColumn('wav_request_flag', (F.col("wav_request_flag") == 'Y').cast('boolean'))
    .withColumn('wav_match_flag', (F.col("wav_match_flag") == 'Y').cast('boolean'))
    
    )

In [87]:
sdf_schema_fhv = sdf_fhv_example.schema
sdf_schema_fhv

StructType([StructField('hvfhs_license_num', StringType(), True), StructField('dispatching_base_num', StringType(), True), StructField('originating_base_num', StringType(), True), StructField('request_datetime', TimestampNTZType(), True), StructField('on_scene_datetime', TimestampNTZType(), True), StructField('pickup_datetime', TimestampNTZType(), True), StructField('dropoff_datetime', TimestampNTZType(), True), StructField('PULocationID', IntegerType(), True), StructField('DOLocationID', IntegerType(), True), StructField('trip_miles', DoubleType(), True), StructField('trip_time', DoubleType(), True), StructField('base_passenger_fare', DoubleType(), True), StructField('tolls', DoubleType(), True), StructField('bcf', DoubleType(), True), StructField('sales_tax', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True), StructField('airport_fee', DoubleType(), True), StructField('tips', DoubleType(), True), StructField('driver_pay', DoubleType(), True), StructField('s

In [82]:
consistent_col_casing_fhv = [F.col(col_name).alias(col_name.lower()) for col_name in sdf_fhv_example.columns]

In [88]:
def read_with_schema_and_save_fhv(path, schema, month):
    df = spark.read.parquet(path)
    # fix column names
    df = df.select(*consistent_col_casing_fhv)
    df = (df.withColumnRenamed("pulocationID", "pu_location_id").withColumnRenamed("dolocationid", "do_location_id").withColumnRenamed("base_passenger_fare", "fare_amount"))
    # Cast every column to match the saved schema's data types
    for field in schema:
        if field.name in df.columns:  
            df = df.withColumn(field.name, F.col(field.name).cast(field.dataType))
            
    
    df.coalesce(1).write.mode('overwrite').parquet(f'../data/tlc_data/raw/hvfhv/2024-{str(month).zfill(2)}')
    return None

In [89]:
for month in range(7, 13):
    input_path = f'../data/tlc_data/hvfhv/{2024}-{str(month).zfill(2)}.parquet'
    read_with_schema_and_save_fhv(input_path, sdf_schema_fhv, month)

                                                                                

In [90]:
sdf_fhv = spark.read.parquet('/Users/tasneemzulaiqa/Documents/GitHub/project-1-individual-tasneemzulaiqa/data/tlc_data/raw/hvfhv/2024-07')
sdf_fhv.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp_ntz (nullable = true)
 |-- on_scene_datetime: timestamp_ntz (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- pu_location_id: integer (nullable = true)
 |-- do_location_id: integer (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: boolean (nullable = true)
 |-- shared_match_flag: boolean (nullable = true)
 |-- access_a_

### Weather data

In [36]:
file_path = "/Users/tasneemzulaiqa/Documents/GitHub/project-1-individual-tasneemzulaiqa/data/weather_data/GHCNh_USW00094728_2024.psv"
df_weather = spark.read.option("header", "true") \
               .option("sep", "|") \
               .csv(file_path)

In [37]:
df_weather.show(100)

25/08/17 20:00:56 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-----------+------------------+-------------------+--------+---------+---------+-----------+----------------------------+------------------------+-----------------------+-----------------------+-----------------------------+---------------------+--------------------------------------+----------------------------------+---------------------------------+---------------------------------+---------------------------------------+----------------------+---------------------------------------+-----------------------------------+----------------------------------+----------------------------------+----------------------------------------+------------------+-----------------------------------+-------------------------------+------------------------------+------------------------------+------------------------------------+--------------+-------------------------------+---------------------------+--------------------------+--------------------------+--------------------------------+----------+--

In [38]:
df_weather.printSchema()

root
 |-- STATION: string (nullable = true)
 |-- Station_name: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- Elevation: string (nullable = true)
 |-- temperature: string (nullable = true)
 |-- temperature_Measurement_Code: string (nullable = true)
 |-- temperature_Quality_Code: string (nullable = true)
 |-- temperature_Report_Type: string (nullable = true)
 |-- temperature_Source_Code: string (nullable = true)
 |-- temperature_Source_Station_ID: string (nullable = true)
 |-- dew_point_temperature: string (nullable = true)
 |-- dew_point_temperature_Measurement_Code: string (nullable = true)
 |-- dew_point_temperature_Quality_Code: string (nullable = true)
 |-- dew_point_temperature_Report_Type: string (nullable = true)
 |-- dew_point_temperature_Source_Code: string (nullable = true)
 |-- dew_point_temperature_Source_Station_ID: string (nullable = true)
 |-- station_level_pressure: strin

In [39]:
df_weather.count()

11222

In [41]:
from pyspark.sql.functions import month, col

# Keep only rows from July (7) to December (12)
df_weather = df_weather.filter(
    (month(col("DATE")) >= 7) & (month(col("DATE")) <= 12)
)

In [42]:
df_weather.count()

5550

In [43]:
from pyspark.sql import functions as F


df_weather = df_weather.withColumn(
    "weather",
    F.coalesce(
        F.col("pres_wx_MW1"), F.col("pres_wx_MW2"), F.col("pres_wx_MW3"),
        F.col("pres_wx_AU1"), F.col("pres_wx_AU2"), F.col("pres_wx_AU3"),
        F.col("pres_wx_AW1"), F.col("pres_wx_AW2"), F.col("pres_wx_AW3")
    )
)

df_weather = df_weather.select("Station_name", "DATE", "temperature", "wind_speed", "precipitation", "weather")

In [44]:
df_weather.printSchema()

root
 |-- Station_name: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- temperature: string (nullable = true)
 |-- wind_speed: string (nullable = true)
 |-- precipitation: string (nullable = true)
 |-- weather: string (nullable = true)



In [45]:
df_weather = (
    df_weather
    .withColumn('DATE', F.col('DATE').cast('timestamp'))
    .withColumn('temperature', F.col('temperature').cast('double'))
    .withColumn('wind_speed', F.col('wind_speed').cast('double'))
    .withColumn('precipitation', F.col('precipitation').cast('double'))
)

In [48]:
df_weather = (df_weather.withColumnRenamed("DATE", "date_time").withColumnRenamed('Station_name','station_name'))

In [49]:
df_weather.show(100)

+------------------+-------------------+-----------+----------+-------------+-------+
|      station_name|          date_time|temperature|wind_speed|precipitation|weather|
+------------------+-------------------+-----------+----------+-------------+-------+
|NY CITY CNTRL PARK|2024-07-01 00:04:00|       22.8|       0.0|         NULL|   BR:1|
|NY CITY CNTRL PARK|2024-07-01 00:41:00|       22.8|       0.0|         NULL|   BR:1|
|NY CITY CNTRL PARK|2024-07-01 00:49:00|       23.0|       1.5|         NULL|   BR:1|
|NY CITY CNTRL PARK|2024-07-01 00:51:00|       22.8|       0.0|          0.0|   BR:1|
|NY CITY CNTRL PARK|2024-07-01 01:49:00|       23.0|       2.6|         NULL|   BR:1|
|NY CITY CNTRL PARK|2024-07-01 01:51:00|       22.8|       2.6|          0.0|   NULL|
|NY CITY CNTRL PARK|2024-07-01 02:51:00|       22.8|       2.6|          0.0|   NULL|
|NY CITY CNTRL PARK|2024-07-01 03:51:00|       22.2|      NULL|          0.0|   NULL|
|NY CITY CNTRL PARK|2024-07-01 04:51:00|       21.1|  

In [50]:
df_weather.filter(F.col('temperature').isNull()).count()

41

In [51]:
df_weather.filter(F.col('precipitation').isNull()).count()

634

In [52]:
df_weather.filter(F.col('wind_speed').isNull()).count()

552

In [53]:
df_weather.filter(F.col('temperature').isNull()).show(500)

+------------------+-------------------+-----------+----------+-------------+-------+
|      station_name|          date_time|temperature|wind_speed|precipitation|weather|
+------------------+-------------------+-----------+----------+-------------+-------+
|NY CITY CNTRL PARK|2024-12-29 18:51:00|       NULL|       2.1|          0.0|   NULL|
|NY CITY CNTRL PARK|2024-12-29 19:51:00|       NULL|       3.6|          0.0|   NULL|
|NY CITY CNTRL PARK|2024-12-29 20:51:00|       NULL|       3.1|          0.0|   NULL|
|NY CITY CNTRL PARK|2024-12-29 21:51:00|       NULL|       3.1|          0.0|   NULL|
|NY CITY CNTRL PARK|2024-12-29 22:51:00|       NULL|      NULL|          0.0|   NULL|
|NY CITY CNTRL PARK|2024-12-29 23:51:00|       NULL|       3.1|          0.0|   NULL|
|NY CITY CNTRL PARK|2024-12-30 00:51:00|       NULL|       2.6|          0.0|   NULL|
|NY CITY CNTRL PARK|2024-12-30 01:51:00|       NULL|       5.1|          0.0|   NULL|
|NY CITY CNTRL PARK|2024-12-30 02:51:00|       NULL|  

In [54]:
df_weather = df_weather.filter(F.col("temperature").isNotNull())

In [55]:
df_weather.show(100)

+------------------+-------------------+-----------+----------+-------------+-------+
|      station_name|          date_time|temperature|wind_speed|precipitation|weather|
+------------------+-------------------+-----------+----------+-------------+-------+
|NY CITY CNTRL PARK|2024-07-01 00:04:00|       22.8|       0.0|         NULL|   BR:1|
|NY CITY CNTRL PARK|2024-07-01 00:41:00|       22.8|       0.0|         NULL|   BR:1|
|NY CITY CNTRL PARK|2024-07-01 00:49:00|       23.0|       1.5|         NULL|   BR:1|
|NY CITY CNTRL PARK|2024-07-01 00:51:00|       22.8|       0.0|          0.0|   BR:1|
|NY CITY CNTRL PARK|2024-07-01 01:49:00|       23.0|       2.6|         NULL|   BR:1|
|NY CITY CNTRL PARK|2024-07-01 01:51:00|       22.8|       2.6|          0.0|   NULL|
|NY CITY CNTRL PARK|2024-07-01 02:51:00|       22.8|       2.6|          0.0|   NULL|
|NY CITY CNTRL PARK|2024-07-01 03:51:00|       22.2|      NULL|          0.0|   NULL|
|NY CITY CNTRL PARK|2024-07-01 04:51:00|       21.1|  

In [56]:
df_weather = df_weather.drop("weather")

In [57]:
df_weather.filter(F.col('precipitation') == 0.0).count()

4173

In [58]:
df_weather.select(F.col('precipitation')).distinct().show(100)

+-------------+
|precipitation|
+-------------+
|         10.2|
|          0.0|
|         11.4|
|          3.5|
|          0.2|
|          6.6|
|          1.7|
|         12.9|
|          0.8|
|          4.3|
|          4.5|
|          0.7|
|          2.3|
|         NULL|
|          2.5|
|          1.0|
|          2.7|
|         14.2|
|          4.1|
|          2.8|
|          9.3|
|         28.1|
|          4.0|
|         19.3|
|          0.5|
|          3.8|
|         20.3|
|         32.5|
|          1.3|
|         10.7|
|          5.3|
|         21.0|
|         12.2|
|          7.6|
|         22.6|
|          3.0|
|         29.5|
|          2.0|
|         10.9|
|          1.2|
|          1.8|
|          1.5|
|          6.8|
|         12.7|
|          4.8|
|         10.4|
|          4.6|
|          6.0|
|          0.3|
|          3.6|
|          5.0|
|         11.9|
|          5.5|
|          3.3|
|          7.1|
|          5.6|
|          8.1|
|          2.2|
|          5.8|
+-------

In [59]:
df_weather.filter(F.col('precipitation').isNull()).show(500)

+------------------+-------------------+-----------+----------+-------------+
|      station_name|          date_time|temperature|wind_speed|precipitation|
+------------------+-------------------+-----------+----------+-------------+
|NY CITY CNTRL PARK|2024-07-01 00:04:00|       22.8|       0.0|         NULL|
|NY CITY CNTRL PARK|2024-07-01 00:41:00|       22.8|       0.0|         NULL|
|NY CITY CNTRL PARK|2024-07-01 00:49:00|       23.0|       1.5|         NULL|
|NY CITY CNTRL PARK|2024-07-01 01:49:00|       23.0|       2.6|         NULL|
|NY CITY CNTRL PARK|2024-07-05 02:49:00|       23.0|       0.0|         NULL|
|NY CITY CNTRL PARK|2024-07-05 04:58:00|       23.3|       0.0|         NULL|
|NY CITY CNTRL PARK|2024-07-05 05:17:00|       23.3|       0.0|         NULL|
|NY CITY CNTRL PARK|2024-07-05 05:34:00|       23.3|       1.5|         NULL|
|NY CITY CNTRL PARK|2024-07-05 05:41:00|       23.3|       0.0|         NULL|
|NY CITY CNTRL PARK|2024-07-05 05:49:00|       23.0|       0.0| 

In [60]:
df_weather = df_weather.fillna({'precipitation': 0})

In [61]:
df_weather.filter(F.col('wind_speed') == 0.0).count()

1308

In [62]:
df_weather.select('wind_speed').describe().show()


+-------+------------------+
|summary|        wind_speed|
+-------+------------------+
|  count|              4965|
|   mean| 1.905760322255785|
| stddev|1.4710991833212028|
|    min|               0.0|
|    max|               8.2|
+-------+------------------+



In [63]:
df_weather.filter(F.col('wind_speed').isNull()).show(500)

+------------------+-------------------+-----------+----------+-------------+
|      station_name|          date_time|temperature|wind_speed|precipitation|
+------------------+-------------------+-----------+----------+-------------+
|NY CITY CNTRL PARK|2024-07-01 03:51:00|       22.2|      NULL|          0.0|
|NY CITY CNTRL PARK|2024-07-01 11:51:00|       19.4|      NULL|          0.0|
|NY CITY CNTRL PARK|2024-07-01 15:51:00|       23.3|      NULL|          0.0|
|NY CITY CNTRL PARK|2024-07-02 09:51:00|       19.4|      NULL|          0.0|
|NY CITY CNTRL PARK|2024-07-03 16:51:00|       27.8|      NULL|          0.0|
|NY CITY CNTRL PARK|2024-07-04 00:51:00|       22.8|      NULL|          0.0|
|NY CITY CNTRL PARK|2024-07-04 04:51:00|       21.7|      NULL|          0.0|
|NY CITY CNTRL PARK|2024-07-04 05:51:00|       21.1|      NULL|          0.0|
|NY CITY CNTRL PARK|2024-07-04 06:51:00|       21.1|      NULL|          0.0|
|NY CITY CNTRL PARK|2024-07-04 10:51:00|       22.2|      NULL| 

In [64]:
from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols=["wind_speed"],    
    outputCols=["wind_speed"],   
    strategy="median"            
)


df_weather = imputer.fit(df_weather).transform(df_weather)

df_weather.filter(F.col("wind_speed").isNull()).count()

0

In [65]:
output_path = "../data/tlc_data/raw/cleaned/curated/weather"

df_weather.write.mode("overwrite").parquet(output_path)