In [18]:
from urllib.request import urlretrieve
import os
import pandas as pd

In [2]:
# Root of the project's data folder, relative to this notebook.
output_relative_dir = '../data/'

# exist_ok=True makes creation idempotent: re-running this cell does not raise
# when a directory is already there, and there is no gap between a separate
# "does it exist?" check and the creation itself.
os.makedirs(output_relative_dir, exist_ok=True)

# One sub-directory per kind of data set kept under the data folder.
for target_dir in ('raw', 'curated', 'landing'):
    os.makedirs(output_relative_dir + target_dir, exist_ok=True)

# The downloaded TLC files get their own folder inside `landing`.
# Both names are deliberately rebound here, so after this cell they refer to
# '../data/landing/' and 'tlc_data' respectively.
output_relative_dir = '../data/landing/'
target_dir = 'tlc_data'

os.makedirs(output_relative_dir + target_dir, exist_ok=True)

In [3]:
# Period to download: months 7 through 12 of 2023.
YEAR = "2023"
MONTHS = range(7, 13)  # the stop value is exclusive, so 12 is the last month

In [4]:
# URL prefix of the `fhvhv_tripdata` parquet files, as of 07/2023.
# A full download URL is this prefix followed by "<year>-<month>.parquet".
URL_TEMPLATE = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/"
    "fhvhv_tripdata_"
)

In [5]:
# Downloaded files are written to `../data/landing/tlc_data/`.
tlc_output_dir = '../data/landing/tlc_data/'

for month_number in MONTHS:
    # zero-pad to two digits, i.e. 7 -> 07, 12 -> 12
    month = str(month_number).zfill(2)
    print(f"Begin month {month}")

    # the remote file and the local copy share the same "<year>-<month>" stem
    file_name = f"{YEAR}-{month}.parquet"
    url = f'{URL_TEMPLATE}{file_name}'

    # os.path.join does not add a second separator after a directory that
    # already ends in "/", unlike plain string formatting
    output_path = os.path.join(tlc_output_dir, file_name)

    # Download under a temporary name and move the file into place only once
    # the transfer has finished, so a failed or interrupted download never
    # leaves a truncated .parquet file in the folder.
    partial_path = output_path + '.part'
    try:
        urlretrieve(url, partial_path)
    except BaseException:
        # also covers KeyboardInterrupt; clean up, then let the error surface
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    os.replace(partial_path, output_path)

    print(f"Completed month {month}")

Begin month 07
Completed month 07
Begin month 08
Completed month 08
Begin month 09
Completed month 09
Begin month 10
Completed month 10
Begin month 11
Completed month 11
Begin month 12
Completed month 12


In [11]:
import base64
import traceback
# ANSI escape sequences used to colour and embolden the status line below.
RED = '\033[91m'
GREEN = '\033[92m'
BOLD = '\033[1m'
RESET = '\033[0m'

try:
    from pyspark.sql import SparkSession

    # Session options, applied to the builder in the order listed.
    spark_options = (
        ("spark.sql.repl.eagerEval.enabled", True),
        ("spark.sql.parquet.cacheMetadata", "true"),
        ("spark.sql.session.timeZone", "Etc/UTC"),
    )

    session_builder = SparkSession.builder.appName("MAST30034 Tutorial")
    for option_name, option_value in spark_options:
        session_builder = session_builder.config(option_name, option_value)

    # The Spark session is what runs the Spark jobs in the rest of the notebook.
    spark = session_builder.getOrCreate()
    print(f"{GREEN}{BOLD}Success - your environment is set up.{RESET}")
except Exception:
    # Report the failure together with the full traceback instead of raising,
    # so the cell doubles as an environment check.
    print(f"{RED}{BOLD}Something went wrong. Reinstall and try again.{RESET}")
    traceback.print_exc()

[92m[1mSuccess - your environment is set up.[0m


## Inspecting Data

In [12]:
# Load the 2023-07 file on its own and print its first record, one field per
# line, with values cut off at 100 characters.
jul_path = '../data/landing/tlc_data/2023-07.parquet'
sdf_jul = spark.read.parquet(jul_path)
sdf_jul.show(n=1, truncate=100, vertical=True)

-RECORD 0-----------------------------------
 hvfhs_license_num    | HV0003              
 dispatching_base_num | B03404              
 originating_base_num | B03404              
 request_datetime     | 2023-07-01 00:04:21 
 on_scene_datetime    | 2023-07-01 00:07:59 
 pickup_datetime      | 2023-07-01 00:08:30 
 dropoff_datetime     | 2023-07-01 00:33:33 
 PULocationID         | 72                  
 DOLocationID         | 26                  
 trip_miles           | 4.79                
 trip_time            | 1503                
 base_passenger_fare  | 22.34               
 tolls                | 0.0                 
 bcf                  | 0.61                
 sales_tax            | 1.98                
 congestion_surcharge | 0.0                 
 airport_fee          | 0.0                 
 tips                 | 0.0                 
 driver_pay           | 20.42               
 shared_request_flag  | N                   
 shared_match_flag    | N                   
 access_a_

In [13]:
# First five rows of the July data; being the last expression of the cell,
# the result is displayed as the cell's output.
sdf_jul.limit(5)

hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
HV0003,B03404,B03404,2023-07-01 00:04:21,2023-07-01 00:07:59,2023-07-01 00:08:30,2023-07-01 00:33:33,72,26,4.79,1503,22.34,0.0,0.61,1.98,0.0,0.0,0.0,20.42,N,N,,N,N
HV0003,B03404,B03404,2023-07-01 00:40:25,2023-07-01 00:40:35,2023-07-01 00:42:10,2023-07-01 01:08:06,26,37,6.4,1556,25.83,0.0,0.71,2.29,0.0,0.0,0.0,23.03,N,N,,N,N
HV0003,B03404,B03404,2023-07-01 00:20:31,2023-07-01 00:24:05,2023-07-01 00:25:00,2023-07-01 00:42:38,263,232,5.71,1058,26.51,0.0,0.73,2.35,2.75,0.0,6.46,18.61,N,N,,N,N
HV0003,B03404,B03404,2023-07-01 00:42:50,2023-07-01 00:47:37,2023-07-01 00:48:57,2023-07-01 00:57:04,79,233,1.75,487,16.32,0.0,0.45,1.45,2.75,0.0,0.0,8.13,N,N,,N,N
HV0005,B03406,,2023-07-01 00:00:17,,2023-07-01 00:05:35,2023-07-01 00:36:07,88,237,7.218,1832,83.97,0.0,2.31,7.45,2.75,0.0,0.0,59.05,N,N,N,N,N


In [15]:
# Read from the folder rather than a single file, so that every parquet file
# stored in it ends up in one DataFrame.
sdf_all = spark.read.parquet('../data/landing/tlc_data')

In [16]:
# Row count of the July file next to the row count of the whole folder.
sdf_jul.count(), sdf_all.count()

(19132131, 117277281)

In [17]:
# Print each column's name, data type and nullability.
sdf_jul.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp_ntz (nullable = true)
 |-- on_scene_datetime: timestamp_ntz (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_