In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session that will run this notebook's Spark jobs.
# The settings are listed up front and applied to the builder in this order.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
}

session_builder = SparkSession.builder.appName("MAST30034 Project 1")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/20 23:00:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# import the libraries needed
from urllib.request import urlretrieve
import os

# from the current `scripts` directory, go back one level to the `MAST30034-project-1` landing directory
output_relative_dir = '../data/landing/'

# Create the landing directory itself. `exist_ok=True` makes this safe to
# re-run: an already existing directory is left untouched instead of raising,
# so no separate (and racy) `os.path.exists` check is needed.
os.makedirs(output_relative_dir, exist_ok=True)

# now, for each type of data set we will need, we will create the paths
for target_dir in ('taxi_data', 'weather', 'geopandas'):
    os.makedirs(output_relative_dir + target_dir, exist_ok=True)

In [3]:
# Common URL prefix for the yellow taxi trip data files. It is not a complete
# URL on its own: a `<year>-<month>.parquet` suffix (for example
# `2022-01.parquet`) has to be appended to address one monthly file.
URL_TEMPLATE = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_"#year-month.parquet

In [4]:
# Monthly taxi files are written to the `taxi_data` folder of the landing directory
tlc_output_dir = output_relative_dir + 'taxi_data'

# Months to download for each year: every month of 2022, and only
# month 2 (February) of 2023.
months_by_year = {
    2022: range(1, 13),
    2023: range(2, 3),
}

for year, months in months_by_year.items():
    for month_number in months:
        # zero-pad so that e.g. 1 becomes "01", matching the remote file names
        month = f"{month_number:02d}"
        print(f"Begin year {year} month {month}")

        # remote file for this month
        url = f"{URL_TEMPLATE}{year}-{month}.parquet"
        # local file the download is saved to
        output_dir = f"{tlc_output_dir}/{year}-{month}.parquet"
        urlretrieve(url, output_dir)

        print(f"Completed year {year} month {month}")


Begin year 2022 month 01
Completed year 2022 month 01
Begin year 2022 month 02
Completed year 2022 month 02
Begin year 2022 month 03
Completed year 2022 month 03
Begin year 2022 month 04
Completed year 2022 month 04
Begin year 2022 month 05
Completed year 2022 month 05
Begin year 2022 month 06
Completed year 2022 month 06
Begin year 2022 month 07
Completed year 2022 month 07
Begin year 2022 month 08
Completed year 2022 month 08
Begin year 2022 month 09
Completed year 2022 month 09
Begin year 2022 month 10
Completed year 2022 month 10
Begin year 2022 month 11
Completed year 2022 month 11
Begin year 2022 month 12
Completed year 2022 month 12
Begin year 2023 month 02
Completed year 2023 month 02


In [5]:
''' This code is for downloading weather data from Visual Crossing website '''

weather_output_dir = output_relative_dir + 'weather'

# The Visual Crossing API key is a credential, so it must not be stored in the
# notebook source. It is read from the environment instead; set it before
# starting Jupyter, e.g. `export VISUAL_CROSSING_API_KEY=<your key>`.
visual_crossing_api_key = os.environ.get("VISUAL_CROSSING_API_KEY")
if not visual_crossing_api_key:
    raise RuntimeError(
        "Set the VISUAL_CROSSING_API_KEY environment variable to download the weather data."
    )

# this is the URL for weather: the timeline endpoint for New York City from
# 2021-12-31 to 2022-12-31, requesting metric units, the listed elements,
# hourly rows (`include=hours`) and a CSV response.
url = (
    "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/"
    "new%20york%20city/2021-12-31/2022-12-31"
    "?unitGroup=metric"
    "&elements=datetime%2CdatetimeEpoch%2Ctemp%2Cprecip%2Csnow%2Cwindspeed%2Cconditions"
    "&include=hours"
    f"&key={visual_crossing_api_key}"
    "&contentType=csv"
)
urlretrieve(url, f'{weather_output_dir}/2022.csv')

('../data/landing/weather/2022.csv',
 <http.client.HTTPMessage at 0x7fa768720a60>)

In [6]:
''' This code is for downloading shapefile data from tlc website '''

shapefile_output_dir = output_relative_dir + 'geopandas'

# Remote sources: the taxi zone lookup CSV and the zipped taxi zones archive.
url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv"
url2 = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip"

# Local destinations for the two downloads.
zone_lookup_path = shapefile_output_dir + '/taxi_zones.csv'
zone_archive_path = shapefile_output_dir + '/taxi_shapefile.zip'

urlretrieve(url, zone_lookup_path)
# Kept as the final expression so the cell still displays the
# (filename, headers) result of the last download.
urlretrieve(url2, zone_archive_path)

('../data/landing/geopandas/taxi_shapefile.zip',
 <http.client.HTTPMessage at 0x7fa76870bdc0>)

In [7]:
''' This code is to extract the zipfile downloaded, this is to access the shapefile
model inside the zipfile ''' 

import zipfile
import os
from io import BytesIO

# Specify paths
zip_file_path = "../data/landing/geopandas/taxi_shapefile.zip"
extracted_dir = "../data/landing/geopandas/"

# Extract the zip contents to the specified directory.
# The archive is an ordinary local file, so `zipfile` opens it directly by
# path. This avoids launching a Spark job and holding extra in-memory copies
# of the archive just to unzip it. No new Spark session is requested here
# either: `getOrCreate()` would only hand back the session that already exists.
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
    zip_ref.extractall(extracted_dir)

23/08/20 23:02:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                