# Data Access

In [0]:
# Configure OAuth 2.0 client-credentials access to the ADLS Gen2 storage account
# for this Spark session.
#
# The service principal's client secret is read from a Databricks secret scope
# rather than being written into the notebook. Replace the <secret-scope> and
# <service-credential-key> placeholders with the scope and key that hold the secret.
service_credential = dbutils.secrets.get(scope="<secret-scope>", key="<service-credential-key>")

spark.conf.set("fs.azure.account.auth.type.{storage-account}.dfs.core.windows.net", "OAuth")
spark.conf.set("fs.azure.account.oauth.provider.type.{storage-account}.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
spark.conf.set("fs.azure.account.oauth2.client.id.{storage-account}.dfs.core.windows.net", "<application-id>")
# Pass the secret value itself, not the literal text "service_credential".
spark.conf.set("fs.azure.account.oauth2.client.secret.{storage-account}.dfs.core.windows.net", service_credential)
spark.conf.set("fs.azure.account.oauth2.client.endpoint.{storage-account}.dfs.core.windows.net", "https://login.microsoftonline.com/<directory-id>/oauth2/token")

# Database Creation

In [0]:
%sql
-- Create the taxi_db database if it does not already exist; the Delta tables
-- written later in this notebook are saved under this database.
CREATE DATABASE IF NOT EXISTS taxi_db;

# Importing Libraries

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Data Reading

## 1. Trip Type Data

In [0]:
# Load the trip-type data from its Parquet folder in the storage account.
df_trip_type_data = (
    spark.read
    .format("parquet")
    .load("abfss://{container-name}@{storage-account}.dfs.core.windows.net/trip_type_data")
)

In [0]:
# Show the loaded trip-type DataFrame in the notebook output.
display(df_trip_type_data)

trip_type,trip_description
1,Street-hail
2,Dispatch


## 2. Trip Zone Data

In [0]:
# Load the trip-zone data from its Parquet folder in the storage account.
df_trip_zone_data = (
    spark.read
    .format("parquet")
    .load("abfss://{container-name}@{storage-account}.dfs.core.windows.net/trip_zone_data")
)

In [0]:
# Preview at most 30 rows of the trip-zone DataFrame.
display(df_trip_zone_data.limit(30))

<bound method DataFrame.limit of DataFrame[LocationID: int, Borough: string, Zone: string, service_zone: string, zone1: string, zone2: string]>

## 3. Trip 2025 Data

In [0]:
# Load the 2025 trip records from their Parquet folder in the storage account.
df_trip_2025_trip_data = (
    spark.read
    .format("parquet")
    .load("abfss://{container-name}@{storage-account}.dfs.core.windows.net/trip_2025_trip_data")
)

In [0]:
# Preview at most 30 rows of the 2025 trip DataFrame.
display(df_trip_2025_trip_data.limit(30))

VendorID,store_and_fwd_flag,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,trip_date,pick-up_time,drop-off_time
2,N,25,216,1,9.34,44.3,1.0,0.5,0.0,0.0,,1.0,46.8,1,1.0,0.0,2025-05-01,00:17:04,00:56:06
2,N,160,129,1,2.95,16.3,1.0,0.5,0.0,0.0,,1.0,18.8,2,1.0,0.0,2025-05-01,00:56:16,01:10:26
1,N,260,179,1,3.0,18.4,1.0,1.5,0.0,0.0,,1.0,20.9,2,1.0,0.0,2025-05-01,00:24:49,00:42:29
2,N,130,216,1,1.61,9.3,1.0,0.5,0.0,0.0,,1.0,11.8,2,1.0,0.0,2025-05-01,00:27:11,00:33:21
2,N,244,151,2,3.44,15.6,1.0,0.5,4.52,0.0,,1.0,22.62,1,1.0,0.0,2025-05-01,00:32:59,00:41:34
2,N,42,41,1,0.66,6.5,1.0,0.5,2.0,0.0,,1.0,11.0,1,1.0,0.0,2025-04-30,23:58:57,00:02:31
2,N,240,265,1,1.63,9.3,1.0,0.5,0.0,0.0,,1.0,11.8,1,1.0,0.0,2025-05-01,00:38:03,00:43:28
2,N,129,70,1,2.15,13.5,1.0,0.5,0.0,0.0,,1.0,16.0,2,1.0,0.0,2025-05-01,00:13:48,00:26:19
2,N,244,42,1,2.87,15.6,1.0,0.5,0.0,0.0,,1.0,18.1,2,1.0,0.0,2025-05-01,00:08:00,00:22:00
2,N,75,262,1,1.52,10.7,1.0,0.5,2.39,0.0,,1.0,18.34,1,1.0,2.75,2025-05-01,00:48:03,00:57:01


# Writing to Delta Tables

## 1. Trip Type Data

In [0]:
# Save the trip-type data as the Delta table taxi_db.trip_type_delta_data,
# using overwrite mode.
(
    df_trip_type_data.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("taxi_db.trip_type_delta_data")
)

## 2. Trip Zone Data

In [0]:
# Save the trip-zone data as the Delta table taxi_db.trip_zone_delta_data,
# using overwrite mode.
(
    df_trip_zone_data.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("taxi_db.trip_zone_delta_data")
)

## 3. Trip 2025 Data

In [0]:
# Save the 2025 trip records as the Delta table taxi_db.trip_2025_delta_data,
# using overwrite mode.
(
    df_trip_2025_trip_data.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("taxi_db.trip_2025_delta_data")
)

# Delta Tables

In [0]:
%sql
-- Describe the taxi_db database, then list the tables it contains.
-- NOTE(review): the output recorded for this cell contains only the SHOW TABLES
-- result; confirm whether the DESCRIBE output is needed (it may need its own cell).
DESCRIBE DATABASE taxi_db;
    
SHOW TABLES IN taxi_db;

database,tableName,isTemporary
taxi_db,trip_2025_delta_data,False
taxi_db,trip_type_delta_data,False
taxi_db,trip_zone_delta_data,False


## 1. Trip Type Data Delta Table

In [0]:
%sql
-- Read back every row of the trip-type Delta table to check the write.
SELECT * FROM taxi_db.trip_type_delta_data;

trip_type,trip_description
1,Street-hail
2,Dispatch


## 2. Trip Zone Data Delta Table

In [0]:
%sql
-- Preview up to 30 rows of the trip-zone Delta table.
SELECT *
FROM taxi_db.trip_zone_delta_data
LIMIT 30;

LocationID,Borough,Zone,service_zone,zone1,zone2
1,EWR,Newark Airport,EWR,Newark Airport,
2,Queens,Jamaica Bay,Boro Zone,Jamaica Bay,
3,Bronx,Allerton/Pelham Gardens,Boro Zone,Allerton,Pelham Gardens
4,Manhattan,Alphabet City,Yellow Zone,Alphabet City,
5,Staten Island,Arden Heights,Boro Zone,Arden Heights,
6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone,Arrochar,Fort Wadsworth
7,Queens,Astoria,Boro Zone,Astoria,
8,Queens,Astoria Park,Boro Zone,Astoria Park,
9,Queens,Auburndale,Boro Zone,Auburndale,
10,Queens,Baisley Park,Boro Zone,Baisley Park,


## 3. Trip 2025 Data Delta Table

In [0]:
%sql
-- Preview up to 30 rows of the 2025 trip Delta table.
SELECT *
FROM taxi_db.trip_2025_delta_data
LIMIT 30;

VendorID,store_and_fwd_flag,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,trip_date,pick-up_time,drop-off_time
2,N,25,216,1,9.34,44.3,1.0,0.5,0.0,0.0,,1.0,46.8,1,1,0.0,2025-05-01,00:17:04,00:56:06
2,N,160,129,1,2.95,16.3,1.0,0.5,0.0,0.0,,1.0,18.8,2,1,0.0,2025-05-01,00:56:16,01:10:26
1,N,260,179,1,3.0,18.4,1.0,1.5,0.0,0.0,,1.0,20.9,2,1,0.0,2025-05-01,00:24:49,00:42:29
2,N,130,216,1,1.61,9.3,1.0,0.5,0.0,0.0,,1.0,11.8,2,1,0.0,2025-05-01,00:27:11,00:33:21
2,N,244,151,2,3.44,15.6,1.0,0.5,4.52,0.0,,1.0,22.62,1,1,0.0,2025-05-01,00:32:59,00:41:34
2,N,42,41,1,0.66,6.5,1.0,0.5,2.0,0.0,,1.0,11.0,1,1,0.0,2025-04-30,23:58:57,00:02:31
2,N,240,265,1,1.63,9.3,1.0,0.5,0.0,0.0,,1.0,11.8,1,1,0.0,2025-05-01,00:38:03,00:43:28
2,N,129,70,1,2.15,13.5,1.0,0.5,0.0,0.0,,1.0,16.0,2,1,0.0,2025-05-01,00:13:48,00:26:19
2,N,244,42,1,2.87,15.6,1.0,0.5,0.0,0.0,,1.0,18.1,2,1,0.0,2025-05-01,00:08:00,00:22:00
2,N,75,262,1,1.52,10.7,1.0,0.5,2.39,0.0,,1.0,18.34,1,1,2.75,2025-05-01,00:48:03,00:57:01
