# Data Access

In [0]:
# Configure OAuth (service principal) access to the ADLS Gen2 storage account.
# No credential is hardcoded: client id, client secret and tenant id are read
# from a Databricks secret scope at run time.
# TODO(review): SECRET_SCOPE and the three secret key names are placeholders —
# replace them with the scope/keys that actually exist in this workspace.
STORAGE_ACCOUNT = "nyctaxidataproject1"
SECRET_SCOPE = "nyctaxi-secrets"

client_id = dbutils.secrets.get(scope=SECRET_SCOPE, key="client-id")
client_secret = dbutils.secrets.get(scope=SECRET_SCOPE, key="client-secret")
tenant_id = dbutils.secrets.get(scope=SECRET_SCOPE, key="tenant-id")

# ABFS settings are keyed per storage account: the account host must follow the
# setting name, otherwise the key contains an empty segment ("type..dfs.core...").
account_host = f"{STORAGE_ACCOUNT}.dfs.core.windows.net"

spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "OAuth")
spark.conf.set(
    f"fs.azure.account.oauth.provider.type.{account_host}",
    "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
)
spark.conf.set(f"fs.azure.account.oauth2.client.id.{account_host}", client_id)
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{account_host}", client_secret)
# The token endpoint is tenant-specific, so the tenant id goes into the URL path.
spark.conf.set(
    f"fs.azure.account.oauth2.client.endpoint.{account_host}",
    f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
)
     

# Database Creation

In [0]:
%sql
-- Idempotent: re-running the notebook must not fail when the database already exists.
CREATE DATABASE IF NOT EXISTS golddatabase

# Data Reading and Writing and Creating Delta Tables

In [0]:
# List the top-level entries of the silver container to see which datasets are available.
silver_container_uri = 'abfss://silver@nyctaxidataproject1.dfs.core.windows.net'
dbutils.fs.ls(silver_container_uri)

[FileInfo(path='abfss://silver@nyctaxidataproject1.dfs.core.windows.net/trip_type/', name='trip_type/', size=0, modificationTime=1764841031000),
 FileInfo(path='abfss://silver@nyctaxidataproject1.dfs.core.windows.net/trip_zone/', name='trip_zone/', size=0, modificationTime=1764841033000),
 FileInfo(path='abfss://silver@nyctaxidataproject1.dfs.core.windows.net/trips2024data/', name='trips2024data/', size=0, modificationTime=1764841880000)]

# Data Reading

**Importing Libraries**

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

## Reading Parquet Data

**Trip Zone**

**Variables**

In [0]:
# Root URIs of the two lake layers used below:
#   silver - source of the Parquet datasets that are read
#   gold   - destination of the Delta outputs that are written
silver, gold = (
    "abfss://silver@nyctaxidataproject1.dfs.core.windows.net",
    "abfss://gold@nyctaxidataproject1.dfs.core.windows.net",
)

In [0]:
# Load the trip-zone dataset from the silver layer.
# Parquet files embed their own schema, so the CSV reader settings
# 'inferSchema' and 'header' are not needed (they have no effect on Parquet).
df_zone = spark.read.parquet(f'{silver}/trip_zone')

In [0]:
# Render the trip-zone DataFrame as an interactive table.
# NOTE(review): unlike df_type and df_trips, no write of df_zone to the gold
# layer is visible in this section — confirm whether that is intentional.
display(df_zone)

LocationID,Borough,Zone,service_zone,zone1,zone2
1,EWR,Newark Airport,EWR,Newark Airport,
2,Queens,Jamaica Bay,Boro Zone,Jamaica Bay,
3,Bronx,Allerton/Pelham Gardens,Boro Zone,Allerton,Pelham Gardens
4,Manhattan,Alphabet City,Yellow Zone,Alphabet City,
5,Staten Island,Arden Heights,Boro Zone,Arden Heights,
6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone,Arrochar,Fort Wadsworth
7,Queens,Astoria,Boro Zone,Astoria,
8,Queens,Astoria Park,Boro Zone,Astoria Park,
9,Queens,Auburndale,Boro Zone,Auburndale,
10,Queens,Baisley Park,Boro Zone,Baisley Park,


**Trip Type**

In [0]:
# Load the trip-type dataset from the silver layer.
# Parquet files embed their own schema, so the CSV reader settings
# 'inferSchema' and 'header' are not needed (they have no effect on Parquet).
df_type = spark.read.parquet(f'{silver}/trip_type')

In [0]:
# Render the trip-type DataFrame as an interactive table.
display(df_type)

trip_type,trip_description
1,Street-hail
2,Dispatch


In [0]:
# Persist the trip-type data to the gold layer in Delta format,
# replacing whatever is already stored at that path.
(
    df_type.write
    .mode("overwrite")
    .format("delta")
    .save(f"{gold}/trip_type")
)


**Trips Data**

In [0]:
# Load the 2024 trips dataset from the silver layer.
# Parquet files embed their own schema, so the CSV reader settings
# 'inferSchema' and 'header' are not needed (they have no effect on Parquet).
df_trips = spark.read.parquet(f'{silver}/trips2024data')

In [0]:
# Render the trips DataFrame as an interactive table.
display(df_trips)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,trip_pickup_date,trip_pickup_year,trip_pickup_month,trip_dropoff_date,trip_dropoff_year,trip_dropoff_month
2,2024-10-01T00:30:44,2024-10-01T00:48:26,1,3.0,1,N,162,246,1,18.4,1.0,0.5,1.5,0.0,1.0,24.9,2.5,0.0,2024-10-01,2024,10,2024-10-01,2024,10
1,2024-10-01T00:12:20,2024-10-01T00:25:25,1,2.2,1,N,48,236,1,14.2,3.5,0.5,3.8,0.0,1.0,23.0,2.5,0.0,2024-10-01,2024,10,2024-10-01,2024,10
1,2024-10-01T00:04:46,2024-10-01T00:13:52,1,2.7,1,N,142,24,1,13.5,3.5,0.5,3.7,0.0,1.0,22.2,2.5,0.0,2024-10-01,2024,10,2024-10-01,2024,10
1,2024-10-01T00:12:10,2024-10-01T00:23:01,1,3.1,1,N,233,75,1,14.2,3.5,0.5,2.0,0.0,1.0,21.2,2.5,0.0,2024-10-01,2024,10,2024-10-01,2024,10
1,2024-10-01T00:30:22,2024-10-01T00:30:39,1,0.0,1,N,262,262,3,3.0,3.5,0.5,0.0,0.0,1.0,8.0,2.5,0.0,2024-10-01,2024,10,2024-10-01,2024,10
2,2024-10-01T00:31:20,2024-10-01T00:36:00,2,0.97,1,N,137,137,1,7.2,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.0,2024-10-01,2024,10,2024-10-01,2024,10
1,2024-10-01T00:42:57,2024-10-01T00:49:01,1,1.3,1,N,142,48,1,7.9,3.5,0.5,2.55,0.0,1.0,15.45,2.5,0.0,2024-10-01,2024,10,2024-10-01,2024,10
1,2024-10-01T00:59:55,2024-10-01T01:02:24,1,0.5,1,N,230,161,1,5.1,3.5,0.5,2.0,0.0,1.0,12.1,2.5,0.0,2024-10-01,2024,10,2024-10-01,2024,10
1,2024-10-01T00:00:47,2024-10-01T00:04:22,0,1.1,1,N,142,237,1,7.2,3.5,0.5,3.0,0.0,1.0,15.2,2.5,0.0,2024-10-01,2024,10,2024-10-01,2024,10
1,2024-10-01T00:17:36,2024-10-01T00:26:22,1,2.2,1,N,162,145,1,11.4,3.5,0.5,3.3,0.0,1.0,19.7,2.5,0.0,2024-10-01,2024,10,2024-10-01,2024,10


In [0]:
# Persist the trips data to the gold layer in Delta format,
# replacing whatever is already stored at that path.
(
    df_trips.write
    .mode("overwrite")
    .format("delta")
    .save(f"{gold}/trips2024data")
)
