# Data Access

In [0]:

spark.conf.set("fs.azure.account.auth.type..dfs.core.windows.net", "OAuth")
spark.conf.set("fs.azure.account.oauth.provider.type..dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
spark.conf.set("fs.azure.account.oauth2.client.id..dfs.core.windows.net", "")
spark.conf.set("fs.azure.account.oauth2.client.secret..dfs.core.windows.net", "")
spark.conf.set("fs.azure.account.oauth2.client.endpoint..dfs.core.windows.net", "https://login.microsoftonline.com//oauth2/token")
# Azure OAuth configuration - Credentials stored securely in Databricks secrets
# Actual client_id, client_secret and tenant_id removed for security

# Datebase Creation

In [0]:
%sql
CREATE DATABASE IF NOT EXISTS golddatabase

# Reading Silver layer data

In [0]:
# Read from Silver layer
silver_path = "abfss://silver@nyctaxidataproject1.dfs.core.windows.net"

df_trip_type = spark.read.parquet(f"{silver_path}/trip_type")
df_trip_zone = spark.read.parquet(f"{silver_path}/trip_zone")  
df_trips = spark.read.parquet(f"{silver_path}/trips2024data")

## Save to Gold Storage Container

## Importing Libraries

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

## Create Dimension & Fact Tables (Databricks Catalog + Gold Storage)

#### SAVE TO DATABRICKS CATALOG (for Power BI/Querying)

In [0]:
# Trip Type Dimension
df_trip_type.write.format("delta")\
    .mode("overwrite")\
    .saveAsTable("golddatabase.dim_trip_type")

# Trip Zone Dimension  
df_trip_zone.write.format("delta")\
    .mode("overwrite")\
    .saveAsTable("golddatabase.dim_trip_zone")

In [0]:
# Create fact table with partitioning
df_trips.write.format("delta")\
    .mode("overwrite")\
    .partitionBy("trip_pickup_month")\
    .option("delta.autoOptimize.optimizeWrite", "true")\
    .option("delta.autoOptimize.autoCompact", "true")\
    .saveAsTable("golddatabase.fact_trips")

#### ALSO SAVE TO GOLD STORAGE CONTAINER (for backup/archival)

In [0]:
gold_path = "abfss://gold@nyctaxidataproject1.dfs.core.windows.net"

df_trip_type.write.format("delta")\
    .mode("overwrite")\
    .save(f"{gold_path}/dim_trip_type")

df_trip_zone.write.format("delta")\
    .mode("overwrite")\
    .save(f"{gold_path}/dim_trip_zone")

df_trips.write.format("delta")\
    .mode("overwrite")\
    .partitionBy("trip_pickup_month")\
    .save(f"{gold_path}/fact_trips")

In [0]:
%sql
-- Optimize table for query performance
OPTIMIZE golddatabase.fact_trips ZORDER BY (PULocationID, DOLocationID, trip_pickup_date);

-- Create statistics for query optimizer
ANALYZE TABLE golddatabase.fact_trips COMPUTE STATISTICS FOR ALL COLUMNS;

In [0]:
%sql
-- Monthly summary view
CREATE OR REPLACE VIEW golddatabase.vw_monthly_summary AS
SELECT 
    trip_pickup_year,
    trip_pickup_month,
    COUNT(*) as total_trips,
    SUM(fare_amount) as total_revenue,
    AVG(trip_duration_minutes) as avg_duration,
    AVG(tip_percentage) as avg_tip_percentage
FROM golddatabase.fact_trips
WHERE is_valid_fare = TRUE AND is_valid_passenger_count = TRUE
GROUP BY trip_pickup_year, trip_pickup_month
ORDER BY trip_pickup_year, trip_pickup_month;

In [0]:
%sql
SHOW DATABASES;

databaseName
default
golddatabase
information_schema


In [0]:
%sql
SHOW TABLES IN golddatabase;

database,tableName,isTemporary
golddatabase,dim_trip_type,False
golddatabase,dim_trip_zone,False
golddatabase,fact_trips,False
golddatabase,vw_monthly_summary,False
,_sqldf,True


In [0]:
%sql
DESCRIBE DETAIL golddatabase.fact_trips;

format,id,name,description,location,createdAt,lastModified,partitionColumns,clusteringColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures,statistics,clusterByAuto
delta,dc2a5135-36db-4c14-a92f-8f287df2d591,nyctaxi_databricks.golddatabase.fact_trips,,abfss://unity-catalog-storage@dbstoragewfig4i4coksi6.dfs.core.windows.net/60489139400205/__unitystorage/catalogs/56d8103b-a56e-4809-ad86-c34e1240d76a/tables/b1b220cf-3de0-47eb-8efb-fbd49f86332f,2025-12-07T21:59:00.865Z,2025-12-07T22:01:33Z,List(trip_pickup_month),List(),12,889071410,"Map(delta.autoOptimize.autoCompact -> true, delta.autoOptimize.optimizeWrite -> true, delta.parquet.compression.codec -> zstd, delta.enableDeletionVectors -> true)",3,7,"List(appendOnly, deletionVectors, invariants, timestampNtz)","Map(numRowsDeletedByDeletionVectors -> 0, numDeletionVectors -> 0)",False


In [0]:
%sql
SELECT * FROM golddatabase.fact_trips LIMIT 10;

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,trip_pickup_date,trip_pickup_year,trip_pickup_month,trip_dropoff_date,trip_dropoff_year,trip_dropoff_month,trip_duration_minutes,pickup_hour,time_of_day,is_valid_fare,is_valid_passenger_count,tip_percentage,revenue_per_mile,data_quality_issue
1,2024-06-01T00:03:46,2024-06-01T00:31:23,1,12.5,1,N,138,195,1,48.5,7.75,0.5,11.55,0.0,1.0,69.3,0.0,1.75,2024-06-01,2024,6,2024-06-01,2024,6,27.616666666666667,0,Night,True,True,23.81443298969072,3.88,
2,2024-06-01T00:55:22,2024-06-01T01:08:24,1,4.34,1,N,138,7,1,20.5,6.0,0.5,8.4,0.0,1.0,38.15,0.0,1.75,2024-06-01,2024,6,2024-06-01,2024,6,13.033333333333331,0,Night,True,True,40.97560975609756,4.723502304147465,
1,2024-06-01T00:23:53,2024-06-01T00:32:35,1,1.3,1,N,166,41,1,10.0,1.0,0.5,3.1,0.0,1.0,15.6,0.0,0.0,2024-06-01,2024,6,2024-06-01,2024,6,8.7,0,Night,True,True,31.0,7.692307692307692,
1,2024-06-01T00:32:24,2024-06-01T00:40:06,1,1.2,1,N,148,114,1,8.6,3.5,0.5,0.2,0.0,1.0,13.8,2.5,0.0,2024-06-01,2024,6,2024-06-01,2024,6,7.7,0,Night,True,True,2.3255813953488373,7.166666666666667,
1,2024-06-01T00:51:38,2024-06-01T00:58:17,1,1.0,1,N,148,249,1,7.2,3.5,0.5,2.0,0.0,1.0,14.2,2.5,0.0,2024-06-01,2024,6,2024-06-01,2024,6,6.65,0,Night,True,True,27.77777777777778,7.2,
2,2024-06-01T00:26:13,2024-06-01T00:37:21,1,1.5,1,N,48,229,1,11.4,1.0,0.5,2.0,0.0,1.0,18.4,2.5,0.0,2024-06-01,2024,6,2024-06-01,2024,6,11.133333333333333,0,Night,True,True,17.543859649122805,7.6,
2,2024-06-01T00:01:04,2024-06-01T00:57:48,1,18.41,2,N,132,48,1,70.0,0.0,0.5,0.15,6.94,1.0,82.84,2.5,1.75,2024-06-01,2024,6,2024-06-01,2024,6,56.73333333333333,0,Night,True,True,0.2142857142857143,3.802281368821293,
1,2024-06-01T00:43:55,2024-06-01T00:49:03,4,1.4,1,N,140,236,1,7.9,3.5,0.5,2.6,0.0,1.0,15.5,2.5,0.0,2024-06-01,2024,6,2024-06-01,2024,6,5.133333333333334,0,Night,True,True,32.91139240506329,5.642857142857143,
2,2024-06-01T00:00:09,2024-06-01T00:05:11,1,0.74,1,N,142,239,1,6.5,1.0,0.5,2.3,0.0,1.0,13.8,2.5,0.0,2024-06-01,2024,6,2024-06-01,2024,6,5.033333333333333,0,Night,True,True,35.38461538461538,8.783783783783784,
2,2024-06-01T00:16:07,2024-06-01T00:36:14,1,1.78,1,N,48,170,1,17.7,1.0,0.5,1.0,0.0,1.0,23.7,2.5,0.0,2024-06-01,2024,6,2024-06-01,2024,6,20.116666666666667,0,Night,True,True,5.649717514124294,9.9438202247191,


In [0]:
## Create Star Schema with Proper Relationships

# FIRST, CREATE FACT TABLE WITH PROPER FOREIGN KEYS

# Ensure we have correct column names for joins
fact_table = df_trips.select(
    # Fact measures
    col("VendorID"),
    col("tpep_pickup_datetime"),
    col("tpep_dropoff_datetime"),
    col("fare_amount"),
    col("tip_amount"),
    col("trip_distance"),
    col("trip_duration_minutes"),
    col("total_amount"),
    
    # Foreign keys to dimensions
    col("PULocationID").alias("pickup_location_id"),  # FK to dim_trip_zone
    col("DOLocationID").alias("dropoff_location_id"), # FK to dim_trip_zone
    col("RatecodeID").alias("trip_type_id"),          # FK to dim_trip_type
    
    # Date dimensions
    col("trip_pickup_date"),
    col("trip_pickup_year"),
    col("trip_pickup_month"),
    col("time_of_day"),
    
    # Quality flags
    col("is_valid_fare"),
    col("is_valid_passenger_count"),
    col("data_quality_issue")
)


In [0]:
# Pure Python me
spark.sql("DROP TABLE IF EXISTS golddatabase.fact_trips")

fact_table.write.format("delta")\
    .mode("overwrite")\
    .partitionBy("trip_pickup_month")\
    .saveAsTable("golddatabase.fact_trips")

In [0]:
%sql
-- VERIFY RELATIONSHIPS CAN BE JOINED

-- Test the star schema
SELECT 
    f.*,
    z1.standardized_borough as pickup_borough,
    z2.standardized_borough as dropoff_borough,
    tt.trip_description
FROM golddatabase.fact_trips f
LEFT JOIN golddatabase.dim_trip_zone z1 ON f.pickup_location_id = z1.LocationID
LEFT JOIN golddatabase.dim_trip_zone z2 ON f.dropoff_location_id = z2.LocationID
LEFT JOIN golddatabase.dim_trip_type tt ON f.trip_type_id = tt.trip_type
LIMIT 10;

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,fare_amount,tip_amount,trip_distance,trip_duration_minutes,total_amount,pickup_location_id,dropoff_location_id,trip_type_id,trip_pickup_date,trip_pickup_year,trip_pickup_month,time_of_day,is_valid_fare,is_valid_passenger_count,data_quality_issue,pickup_borough,dropoff_borough,trip_description
2,2024-10-01T00:30:44,2024-10-01T00:48:26,18.4,1.5,3.0,17.7,24.9,162,246,1,2024-10-01,2024,10,Night,True,True,,Manhattan,Manhattan,Street-hail
1,2024-10-01T00:12:20,2024-10-01T00:25:25,14.2,3.8,2.2,13.083333333333334,23.0,48,236,1,2024-10-01,2024,10,Night,True,True,,Manhattan,Manhattan,Street-hail
1,2024-10-01T00:04:46,2024-10-01T00:13:52,13.5,3.7,2.7,9.1,22.2,142,24,1,2024-10-01,2024,10,Night,True,True,,Manhattan,Manhattan,Street-hail
1,2024-10-01T00:12:10,2024-10-01T00:23:01,14.2,2.0,3.1,10.85,21.2,233,75,1,2024-10-01,2024,10,Night,True,True,,Manhattan,Manhattan,Street-hail
1,2024-10-01T00:30:22,2024-10-01T00:30:39,3.0,0.0,0.0,0.2833333333333333,8.0,262,262,1,2024-10-01,2024,10,Night,True,True,,Manhattan,Manhattan,Street-hail
2,2024-10-01T00:31:20,2024-10-01T00:36:00,7.2,2.44,0.97,4.666666666666667,14.64,137,137,1,2024-10-01,2024,10,Night,True,True,,Manhattan,Manhattan,Street-hail
1,2024-10-01T00:42:57,2024-10-01T00:49:01,7.9,2.55,1.3,6.066666666666666,15.45,142,48,1,2024-10-01,2024,10,Night,True,True,,Manhattan,Manhattan,Street-hail
1,2024-10-01T00:59:55,2024-10-01T01:02:24,5.1,2.0,0.5,2.4833333333333334,12.1,230,161,1,2024-10-01,2024,10,Night,True,True,,Manhattan,Manhattan,Street-hail
1,2024-10-01T00:00:47,2024-10-01T00:04:22,7.2,3.0,1.1,3.583333333333333,15.2,142,237,1,2024-10-01,2024,10,Night,True,False,Zero Passengers,Manhattan,Manhattan,Street-hail
1,2024-10-01T00:17:36,2024-10-01T00:26:22,11.4,3.3,2.2,8.766666666666667,19.7,162,145,1,2024-10-01,2024,10,Night,True,True,,Manhattan,Queens,Street-hail
