## Data Access

In [0]:
# Configure OAuth (service principal) access to the ADLS Gen2 storage account.
#
# The previous version of this cell had no storage-account name inside the
# config keys (they read "...type..dfs.core.windows.net"), empty client
# id/secret values and no tenant id in the token endpoint, so the settings could
# not authenticate against the account the rest of this notebook reads from.
#
# Credentials are fetched from a Databricks secret scope at run time, so nothing
# sensitive is stored in the notebook itself.
# NOTE(review): SECRET_SCOPE and the three key names are placeholders - replace
# them with the scope/keys configured in your workspace.
STORAGE_ACCOUNT = "nyctaxidataproject1"
SECRET_SCOPE = "nyctaxi"

client_id = dbutils.secrets.get(scope=SECRET_SCOPE, key="client-id")
client_secret = dbutils.secrets.get(scope=SECRET_SCOPE, key="client-secret")
tenant_id = dbutils.secrets.get(scope=SECRET_SCOPE, key="tenant-id")

# Every setting is scoped to one storage account via this key suffix.
account_suffix = f"{STORAGE_ACCOUNT}.dfs.core.windows.net"

oauth_settings = {
    f"fs.azure.account.auth.type.{account_suffix}": "OAuth",
    f"fs.azure.account.oauth.provider.type.{account_suffix}": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    f"fs.azure.account.oauth2.client.id.{account_suffix}": client_id,
    f"fs.azure.account.oauth2.client.secret.{account_suffix}": client_secret,
    f"fs.azure.account.oauth2.client.endpoint.{account_suffix}": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

for conf_key, conf_value in oauth_settings.items():
    spark.conf.set(conf_key, conf_value)

In [0]:
# List the root of the bronze container to confirm the storage connection works.
bronze_root = 'abfss://bronze@nyctaxidataproject1.dfs.core.windows.net'
dbutils.fs.ls(bronze_root)

## Data Reading

**Importing Libraries**

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

### Reading CSV Data

**Trip Type Data**

In [0]:
# Read the trip_type CSV from the bronze container: the first row is used as the
# header and Spark infers the column types.
trip_type_path = "abfss://bronze@nyctaxidataproject1.dfs.core.windows.net/trip_type"
df_trip_type = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(trip_type_path)
)

In [0]:
# Render the freshly loaded trip-type frame in the notebook output.
display(df_trip_type)

**Trip Zone Data**

In [0]:
# Read the trip_zone CSV from the bronze container: the first row is used as the
# header and Spark infers the column types.
trip_zone_path = "abfss://bronze@nyctaxidataproject1.dfs.core.windows.net/trip_zone"
df_trip_zone = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(trip_zone_path)
)

In [0]:
# Render the freshly loaded trip-zone frame in the notebook output.
display(df_trip_zone)

**Trip Data**

In [0]:
# Load the Parquet files under bronze/tripsdata_2024.
# Parquet files carry their own schema and have no header row, so the CSV-only
# "inferSchema" and "header" options (which the Parquet reader ignores) were
# removed. "recursiveFileLookup" is kept so files in nested folders are included.
df_trip = (
    spark.read
    .format("parquet")
    .option("recursiveFileLookup", True)
    .load("abfss://bronze@nyctaxidataproject1.dfs.core.windows.net/tripsdata_2024")
)
               

In [0]:
# Render the freshly loaded trip frame in the notebook output.
display(df_trip)

## Data Transformation

**Taxi trip type**

In [0]:
# Show the trip-type frame before it is transformed.
display(df_trip_type)

In [0]:
# Rename the 'description' column to 'trip_description', then show the result.
df_trip_type = df_trip_type.withColumnRenamed(
    existing='description',
    new='trip_description',
)
display(df_trip_type)

In [0]:
# Persist df_trip_type to the silver container as Parquet.
# 'overwrite' replaces the 'append' mode used before: with append, every re-run
# of the notebook wrote the whole frame again and duplicated its rows in silver.
# Overwriting keeps this cell idempotent.
(
    df_trip_type.write
    .format('parquet')
    .mode('overwrite')
    .save("abfss://silver@nyctaxidataproject1.dfs.core.windows.net/trip_type")
)

**Trip Zone**

In [0]:
# Show the trip-zone frame before it is transformed.
display(df_trip_zone)

In [0]:
# Split 'Zone' on '/' once and reuse the expression: element 0 becomes 'zone1'
# and element 1 becomes 'zone2'.
zone_parts = split(col('Zone'), '/')
df_trip_zone = (
    df_trip_zone
    .withColumn('zone1', zone_parts.getItem(0))
    .withColumn('zone2', zone_parts.getItem(1))
)
display(df_trip_zone)


In [0]:
# Persist df_trip_zone to the silver container as Parquet.
# 'overwrite' replaces the 'append' mode used before: with append, every re-run
# of the notebook wrote the whole frame again and duplicated its rows in silver.
# Overwriting keeps this cell idempotent.
(
    df_trip_zone.write
    .format('parquet')
    .mode('overwrite')
    .save("abfss://silver@nyctaxidataproject1.dfs.core.windows.net/trip_zone")
)

**Trip Data**

In [0]:
# Show the trip frame before the date columns are derived.
display(df_trip)

In [0]:
# Derive date, year and month columns from 'tpep_pickup_datetime'.
pickup_ts = col('tpep_pickup_datetime')
df_trip = (
    df_trip
    .withColumn('trip_pickup_date', to_date(pickup_ts))
    .withColumn('trip_pickup_year', year(pickup_ts))
    .withColumn('trip_pickup_month', month(pickup_ts))
)
                 
                 

In [0]:
# Show the trip frame with the pickup date columns added.
display(df_trip)

In [0]:
# Derive date, year and month columns from 'tpep_dropoff_datetime'.
dropoff_ts = col('tpep_dropoff_datetime')
df_trip = (
    df_trip
    .withColumn('trip_dropoff_date', to_date(dropoff_ts))
    .withColumn('trip_dropoff_year', year(dropoff_ts))
    .withColumn('trip_dropoff_month', month(dropoff_ts))
)
                 
                 

In [0]:
# Show the trip frame with the dropoff date columns added.
display(df_trip)

In [0]:
# Persist df_trip to the silver container as Parquet.
# 'overwrite' replaces the 'append' mode used before: df_trip is rebuilt from
# the full bronze/tripsdata_2024 folder on every run, so appending wrote the
# same trips again each time the notebook was re-executed.
(
    df_trip.write
    .format('parquet')
    .mode('overwrite')
    .save("abfss://silver@nyctaxidataproject1.dfs.core.windows.net/trips2024data")
)

## Analysis

In [0]:
# Render the transformed trip frame as the starting point for analysis.
df_trip.display()