In [1]:
import requests
from pathlib import Path

# URL of the Taxi Zones CSV (raw GitHub gist URL pinned to a specific revision)
url = "https://gist.githubusercontent.com/shwina/72d79165ce9605d8f6e3378ae717b16b/raw/84a47bc587c99c6736f38a97f9dcc32ba8f89b05/taxi_zones.csv"

# Destination path inside the Lakehouse
output_path = "/lakehouse/default/Files/bronze/taxi/taxi_zone_lookup/taxi_zones.csv"

# Make sure the target folder exists; write_bytes() does not create parents.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Explicit timeout: without it a stalled connection blocks the cell forever.
response = requests.get(url, timeout=60)
# Fail loudly on 4xx/5xx instead of saving an error page as the CSV.
response.raise_for_status()

Path(output_path).write_bytes(response.content)

print("File downloaded to:", output_path)



StatementMeta(, bbcdb8a1-9ed1-4146-bf70-4f1ef2b5d325, 3, Finished, Available, Finished)

File downloaded to: /lakehouse/default/Files/bronze/taxi/taxi_zone_lookup/taxi_zones.csv


In [2]:
# Load the downloaded CSV as-is: first row is the header, column types are inferred.
zones_raw = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Files/bronze/taxi/taxi_zone_lookup/taxi_zones.csv")
)

# Quick look at the first few rows.
display(zones_raw.limit(5))


StatementMeta(, bbcdb8a1-9ed1-4146-bf70-4f1ef2b5d325, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, d5781956-79c3-4821-8876-afddccc91db8)

In [3]:
# Sanity check: number of data rows in the zone CSV.
(
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("Files/bronze/taxi/taxi_zone_lookup/taxi_zones.csv")
    .count()
)


StatementMeta(, bbcdb8a1-9ed1-4146-bf70-4f1ef2b5d325, 5, Finished, Available, Finished)

263

In [4]:
from pyspark.sql.functions import col

# Reduce the zone CSV to the id, the labels and the x/y columns,
# exposing x/y as double-typed zone_lon/zone_lat.
df_zone_centroids = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("Files/bronze/taxi/taxi_zone_lookup/taxi_zones.csv")
    .select(
        col("LocationID").alias("zone_id"),
        col("zone").alias("zone_name"),
        "borough",
        col("x").astype("double").alias("zone_lon"),
        col("y").astype("double").alias("zone_lat"),
    )
)

df_zone_centroids.show(n=5)


StatementMeta(, bbcdb8a1-9ed1-4146-bf70-4f1ef2b5d325, 6, Finished, Available, Finished)

+-------+--------------------+-------------+------------------+------------------+
|zone_id|           zone_name|      borough|          zone_lon|          zone_lat|
+-------+--------------------+-------------+------------------+------------------+
|      1|      Newark Airport|          EWR|-74.17400027276298| 40.69183120640149|
|      2|         Jamaica Bay|       Queens|-73.83129854302214| 40.61674529165988|
|      3|Allerton/Pelham G...|        Bronx|-73.84742223236718| 40.86447368477543|
|      4|       Alphabet City|    Manhattan|-73.97696825691767| 40.72375214158478|
|      5|       Arden Heights|Staten Island|-74.18848410184931|40.552659286945655|
+-------+--------------------+-------------+------------------+------------------+
only showing top 5 rows



In [33]:
# Persist the zone table as a managed Delta table, replacing any previous version.
df_zone_centroids.write.saveAsTable(
    "df_zone_centroids",
    format="delta",
    mode="overwrite",
)

StatementMeta(, 6863d103-d697-46d2-b000-4a485aa824f3, 40, Finished, Available, Finished)

In [5]:
# Preview the saved table. show() prints only 20 rows by default, so the row
# count is passed explicitly to actually display everything the LIMIT returns.
spark.sql("""
SELECT * FROM df_zone_centroids LIMIT 50;
""").show(50)

StatementMeta(, bbcdb8a1-9ed1-4146-bf70-4f1ef2b5d325, 7, Finished, Available, Finished)

+-------+--------------------+-------------+------------------+------------------+
|zone_id|           zone_name|      borough|          zone_lon|          zone_lat|
+-------+--------------------+-------------+------------------+------------------+
|      1|      Newark Airport|          EWR|-74.17400027276298| 40.69183120640149|
|      2|         Jamaica Bay|       Queens|-73.83129854302214| 40.61674529165988|
|      3|Allerton/Pelham G...|        Bronx|-73.84742223236718| 40.86447368477543|
|      4|       Alphabet City|    Manhattan|-73.97696825691767| 40.72375214158478|
|      5|       Arden Heights|Staten Island|-74.18848410184931|40.552659286945655|
|      6|Arrochar/Fort Wad...|Staten Island|-74.07177055895137| 40.60032414603462|
|      7|             Astoria|       Queens|-73.91969431946073| 40.76149256216376|
|      8|        Astoria Park|       Queens| -73.9230861575075|40.778558653750714|
|      9|          Auburndale|       Queens| -73.7879488777789|40.751035025572364|
|   

In [6]:
from pyspark.sql import functions as F

def haversine(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Great-circle distance between two points using the haversine formula.

    All four coordinates must already be in RADIANS; no degree conversion is
    done here. The expression is built from ``F.sin``/``F.cos``/``F.asin``/
    ``F.sqrt``/``F.pow``, so the arguments are expected to be Spark column
    expressions and the result is a column expression as well.

    Args:
        lat1: Latitude of the first point, in radians.
        lon1: Longitude of the first point, in radians.
        lat2: Latitude of the second point, in radians.
        lon2: Longitude of the second point, in radians.
        radius_km: Sphere radius that scales the result. Defaults to 6371
            (mean Earth radius in kilometres), which yields kilometres.

    Returns:
        The distance expression, in the same unit as ``radius_km``.
    """
    # Haversine term: sin^2(dlat / 2) + cos(lat1) * cos(lat2) * sin^2(dlon / 2)
    half_chord_sq = (
        F.pow(F.sin((lat2 - lat1) / 2), 2) +
        F.cos(lat1) * F.cos(lat2) *
        F.pow(F.sin((lon2 - lon1) / 2), 2)
    )
    # Central angle is 2 * asin(sqrt(term)); multiply by the radius for distance.
    return radius_km * 2 * F.asin(F.sqrt(half_chord_sq))


StatementMeta(, bbcdb8a1-9ed1-4146-bf70-4f1ef2b5d325, 8, Finished, Available, Finished)

In [7]:
# Load the df_air_locations table from the metastore.
df_air_locations = spark.table("df_air_locations")


StatementMeta(, bbcdb8a1-9ed1-4146-bf70-4f1ef2b5d325, 9, Finished, Available, Finished)

In [8]:
# haversine() works on radians, so add radian versions of the coordinates
# next to the original degree columns on both sides.
df_air = df_air_locations.withColumn("lat_rad", F.radians(F.col("lat")))
df_air = df_air.withColumn("lon_rad", F.radians(F.col("lon")))

df_zone = df_zone_centroids.withColumn("zone_lat_rad", F.radians(F.col("zone_lat")))
df_zone = df_zone.withColumn("zone_lon_rad", F.radians(F.col("zone_lon")))


StatementMeta(, bbcdb8a1-9ed1-4146-bf70-4f1ef2b5d325, 10, Finished, Available, Finished)

In [9]:
# Pair every air location with every zone and attach the distance between them.
df_dist = df_air.crossJoin(df_zone).withColumn(
    "distance_km",
    haversine(
        lat1=F.col("lat_rad"),
        lon1=F.col("lon_rad"),
        lat2=F.col("zone_lat_rad"),
        lon2=F.col("zone_lon_rad"),
    ),
)


StatementMeta(, bbcdb8a1-9ed1-4146-bf70-4f1ef2b5d325, 11, Finished, Available, Finished)

In [10]:
from pyspark.sql.window import Window

# Rank the candidate zones of each air location from nearest to farthest.
# An ascending sort puts NULLs first by default, which would let a NULL
# distance win rank 1, so NULLs are pushed last explicitly. zone_id breaks
# exact distance ties so the selected zone is the same on every run.
w = Window.partitionBy("air_location_id").orderBy(
    F.col("distance_km").asc_nulls_last(),
    F.col("zone_id").asc(),
)

# Keep only the top-ranked (closest) zone per air location.
df_air_with_zone = (
    df_dist
    .withColumn("rn", F.row_number().over(w))
    .filter(F.col("rn") == 1)
    .drop("rn")
)


StatementMeta(, bbcdb8a1-9ed1-4146-bf70-4f1ef2b5d325, 12, Finished, Available, Finished)

In [12]:
# Show the matched zone and its distance for each air location, untruncated.
(
    df_air_with_zone
    .select(
        [
            "air_location_id",
            "air_location_name",
            "zone_id",
            "zone_name",
            "borough",
            "distance_km",
        ]
    )
    .show(truncate=False)
)


StatementMeta(, bbcdb8a1-9ed1-4146-bf70-4f1ef2b5d325, 14, Finished, Available, Finished)

+---------------+------------------+-------+---------------------------------------------+-------------+------------------+
|air_location_id|air_location_name |zone_id|zone_name                                    |borough      |distance_km       |
+---------------+------------------+-------+---------------------------------------------+-------------+------------------+
|384            |CCNY              |152    |Manhattanville                               |Manhattan    |0.5151923108993799|
|392            |Babylon           |101    |Glen Oaks                                    |Queens       |25.71569310683428 |
|625            |Manhattan/IS143   |120    |Highbridge Park                              |Manhattan    |0.3164821951255799|
|626            |Bronx - IS52      |147    |Longwood                                     |Bronx        |0.482161972568944 |
|628            |Maspeth           |157    |Maspeth                                      |Queens       |0.8266394652509503|
|631    