#### NYC DOT Real-Time Traffic Speed Data

- [Kaggle](https://www.kaggle.com/datasets/aadimator/nyc-realtime-traffic-speed-data/data)
- [Web Archive](https://web.archive.org/web/20221006005747/https://data.cityofnewyork.us/Transportation/Real-Time-Traffic-Speed-Data/qkm5-nuaq)

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = 8, 5
plt.rcParams['font.size'] = 12
plt.rcParams['savefig.format'] = 'pdf'
sns.set_style('darkgrid')

In [2]:
import polars as pl
import networkx as nx
from tqdm import tqdm

pl.Config.set_streaming_chunk_size(10_000)


# df = pl.read_parquet('DOT_Traffic_Speeds_NBE.parquet')  # Eager execution
df = pl.scan_parquet('DOT_Traffic_Speeds_NBE.parquet')  # Lazy execution

def get_lazy_shape(lf):
    rows = lf.select(pl.len()).collect()[0,0]
    cols = len(lf.collect_schema().names())
    return (rows, cols)

print(get_lazy_shape(df))
df.collect().head()

(64914523, 11)


ID,SPEED,TRAVEL_TIME,DATA_AS_OF,LINK_ID,LINK_POINTS,ENCODED_POLY_LINE,ENCODED_POLY_LINE_LVLS,OWNER,BOROUGH,LINK_NAME
i32,f32,f32,datetime[ns],i32,str,str,str,cat,cat,str
262,34.799999,359.0,2017-06-02 23:41:59,4616319,"""40.6332305,-74.016151 40.63391…","""ud_wF|gwbMgCCwATcBr@_BvAqDhGmG…","""BBBBBBBBBBBBBBBBBBBBBBBBBBBB""","""NYC_DOT_LIC""","""Brooklyn""","""GOW S 9TH STREET - 7TH AVENUE"""
204,55.919998,155.0,2017-06-02 23:41:59,4616320,"""40.7894406,-73.786291 40.7891…","""_u}wFhkjaMr@dI~A~HtA|EbEnKxBdH…","""BBBBBBBBBBBBBBBBBBBBBBBBBBBBBB…","""NYC_DOT_LIC""","""Queens""","""CIP N TNB - Whitestone Expwy S…"
106,39.77,159.0,2017-06-02 23:41:59,4616323,"""40.77158,-73.994441 40.7713004…","""kezwFf`sbMv@TxAVnDZe@Gz@J~@Xf@…","""BBBBBBBBBBBBBBBBB""","""NYC_DOT_LIC""","""Manhattan""","""12th Ave S 57th St - 45th St"""
184,65.239998,39.0,2017-06-03 04:46:59,4616253,"""40.8347204,-73.86593 40.83357,…","""_pfxF`}yaMdFsWfDmPpH}^lEgTBBBB…","""BBBBB""","""NYC_DOT_LIC""","""Bronx""","""CBE E TAYLOR AVENUE - CASTLE H…"
3,14.91,422.0,2017-06-02 23:41:59,4616324,"""40.76375,-73.999191 40.763521,…","""mtxwF|}sbMl@^~GpK|LrIbLlH??lK~…","""BBBBBBBBBBBBBBB""","""NYC_DOT_LIC""","""Manhattan""","""12th ave @ 45th - 11 ave ganse…"


In [3]:
df_links = (
    df
    .sort('DATA_AS_OF', descending=True)
    .unique(subset=['LINK_ID'], keep='first')
).collect()

len(df_links)

153

In [4]:
# Используем регулярное выражение для извлечения пар координат
df_links = df_links.with_columns([
    pl.col('LINK_POINTS')
    .str.extract_all(r"(-?\d+\.\d+),(-?\d+\.\d+)")
    .list.eval(pl.element().str.split(","))
    .cast(pl.List(pl.List(pl.Float64)))
    .alias('coord_pairs')
])

df_links.select(pl.col('coord_pairs')).head()

coord_pairs
list[list[f64]]
"[[40.85526, -73.918591], [40.85266, -73.92085], … [40.844511, -73.92]]"
"[[40.827161, -73.84993], [40.82771, -73.84671], … [40.834301, -73.82571]]"
"[[40.84064, -73.83831], [40.83881, -73.83853], … [40.82495, -73.836211]]"
"[[40.83472, -73.86593], [40.83357, -73.86199], … [40.830111, -73.850731]]"
"[[40.77223, -73.919941], [40.77367, -73.92198], … [40.78972, -73.926]]"


In [5]:
df_links[1]['coord_pairs'].item()

"[40.827161, -73.84993]"
"[40.82771, -73.84671]"
"[40.82841, -73.843471]"
"[40.82869, -73.84133]"
"[40.82879, -73.8386]"
…
"[40.8305, -73.83239]"
"[40.83211, -73.82983]"
"[40.83305, -73.82826]"
"[40.83366, -73.82693]"
"[40.834301, -73.82571]"


In [None]:
df_links['coord_pairs']

coord_pairs
list[list[f64]]
"[[40.85526, -73.918591], [40.85266, -73.92085], … [40.844511, -73.92]]"
"[[40.827161, -73.84993], [40.82771, -73.84671], … [40.834301, -73.82571]]"
"[[40.84064, -73.83831], [40.83881, -73.83853], … [40.82495, -73.836211]]"
"[[40.83472, -73.86593], [40.83357, -73.86199], … [40.830111, -73.850731]]"
"[[40.77223, -73.919941], [40.77367, -73.92198], … [40.78972, -73.926]]"
…
"[[40.683644, -73.72667], [40.68314, -73.72692], … [40.66749, -73.739]]"
"[[40.69153, -73.99911], [40.692261, -73.99937], … [40.70079, -73.994881]]"
"[[40.762601, -73.839671], [40.761391, -73.83898], … [40.749841, -73.835]]"
"[[40.71772, -73.94831], [40.71862, -73.946851], … [40.72612, -73.93]]"


In [27]:
import folium
import math

m = folium.Map(location=[40.763521, -73.99935])


def filter_outliers(coords, threshold=0.01):
    if len(coords) < 2:
        return coords.copy()
    
    filtered = []
    for i in range(len(coords)-1):
        dist = math.dist(coords[i], coords[i+1])
        if dist < threshold:
            filtered.append(coords[i])
    
    filtered.append(coords[-1])  # Всегда добавляем последнюю точку
    return filtered

i = 0
for line_coordinates in df_links['coord_pairs']:
    line_coordinates = filter_outliers(line_coordinates)
    line = folium.PolyLine(locations=line_coordinates, color='blue', weight=5, opacity=0.8)
    line.add_to(m)
    # if i > 145:
    #     break
    i += 1
m

In [25]:
df_links[146]['coord_pairs'].to_list()

[[[40.6756, -74.841],
  [40.67643, -74.001241],
  [40.6772405, -74.001741],
  [40.6783605, -74.002531],
  [40.67925, -74.00331],
  [40.6795705, -74.00341],
  [40.68001, -74.00332],
  [40.6805806, -74.003031],
  [40.6814704, -74.0026],
  [40.68522, -74.76],
  [40.6872004, -73.999811],
  [40.6892206, -73.99885],
  [40.68972, -73.99]]]