In [9]:
import json
import pandas as pd

# Load the raw routes JSON.
with open('data/public_transport_routes.json', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the nested line -> direction -> stop mapping under "result" into one
# record per stop. The stop's own fields are unpacked last, after the three
# route keys.
rows = [
    {'line': line, 'direction': direction, 'stop_number': stop_number, **stop_info}
    for line, directions in data['result'].items()
    for direction, stops in directions.items()
    for stop_number, stop_info in stops.items()
]

df_bus_routes = pd.DataFrame(rows)

# Cast the numeric fields; anything that cannot be parsed becomes NaN
# (errors='coerce').
numeric_fields = ['stop_number', 'odleglosc']
df_bus_routes[numeric_fields] = df_bus_routes[numeric_fields].apply(
    pd.to_numeric, errors='coerce'
)

# With stop_number numeric the stops sort in numeric order; afterwards the
# source field names are renamed to the English column names.
df_bus_routes = (
    df_bus_routes
    .sort_values(by=['line', 'direction', 'stop_number'])
    .rename(columns={
        'nr_zespolu': 'bus_stop_group',
        'odleglosc': 'distance',
        'nr_przystanku': 'bus_stop_platform'
    })
)

# Persist the flat table without the DataFrame index.
df_bus_routes.to_parquet('data/public_transport_flat.parquet', index=False)



In [10]:
# Preview the first rows of the flattened routes table.
df_bus_routes.head()

Unnamed: 0,line,direction,stop_number,distance,ulica_id,bus_stop_group,typ,bus_stop_platform
47421,1,TD-1AN03,1,0,1501,R-01,6,0
47423,1,TD-1AN03,2,182,123,5004,5,3
47422,1,TD-1AN03,3,717,1703,5003,1,8
47425,1,TD-1AN03,4,938,1703,5084,1,2
47424,1,TD-1AN03,5,1213,1703,5085,1,2


In [47]:
import csv
import pandas as pd

# Load the timetable CSV into a list of per-row dicts.
with open('data/timetable.csv', encoding='utf8') as f:
    timetable = list(csv.DictReader(f))

# Columns identifying one series of departures:
# (route, direction, stop group, platform).
group_keys = ['route', 'direction', 'bus_stop_group_int', 'bus_stop_platform_int']

df_timetable = pd.DataFrame(timetable).rename(columns={'next_bus_stop': 'direction'})

# Typed helper columns: integer stop group / platform and a parsed departure time.
df_timetable = df_timetable.assign(
    bus_stop_group_int=df_timetable['bus_stop_group'].astype(int),
    bus_stop_platform_int=df_timetable['bus_stop_platform'].astype(int),
    time_dt=pd.to_datetime(df_timetable['time'], format='%H:%M:%S'),
)

# Step 1: order departures chronologically inside every group.
df_timetable = df_timetable.sort_values(by=group_keys + ['time_dt'])

# Step 2: earliest departure of each group, used below to order the groups.
group_min_time = (
    df_timetable.groupby(group_keys)['time_dt']
    .min()
    .rename('group_min_time')
    .reset_index()
)

# Step 3: attach each group's earliest departure to all of its rows.
df_timetable = df_timetable.merge(group_min_time, on=group_keys)

# Step 4: final order - route and direction, then groups by their earliest
# departure, then departures chronologically inside each group.
final_order = [
    'route', 'direction', 'group_min_time',
    'bus_stop_group_int', 'bus_stop_platform_int', 'time_dt',
]
df_timetable = df_timetable.sort_values(by=final_order)


In [26]:
import json
import pandas as pd

def parse_coords_item(item):
    """Flatten one item's "values" list of key/value entries into a plain dict.

    Args:
        item: Mapping that may hold a "values" list; each entry is a mapping
            read through its "key" and "value" fields.

    Returns:
        dict mapping each entry's "key" to its "value" (None when the entry has
        no "value"). Entries whose "key" is missing or None are skipped; when a
        key repeats, the last entry wins. An item without "values" gives {}.
    """
    return {
        entry.get("key"): entry.get("value")
        for entry in item.get("values", [])
        if entry.get("key") is not None
    }

# Read the stop-coordinates file.
with open('data/coordinates-of-stops.json', encoding='utf8') as f:
    data = json.load(f)

# Only the payload under the "result" key is used (empty list when absent).
raw_items = data.get("result", [])

# Turn every item's key/value list into a flat dict.
bus_stops = list(map(parse_coords_item, raw_items))

# Build the DataFrame and rename the source fields to English column names.
df_bus_stops = pd.DataFrame(bus_stops).rename(columns={
    'zespol': 'bus_stop_group',
    'slupek': 'bus_stop_platform',
    'nazwa_zespolu': 'bus_stop_name',
    'id_ulicy': 'street_id',
    'szer_geo': 'latitude',
    'dlug_geo': 'longitude',
    'kierunek': 'direction',
    'obowiazuje_od': 'valid_from'
})



In [51]:
# Debug preview: for each frame print its column list, then its first two rows.
frames_to_preview = [
    ('df_bus_routes.columns.tolist()', df_bus_routes),
    ('df_bus_stops.columns.tolist()', df_bus_stops),
    ('df_timetable.columns.tolist()', df_timetable),
]

for caption, frame in frames_to_preview:
    print(caption, frame.columns.tolist())
    print(frame.head(2).to_string())


df_bus_routes.columns.tolist() ['line', 'direction', 'stop_number', 'distance', 'ulica_id', 'bus_stop_group', 'typ', 'bus_stop_platform', 'next_ulica_id', 'next_bus_stop_group', 'next_bus_stop_platform', 'destination_ulica_id', 'destination_bus_stop_group']
  line direction  stop_number  distance ulica_id bus_stop_group typ bus_stop_platform next_ulica_id next_bus_stop_group next_bus_stop_platform destination_ulica_id destination_bus_stop_group
0    1  TD-1AN03            1         0     1501           R-01   6                00          0123                5004                     03                 0104                       1087
1    1  TD-1AN03            2       182     0123           5004   5                03          1703                5003                     08                 0104                       1087
df_bus_stops.columns.tolist() ['bus_stop_group', 'bus_stop_platform', 'bus_stop_name', 'street_id', 'latitude', 'longitude', 'direction', 'valid_from']
  bus_stop_group 

In [49]:

# For every stop of a route variant (same line and direction), derive the stop
# that follows it and the variant's final stop.
#
# The cell is safe to re-run: the next_* columns are simply overwritten, and any
# destination_* columns left in df_bus_routes by a previous run are dropped
# before the merge. Without that drop a second run would create `_x`/`_y`
# suffixed duplicates and the column selection in Step 5 would raise KeyError.

# Step 1: within each (line, direction) group, shift ulica_id, bus_stop_group and
# bus_stop_platform by -1 so each row carries the values of the row after it.
# The shift follows the current row order of df_bus_routes.
df_bus_routes['next_ulica_id'] = df_bus_routes.groupby(['line', 'direction'])['ulica_id'].shift(-1)
df_bus_routes['next_bus_stop_group'] = df_bus_routes.groupby(['line', 'direction'])['bus_stop_group'].shift(-1)
df_bus_routes['next_bus_stop_platform'] = df_bus_routes.groupby(['line', 'direction'])['bus_stop_platform'].shift(-1)


# Step 2: Compute destination info (last stop in the group).
# Note: GroupBy.last() returns the last non-null value of each column.
destination_info = (
    df_bus_routes
    .sort_values('stop_number')  # ensure ordering
    .groupby(['line', 'direction'], as_index=False)
    .last()[['line', 'direction', 'ulica_id', 'bus_stop_group']]
    .rename(columns={
        'ulica_id': 'destination_ulica_id',
        'bus_stop_group': 'destination_bus_stop_group'
    })
)

# Step 3: Merge destination info back into the full table. Stale destination
# columns from an earlier run are removed first (errors='ignore' makes this a
# no-op on the first run).
df_bus_routes = (
    df_bus_routes
    .drop(columns=['destination_ulica_id', 'destination_bus_stop_group'], errors='ignore')
    .merge(destination_info, on=['line', 'direction'], how='left')
)

# Step 4: Drop rows where the next stop data is missing (typically the last stop)
df_bus_routes_extended = df_bus_routes.dropna(subset=['next_ulica_id', 'next_bus_stop_group', 'next_bus_stop_platform'])

# Step 5: Reorder/select columns
df_bus_routes_extended = df_bus_routes_extended[[
    'line', 'direction',
    'ulica_id', 'bus_stop_group', 'bus_stop_platform',
    'next_ulica_id', 'next_bus_stop_group', 'next_bus_stop_platform',
    'destination_ulica_id', 'destination_bus_stop_group'
]]



In [48]:
# For each route, the sorted list of distinct direction values in the timetable.
directions_by_route = df_timetable.groupby('route')['direction']
unique_directions_per_route = directions_by_route.apply(lambda values: sorted(set(values)))


In [None]:
def parse_transport_dictionary(item):
    """Collapse an item's "values" list of key/value entries into one dict.

    Args:
        item: Mapping that may hold a "values" list; each entry is a mapping
            read through its "key" and "value" fields.

    Returns:
        dict mapping every non-None "key" to its "value" (None when the entry
        has no "value"). A repeated key keeps the value of its last entry; an
        item without "values" gives {}.
    """
    parsed = {}
    for pair in item.get("values", []):
        name = pair.get("key")
        if name is None:
            # Entries without a usable key are ignored.
            continue
        parsed[name] = pair.get("value")
    return parsed

# Read the transport dictionary file.
with open('data/public_transport_dictionary.json', encoding='utf8') as f:
    data = json.load(f)

# Only the payload under the "result" key is used (empty list when absent).
raw_items = data.get("result", [])

# Flatten each item with the parser defined in this cell, so the cell does not
# depend on parse_coords_item having been defined by a different cell.
bus_stops = [parse_transport_dictionary(elem) for elem in raw_items]

# Build the DataFrame.
# NOTE(review): this rebinds df_bus_stops, which the coordinates cell also
# assigns - a dedicated name for the dictionary data may be intended; confirm.
df_bus_stops = pd.DataFrame(bus_stops)