<a href="https://colab.research.google.com/github/vinechai/nextbike/blob/main/nextbike_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pathlib import Path

# Create the output directory with pathlib instead of a shell escape so the
# cell does not depend on a POSIX `mkdir` being available to the kernel.
# parents=True / exist_ok=True mirror the behaviour of `mkdir -p`.
Path("data").mkdir(parents=True, exist_ok=True)


In [4]:
import requests
import pandas as pd
from datetime import datetime, timezone
from pathlib import Path

# ======================
# Configuration
# ======================

# Endpoint the scraper downloads on every run.
LIVE_DATA_URL = "https://maps.nextbike.net/maps/nextbike-live.flatjson"

# `city_id` value used to pick out Prague records (confirmed from JSON).
PRAGUE_CITY_ID = 661

# Folder that receives the Parquet snapshots; created if it is missing.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)


# ======================
# Helper Functions
# ======================

def append_parquet(df, path):
    """
    Append a DataFrame to a Parquet file.
    If the file does not exist yet, create it.

    The existing file is read in full, the new rows are concatenated onto it
    (the combined frame gets a fresh 0..n-1 index) and the result is written
    back. The write goes to a temporary sibling file that is then moved over
    ``path``, so a failed or interrupted write does not destroy the rows
    collected so far.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to add.
    path : str or pathlib.Path
        Location of the Parquet file.
    """
    path = Path(path)  # accept plain strings as well as Path objects
    if path.exists():
        old = pd.read_parquet(path)
        df = pd.concat([old, df], ignore_index=True)

    tmp_path = path.with_name(path.name + ".tmp")
    try:
        df.to_parquet(tmp_path, index=False)
        # Swap the finished file into place in a single rename.
        tmp_path.replace(path)
    finally:
        # Still present only if the write or the rename failed part-way.
        if tmp_path.exists():
            tmp_path.unlink()


# ======================
# Main Scraper Logic
# ======================

def scrape_prague_once():
    """
    Take one snapshot of the live Nextbike feed, filtered to Prague.

    Downloads ``LIVE_DATA_URL``, keeps the ``places`` and ``bikes`` entries
    whose ``city_id`` equals ``PRAGUE_CITY_ID``, stamps both tables with the
    UTC scrape time, overwrites the ``*_latest.parquet`` files in
    ``DATA_DIR`` and appends both tables to the ``*_history.parquet`` files.
    A short summary is printed at the end.

    Raises
    ------
    requests.HTTPError
        If the feed answers with an error status.
    requests.Timeout
        If the server stops responding for longer than the timeout.
    """
    # 1. Fetch live Nextbike JSON.
    # Without a timeout `requests` waits indefinitely on a stalled
    # connection, which would hang the cell.
    resp = requests.get(LIVE_DATA_URL, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    # Create timestamp for this scrape (timezone-aware, UTC)
    scrape_time = datetime.now(timezone.utc)

    # -------------------------------------------------
    # 2. Extract STATIONS for Prague
    # -------------------------------------------------
    stations = [place for place in data["places"] if place.get("city_id") == PRAGUE_CITY_ID]

    stations_df = pd.json_normalize(stations)
    stations_df["scrape_time"] = scrape_time

    # Save latest + append to history
    stations_df.to_parquet(DATA_DIR / "stations_latest.parquet", index=False)
    append_parquet(stations_df, DATA_DIR / "stations_history.parquet")

    # -------------------------------------------------
    # 3. Extract BIKES for Prague
    # -------------------------------------------------
    # NOTE(review): the recorded run of this cell reported "Bikes: 0", so the
    # top-level "bikes" entries may not carry Prague's city_id — confirm
    # against the feed before relying on this table.
    bikes = [bike for bike in data["bikes"] if bike.get("city_id") == PRAGUE_CITY_ID]

    bikes_df = pd.json_normalize(bikes)
    bikes_df["scrape_time"] = scrape_time

    # Save latest + append to history
    bikes_df.to_parquet(DATA_DIR / "bikes_latest.parquet", index=False)
    append_parquet(bikes_df, DATA_DIR / "bikes_history.parquet")

    # -------------------------------------------------
    # 4. Print for visibility
    # -------------------------------------------------
    print(f"[OK] Scraped Prague at {scrape_time}")
    print(f"     Stations: {len(stations)}")
    print(f"     Bikes: {len(bikes)}")


# Run once when executed directly.
# Inside a notebook kernel `__name__` is "__main__", so executing this cell
# performs a scrape right away.
if __name__ == "__main__":
    scrape_prague_once()


[OK] Scraped Prague at 2025-11-15 23:37:11.959679+00:00
     Stations: 1109
     Bikes: 0


In [14]:
import requests
import pandas as pd
from datetime import datetime, timezone
from pathlib import Path

# Endpoint the scraper downloads on every run.
LIVE_DATA_URL = "https://maps.nextbike.net/maps/nextbike-live.flatjson"

# `city_id` value used to pick out Prague records.
PRAGUE_CITY_ID = 661

# Folder that receives the Parquet snapshots; created if it is missing.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)


def append_parquet(df, path):
    """
    Append a DataFrame to a Parquet file, creating the file if it is missing.

    The existing file is read in full, the new rows are concatenated onto it
    (the combined frame gets a fresh 0..n-1 index) and the result is written
    back. The write goes to a temporary sibling file that is then moved over
    ``path``, so a failed or interrupted write does not destroy the rows
    collected so far.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to add.
    path : str or pathlib.Path
        Location of the Parquet file.
    """
    path = Path(path)  # accept plain strings as well as Path objects
    if path.exists():
        old = pd.read_parquet(path)
        df = pd.concat([old, df], ignore_index=True)

    tmp_path = path.with_name(path.name + ".tmp")
    try:
        df.to_parquet(tmp_path, index=False)
        # Swap the finished file into place in a single rename.
        tmp_path.replace(path)
    finally:
        # Still present only if the write or the rename failed part-way.
        if tmp_path.exists():
            tmp_path.unlink()


def _station_bike_rows(stations, scrape_time):
    """Return one record per bike id listed in each station's ``bike_numbers``."""
    rows = []
    for st in stations:
        bike_numbers = st.get("bike_numbers")
        if not bike_numbers:
            continue

        # convert "485396,481489" → ["485396", "481489"]; blank fragments
        # (e.g. from a trailing comma) are dropped so they never become
        # rows with an empty bike id.
        bike_ids = [b.strip() for b in bike_numbers.split(",") if b.strip()]

        for bike_id in bike_ids:
            rows.append({
                "bike_id": bike_id,
                "station_uid": st["uid"],
                "station_name": st["name"],
                "lat": st["lat"],
                "lng": st["lng"],
                "scrape_time": scrape_time
            })
    return rows


def scrape_prague_once():
    """
    Take one snapshot of Prague stations and the bikes parked at them.

    Downloads ``LIVE_DATA_URL`` and keeps the ``places`` entries whose
    ``city_id`` equals ``PRAGUE_CITY_ID``. Bikes are derived from each
    station's comma-separated ``bike_numbers`` field, one row per bike with
    the station's uid, name and coordinates. Both tables are stamped with the
    UTC scrape time, written to the ``*_latest.parquet`` files in
    ``DATA_DIR`` and appended to the ``*_history.parquet`` files. A short
    summary is printed at the end.

    Raises
    ------
    requests.HTTPError
        If the feed answers with an error status.
    requests.Timeout
        If the server stops responding for longer than the timeout.
    """
    # Without a timeout `requests` waits indefinitely on a stalled
    # connection, which would hang the cell.
    resp = requests.get(LIVE_DATA_URL, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    scrape_time = datetime.now(timezone.utc)

    # -------------------------------------------
    # 1. STATIONS (FROM places)
    # -------------------------------------------
    stations = [
        place for place in data["places"]
        if place.get("city_id") == PRAGUE_CITY_ID
    ]

    stations_df = pd.json_normalize(stations)
    stations_df["scrape_time"] = scrape_time

    stations_df.to_parquet(DATA_DIR / "stations_latest.parquet", index=False)
    # Keep the station history growing as well; otherwise
    # stations_history.parquet goes stale while bikes_history.parquet
    # keeps being extended.
    append_parquet(stations_df, DATA_DIR / "stations_history.parquet")

    # -------------------------------------------
    # 2. BIKES (EXTRACTED FROM stations)
    # -------------------------------------------
    # Naming the columns keeps the table layout identical even when no
    # station reports any bike.
    bikes_df = pd.DataFrame(
        _station_bike_rows(stations, scrape_time),
        columns=["bike_id", "station_uid", "station_name", "lat", "lng", "scrape_time"],
    )

    bikes_df.to_parquet(DATA_DIR / "bikes_latest.parquet", index=False)
    append_parquet(bikes_df, DATA_DIR / "bikes_history.parquet")

    print(f"[OK] Scraped Prague at {scrape_time}")
    print(f"     Stations: {len(stations)}")
    print(f"     Bikes: {len(bikes_df)}")


# Inside a notebook kernel `__name__` is "__main__", so executing this cell
# performs a scrape right away.
if __name__ == "__main__":
    scrape_prague_once()


[OK] Scraped Prague at 2025-11-16 00:09:50.389425+00:00
     Stations: 1108
     Bikes: 1237


In [16]:
import pandas as pd

# Load the bikes snapshot from data/bikes_latest.parquet and preview the
# first rows.
df = pd.read_parquet("data/bikes_latest.parquet")
df.head()


Unnamed: 0,bike_id,station_uid,station_name,lat,lng,scrape_time
0,485212,27581946,P10-Čechovo náměstí,50.06821,14.45876,2025-11-16 00:09:50.389425+00:00
1,483482,27581946,P10-Čechovo náměstí,50.06821,14.45876,2025-11-16 00:09:50.389425+00:00
2,485396,27581959,P10-Moskevská 2 - AIRBANK,50.06916,14.45526,2025-11-16 00:09:50.389425+00:00
3,481489,27581959,P10-Moskevská 2 - AIRBANK,50.06916,14.45526,2025-11-16 00:09:50.389425+00:00
4,481264,27581959,P10-Moskevská 2 - AIRBANK,50.06916,14.45526,2025-11-16 00:09:50.389425+00:00


In [8]:
import pandas as pd

# Preview the first rows of the stations snapshot in
# data/stations_latest.parquet.
pd.read_parquet("data/stations_latest.parquet").head()


Unnamed: 0,uid,lat,lng,bike,name,address,spot,number,booked_bikes,bikes,...,special_racks,free_special_racks,maintenance,terminal_type,bike_numbers,bike_types,place_type,rack_locks,city_id,scrape_time
0,27581946,50.06821,14.45876,False,P10-Čechovo náměstí,,True,46007,0,2,...,0,0,False,sign,485212483482,"{""200"":2}",0,False,661,2025-11-15 23:37:11.959679+00:00
1,27581959,50.06916,14.45526,False,P10-Moskevská 2 - AIRBANK,,True,46009,0,4,...,0,0,False,sign,485396481489481264487959,"{""200"":3,""254"":1}",0,False,661,2025-11-15 23:37:11.959679+00:00
2,27582060,50.0691,14.45338,False,P10-Vršovické náměstí - REST. WAIKIKI,,True,46010,0,1,...,0,0,False,sign,485124,"{""200"":1}",0,False,661,2025-11-15 23:37:11.959679+00:00
3,27582066,50.07011,14.4514,False,P10 - Moskevská U TRAMTARIE,,True,46011,0,0,...,0,0,False,sign,,{},0,False,661,2025-11-15 23:37:11.959679+00:00
4,27582078,50.07118,14.4567,False,P10-Tesco Finská,,True,46012,0,0,...,0,0,False,sign,,{},0,False,661,2025-11-15 23:37:11.959679+00:00


In [13]:
# Read the accumulated station history once and print its (rows, columns)
# shape.
stations_history = pd.read_parquet("data/stations_history.parquet")
print(stations_history.shape)

(1109, 24)


In [12]:
# Preview the first rows of the bikes snapshot in data/bikes_latest.parquet.
pd.read_parquet("data/bikes_latest.parquet").head()


Unnamed: 0,scrape_time
