# Geospatial plotting with Kepler.gl

#### 1. Import libraries & data

In [1]:
# Import libraries

import pandas as pd
import os
from keplergl import KeplerGl


In [2]:
# Quick file check
file_size = os.path.getsize(r"C:\Users\magia\OneDrive\Desktop\NY_Citi_Bike\2.Data\Prepared Data\nyc_2022_essential_data.csv") # Streamlit-Access: Check Google Drive Link for file
print(f"File size: {file_size / 1024**3:.2f} GB")

File size: 4.30 GB


In [3]:
# Load only the essential columns for Kepler OD map
essential_cols = ['start_station_name', 'end_station_name', 'start_lat', 'start_lng', 'end_lat', 'end_lng']

df = pd.read_csv(r"C:\Users\magia\OneDrive\Desktop\NY_Citi_Bike\2.Data\Prepared Data\nyc_2022_essential_data.csv",
                 usecols=essential_cols,
                 low_memory=False)

#### 2. Data Preparation – aggregate trips into start–end flows and station usage

In [4]:
#Get station counts and find stations for 90% coverage
start_counts = df['start_station_name'].value_counts()
end_counts = df['end_station_name'].value_counts()
total_trips = len(df)

# Find how many stations needed for 90% coverage
cumsum_start = start_counts.cumsum() / total_trips
cumsum_end = end_counts.cumsum() / total_trips

stations_for_90_start = (cumsum_start <= 0.9).sum()
stations_for_90_end = (cumsum_end <= 0.9).sum()

In [5]:
# Get the actual station names for top 90% coverage
top_90_start = start_counts.head(stations_for_90_start).index.tolist()
top_90_end = end_counts.head(stations_for_90_end).index.tolist()

print(f"Using {stations_for_90_start} start stations and {stations_for_90_end} end stations for 90% coverage")

Using 871 start stations and 876 end stations for 90% coverage


In [6]:
# Filter dataframe to only include these stations
df_filtered = df[
    (df['start_station_name'].isin(top_90_start)) & 
    (df['end_station_name'].isin(top_90_end))
]

print(f"Filtered from {len(df):,} to {len(df_filtered):,} trips ({len(df_filtered)/len(df)*100:.1f}%)")


Filtered from 29,838,166 to 25,469,099 trips (85.4%)


In [7]:
# Process the groupby in chunks to avoid memory issues
def create_od_chunked(df, chunk_size=5_000_000):
    od_list = []
    for i in range(0, len(df), chunk_size):
        chunk = df.iloc[i:i+chunk_size]
        od_chunk = (
            chunk
            .groupby(['start_station_name','end_station_name'])
            .size()                                  # counts rows
        )
        od_list.append(od_chunk)
    return pd.concat(od_list).groupby(level=[0,1]).sum()

In [8]:
# Create Origin-Destination(OD) matrix with filtered data
print("Creating OD matrix in chunks...")
od_series = create_od_chunked(df_filtered)

# Convert to DataFrame and sort
df_od = od_series.reset_index(name="trip_count").sort_values("trip_count", ascending=False)

print(f"OD matrix created with {len(df_od):,} origin-destination pairs")
df_od.head(10)

Creating OD matrix in chunks...
OD matrix created with 543,555 origin-destination pairs


Unnamed: 0,start_station_name,end_station_name,trip_count
132390,Central Park S & 6 Ave,Central Park S & 6 Ave,12041
53800,7 Ave & Central Park South,7 Ave & Central Park South,8541
408924,Roosevelt Island Tramway,Roosevelt Island Tramway,8213
286778,Grand Army Plaza & Central Park S,Grand Army Plaza & Central Park S,7287
418206,Soissons Landing,Soissons Landing,7275
466976,W 21 St & 6 Ave,9 Ave & W 22 St,6345
43490,5 Ave & E 72 St,5 Ave & E 72 St,6037
5043,1 Ave & E 62 St,1 Ave & E 68 St,5826
543554,Yankee Ferry Terminal,Yankee Ferry Terminal,5759
114959,Broadway & W 58 St,Broadway & W 58 St,5509


In [9]:
# Get unique coordinates per station
start_coords = df_filtered[['start_station_name', 'start_lat', 'start_lng']].drop_duplicates('start_station_name')
end_coords = df_filtered[['end_station_name', 'end_lat', 'end_lng']].drop_duplicates('end_station_name')

# Add coordinates to df_od
df_od = df_od.merge(start_coords, on='start_station_name', how='left')
df_od = df_od.merge(end_coords, on='end_station_name', how='left')

In [10]:
# rename for Kepler flows
df_flows = df_od.rename(columns={'trip_count': 'trips'})

In [11]:
# Create stations data directly from df_filtered
stations = (
    df_filtered.groupby(['start_station_name', 'start_lat', 'start_lng'])
    .size()
    .reset_index(name='starts')
    .rename(columns={'start_station_name': 'station', 'start_lat': 'lat', 'start_lng': 'lng'})
)

ends_data = (
    df_filtered.groupby(['end_station_name', 'end_lat', 'end_lng'])
    .size()
    .reset_index(name='ends') 
    .rename(columns={'end_station_name': 'station', 'end_lat': 'lat', 'end_lng': 'lng'})
)

stations = stations.merge(ends_data, on=['station', 'lat', 'lng'], how='outer').fillna(0)
stations['total_trips'] = stations['starts'] + stations['ends']

In [12]:
stations[['starts','ends']] = stations[['starts','ends']].astype('int64')
stations['total_trips'] = stations['total_trips'].astype('int64')

### 3. Kepler Map

In [13]:
# See the distribution of flows
print(df_od['trip_count'].describe())

count    543555.000000
mean         46.856526
std         129.937788
min           1.000000
25%           2.000000
50%           8.000000
75%          36.000000
max       12041.000000
Name: trip_count, dtype: float64


- On average, OD flows have 47 trips, with a max over 12,000 and min of 1. This explains the high standard deviation of 130. If we put every flow on the map, the result would not be intelligible.
- Over half of flows consist of 8 trips or less, so we can confidently filter these out and still capture the important OD patterns.

In [14]:
print(f"Flows with 500+ trips: {(df_od['trip_count'] >= 500).sum()}")
print(f"Flows with 1000+ trips: {(df_od['trip_count'] >= 1000).sum()}")
print(f"Flows with 1500+ trips: {(df_od['trip_count'] >= 1500).sum()}")
print(f"Flows with 2000+ trips: {(df_od['trip_count'] >= 2000).sum()}")

Flows with 500+ trips: 6776
Flows with 1000+ trips: 1466
Flows with 1500+ trips: 499
Flows with 2000+ trips: 226


- After testing various filters, we set the threshold to include only ***flows with 1500+ trips***. This filters our data down to ***499 flows***, which creates a more readable visualization.

In [15]:
# center map on data coordinates
center_lat = stations['lat'].mean()
center_lon = stations['lng'].mean()

# color scheme (flare-like palette)
flare_like = {
    "name": "flare_like",
    "type": "sequential",
    "category": "Uber",
    "colors": ["#2D1E3E", "#6B1F73", "#A22C7E", "#D6456C", "#F77C48", "#FDBD3C"]
}

# Kepler config
cfg = {
    "version": "v1",
    "config": {
        "visState": {
            "filters": [
                {
                    "dataId": "Flows",
                    "id": "trips_filter",
                    "name": ["trips"],
                    "type": "range",
                    "value": [1500, int(df_flows["trips"].max())],
                    "enlarged": True
                }
            ],
            "layers": [
                {
                    "id": "stations-point",
                    "type": "point",
                    "config": {
                        "dataId": "Stations",
                        "label": "Stations",
                        "columns": {"lat": "lat", "lng": "lng"},
                        "isVisible": True,
                        "visConfig": {
                            "radius": 4,
                            "colorRange": flare_like
                        }
                    },
                    "visualChannels": {
                        "colorField": {"name": "total_trips", "type": "integer"},
                        "colorScale": "quantile",
                        "sizeField": {"name": "total_trips", "type": "integer"},
                        "sizeScale": "sqrt"
                    }
                },
                {
                    "id": "flows-arc",
                    "type": "arc",
                    "config": {
                        "dataId": "Flows",
                        "label": "OD Flows",
                        "columns": {
                            "lat0": "start_lat", "lng0": "start_lng",
                            "lat1": "end_lat", "lng1": "end_lng"
                        },
                        "isVisible": True,
                        "visConfig": {
                            "thickness": 4,
                            "opacity": 0.7,
                            "colorRange": flare_like
                        }
                    },
                    "visualChannels": {
                        "sizeField": {"name": "trips", "type": "integer"},
                        "sizeScale": "sqrt",
                        "colorField": {"name": "trips", "type": "integer"},
                        "colorScale": "quantile"
                    }
                }
            ]
        },
        "mapState": {
            "latitude": float(center_lat),
            "longitude": float(center_lon),
            "zoom": 12
        }
    }
}

In [16]:
import json

m = KeplerGl(height=650, config=cfg)
m.add_data(stations, "Stations")
m.add_data(df_flows, "Flows")
m.save_to_html(file_name="new_york_citi_bike_map.html", config=cfg, read_only=False)

# Just save the complete map
full_map_data = {
    'version': 'v1',
    'config': m.config,
    'data': m.data
}

with open("kepler_map_with_data.json", "w") as f:
    json.dump(full_map_data, f)

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
Map saved to new_york_citi_bike_map.html!


***I used a warm sequential palette (dark→light, close to the flare palette used in the previous task) for both stations and arcs so that higher volumes stand out intuitively. Station size & color encode total trips, while arc thickness & color encode trip counts between station pairs. I added a range filter on trips (default ≥ 1500) so the initial view isn’t cluttered. The map centers over the mean lat/lon of all stations***.

In [17]:
# Filter df_flows to only flows with 1500+ trips
df_flows_filtered = df_flows[df_flows['trips'] >= 1500].copy()

# Get unique stations that appear in the filtered flows
stations_in_flows = set(df_flows_filtered['start_station_name'].unique()) | set(df_flows_filtered['end_station_name'].unique())

# Filter stations to only those that appear in the filtered flows
stations_filtered = stations[stations['station'].isin(stations_in_flows)].copy()

# Save the filtered versions
stations_filtered.to_csv(
    r"C:\Users\magia\OneDrive\Desktop\NY_Citi_Bike\2.Data\Prepared Data\citibike_2022_stations_high_flow.csv",
    index=False
)

df_flows_filtered.to_csv(
    r"C:\Users\magia\OneDrive\Desktop\NY_Citi_Bike\2.Data\Prepared Data\citibike_2022_flows_1500plus.csv",
    index=False
)

print(f"Filtered flows: {len(df_flows_filtered):,} rows (from {len(df_flows):,})")
print(f"Filtered stations: {len(stations_filtered):,} rows (from {len(stations):,})")
print(f"Unique stations in flows: {len(stations_in_flows)}")

Filtered flows: 499 rows (from 543,555)
Filtered stations: 845,119 rows (from 1,873,598)
Unique stations in flows: 277
