# Predicting Citibike Dock Availability

In [None]:
# Step 1: Load the Data
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# Read the trip-data CSV into a DataFrame; later cells refer to it as `data`.
trips_csv = '/mnt/data/202301-citibike-tripdata_1.csv'
data = pd.read_csv(trips_csv)

In [None]:
# Step 2: Explore the Data
# Preview the first few rows of the trip data.
print(data.head())
# DataFrame.info() writes its column/dtype summary to stdout itself and
# returns None, so it is called directly; wrapping it in print() would emit
# an extra "None" line after the summary.
data.info()

In [None]:
# Step 3: Preprocess the Data
# Parse the trip start/end timestamps so the .dt accessor can be used below.
data['started_at'] = pd.to_datetime(data['started_at'])
data['ended_at'] = pd.to_datetime(data['ended_at'])

# Derive hour-of-day and weekday-name features from the trip start time.
data['start_hour'] = data['started_at'].dt.hour
data['day_of_week'] = data['started_at'].dt.day_name()

# Build a lookup table: station name -> (start_lat, start_lng).
# Rows missing a name or a coordinate are dropped, and only the first
# coordinate pair seen for each station name is kept, so the resulting index
# has exactly one, fully populated entry per station. Deduplicating on all
# three columns (as a plain drop_duplicates() would) leaves one row per
# distinct (name, lat, lng) triple, which does not guarantee unique names.
station_coords = (
    data[['start_station_name', 'start_lat', 'start_lng']]
    .dropna()
    .drop_duplicates(subset='start_station_name')
    .set_index('start_station_name')
)

In [None]:
# Step 4: Function to Find Stations Near a Destination
def get_nearby_docks(destination_lat, destination_lng, radius=0.25):
    """Return the names of stations within ``radius`` miles of a point.

    Every row of the module-level ``station_coords`` table is compared
    against the destination using geopy's geodesic distance.

    Args:
        destination_lat: Latitude of the destination point.
        destination_lng: Longitude of the destination point.
        radius: Maximum distance in miles for a station to count as nearby.

    Returns:
        A list of station names (index labels of ``station_coords``) in the
        order they are first encountered. Each station appears at most once,
        even if ``station_coords`` holds several rows for the same name.
        Rows with a missing name or missing coordinates are ignored.
    """
    destination = (destination_lat, destination_lng)
    nearby_stations = []
    already_added = set()
    for station, coords in station_coords.iterrows():
        if pd.isna(station) or station in already_added:
            continue
        lat, lng = coords['start_lat'], coords['start_lng']
        if pd.isna(lat) or pd.isna(lng):
            # No distance can be computed from missing coordinates.
            continue
        if geodesic(destination, (lat, lng)).miles <= radius:
            already_added.add(station)
            nearby_stations.append(station)
    return nearby_stations

In [None]:
# Step 5: Train a Model on Hourly Trip Counts (Used as a Proxy for Dock Availability)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Count the trips that start at each station in each hour of the day.
# NOTE: the regression target built here is a trip count; no direct dock
# occupancy figure is used anywhere in this cell.
group_keys = ['start_station_name', 'start_hour']
dock_availability = (
    data.groupby(group_keys)
    .size()
    .reset_index(name='trip_count')
)

# One-hot encode the station name (the hour stays numeric) and pick the target.
features = pd.get_dummies(dock_availability[group_keys], drop_first=True)
target = dock_availability['trip_count']

# Hold out 20% of the (station, hour) rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit the regressor; fit() returns the estimator itself.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Report R^2 on the held-out rows.
test_r2 = model.score(X_test, y_test)
print(f"Model R^2 Score: {test_r2:.2f}")

In [None]:
# Step 6: Prediction Function
def predict_dock_availability(start_station, start_hour, destination_lat, destination_lng, travel_time):
    """Predict the model output for every station near the destination.

    The arrival hour is estimated by adding the whole hours contained in
    ``travel_time`` to ``start_hour`` (wrapping at 24). Each station returned
    by ``get_nearby_docks`` is then encoded the same way as the training
    features and passed to the module-level ``model``.

    Args:
        start_station: Name of the origin station. Not used by the
            prediction; kept so existing callers keep working.
        start_hour: Hour of day at which the trip starts.
        destination_lat: Latitude of the destination point.
        destination_lng: Longitude of the destination point.
        travel_time: Expected travel time in minutes.

    Returns:
        A dict mapping each nearby station name to the value predicted by
        ``model`` for that station at the arrival hour. The dict is empty
        when no station lies within the search radius.
    """
    # Only whole hours of travel shift the arrival hour.
    arrival_hour = (start_hour + travel_time // 60) % 24

    nearby_stations = get_nearby_docks(destination_lat, destination_lng)
    if not nearby_stations:
        # Nothing to predict; avoid calling the model on an empty frame.
        return {}

    # One row per nearby station, all sharing the same arrival hour.
    batch = pd.DataFrame({
        'start_station_name': nearby_stations,
        'start_hour': arrival_hour,
    })

    # Encode WITHOUT drop_first: drop_first removes the first category present
    # in *this* frame, which would erase a real station's indicator column
    # (with a single-row frame it erases the only one, so every station would
    # be encoded as all zeros and receive the same prediction).
    encoded = pd.get_dummies(batch)
    # Align to the training layout. The station that was dropped as the
    # baseline during training has no column in `features`; reindex discards
    # its indicator here, leaving the all-zero encoding the model expects.
    encoded = encoded.reindex(columns=features.columns, fill_value=0)

    predicted = model.predict(encoded)
    return dict(zip(nearby_stations, predicted))

In [None]:
# Example Prediction Usage
# Example inputs: origin station and departure hour, the destination point,
# and the expected ride duration.
start_station = 'Station A'
start_hour = 9
destination_lat = 40.7500  # Example coordinates
destination_lng = -73.9900
travel_time = 15  # in minutes

# Run the prediction for the stations around the destination and show it.
predicted_dock_availability = predict_dock_availability(
    start_station=start_station,
    start_hour=start_hour,
    destination_lat=destination_lat,
    destination_lng=destination_lng,
    travel_time=travel_time,
)
print("Predicted Dock Availability:", predicted_dock_availability)