# Analyzing Citibike Data and Predicting Dock Availability

In [None]:
# Step 1: Load the Data
import pandas as pd

# Read the trip CSV into one DataFrame named `data`; every later step
# works on this frame. (The file name suggests January 2023 trips -- confirm.)
data = pd.read_csv(
    filepath_or_buffer='/mnt/data/202301-citibike-tripdata_1.csv',
)

In [None]:
# Step 2: Explore the Data
# Show the first rows to get a feel for the columns and their values.
print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
data.info()

In [None]:
# Step 3: Preprocess the Data
# Parse both trip timestamp columns into datetime values, in place.
for ts_col in ('started_at', 'ended_at'):
    data[ts_col] = pd.to_datetime(data[ts_col])

# Derive calendar features from the trip start time:
#   start_hour  -> hour component of `started_at`
#   day_of_week -> weekday name of `started_at`
trip_start = data['started_at'].dt
data['start_hour'] = trip_start.hour
data['day_of_week'] = trip_start.day_name()

In [None]:
# Step 4: Analyze Dock Availability
# Count trips for every (start station, start hour) pair. Note that the
# resulting `dock_availability` frame holds trip counts per pair.
station_hour_groups = data.groupby(['start_station_name', 'start_hour'])
trips_per_group = station_hour_groups.size()

# Turn the grouped Series back into a flat table with a `trip_count` column.
dock_availability = trips_per_group.reset_index(name='trip_count')
print(dock_availability.head())

In [None]:
# Step 5: Train a Model to Predict Dock Availability
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Inputs: one-hot encoded station name (first category dropped) plus the
# start hour. Target: the number of trips counted for that pair.
predictor_frame = dock_availability[['start_station_name', 'start_hour']]
features = pd.get_dummies(predictor_frame, drop_first=True)
target = dock_availability['trip_count']

# Hold out 20% of the rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest regressor on the training split.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Report R^2 on the held-out rows.
print(f"Model R^2 Score: {model.score(X_test, y_test):.2f}")

In [None]:
# Step 6: Prediction Example
# Build a one-row frame describing the query: a station name and a start hour.
# 'Station A' is a placeholder; replace it with a station name that occurs in
# the training data, otherwise no station column will match (see below).
example_input = pd.DataFrame({
    'start_station_name': ['Station A'],
    'start_hour': [9]
})

# Encode WITHOUT drop_first. A single-row frame has exactly one station
# category, and drop_first=True would drop it, leaving every station column
# at 0 after the reindex below -- i.e. the model would always be queried for
# the training baseline station regardless of the requested one.
example_features = pd.get_dummies(example_input)

# Align to the exact training columns (names and order). Columns missing from
# the example are filled with 0; a column for the training baseline station
# (dropped during training) or for an unseen station is discarded here, which
# leaves all station indicators at 0.
example_features = example_features.reindex(columns=features.columns, fill_value=0)

# The model was fit on `trip_count`, so the value printed below is the
# predicted trip count for this station/hour pair.
prediction = model.predict(example_features)
print(f"Predicted Dock Availability: {prediction[0]:.2f}")