In [421]:
import pandas as pd
import os
import numpy as np
import pygeohash as pgh
import plotly.express as px

In [422]:
# NOTE(review): absolute, machine-specific working directory. The relative
# 'datasets/...' paths used by later cells are resolved against it, so the
# notebook only runs unchanged on a machine that has this exact folder.
os.chdir('C:/Users/wissam_T/Desktop/5th/2nd semester/dm/h.w 1/project')

In [423]:
from project.functions import helper_functions as hf

In [424]:
# Load the merged trips/stations table; the path is relative to the current
# working directory.
df_merged = pd.read_csv('datasets/Merged_trips_with_stations.csv')

In [425]:
df_merged.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'member_casual', 'duration_min', 'geometry_start',
       'geometry_end', 'Zone_start_zone_name', 'Zone_start_GIS_ID',
       'Zone_start_OBJECTID', 'Zone_start_geometry', 'Zone_end_zone_name',
       'Zone_end_GIS_ID', 'Zone_end_OBJECTID', 'Zone_end_geometry', 'date',
       'Weather_temp', 'Weather_windspeedmean', 'Weather_sunrise',
       'Weather_sunset', 'Weather_conditions', 'Bikeshare_NAME_start',
       'Bikeshare_STATION_TYPE_start', 'Bikeshare_CAPACITY_start',
       'Bikeshare_REGION_ID_start', 'Bikeshare_REGION_NAME_start',
       'Bikeshare_geometry_start', 'Bikeshare_NAME_end',
       'Bikeshare_STATION_TYPE_end', 'Bikeshare_CAPACITY_end',
       'Bikeshare_REGION_ID_end', 'Bikeshare_REGION_NAME_end',
       'Bikeshare_geometry_end', 'StartStation_NAME', 'EndStation_NAME'],
      dtype='object')

In [426]:
# Missing-value count for the start and end station capacity columns.
for capacity_col in ('Bikeshare_CAPACITY_start', 'Bikeshare_CAPACITY_end'):
    print(df_merged[capacity_col].isna().sum())

0
28


In [427]:
df_merged.shape

(97, 40)

In [428]:
# Work on a copy so that df_merged keeps the data exactly as loaded.
df_merged_copy = df_merged.copy()

In [429]:
# Parse the WKT-style "POINT (<lon> <lat>)" strings of the start and end
# station geometry columns into numeric longitude/latitude columns.
df_modified = pd.DataFrame(df_merged_copy)

# One pattern captures both coordinates, so a row is either fully parsed or
# left as NaN in both columns. Both numbers may be negative and the decimal
# part is optional (the previous latitude pattern rejected negative values).
point_pattern = r'POINT\s*\(\s*(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)\s*\)'

for side in ('start', 'end'):
    # str.extract with expand=True returns one column per capture group:
    # column 0 = longitude, column 1 = latitude; unmatched rows become NaN.
    coords = df_modified[f'Bikeshare_geometry_{side}'].str.extract(point_pattern).astype(float)
    df_modified[f'{side}_longitude'] = coords[0]
    df_modified[f'{side}_latitude'] = coords[1]


In [430]:
# Temporary lookup key: start coordinates rounded to 5 decimals.
df_modified['rounded_start_lat'] = df_modified['start_latitude'].round(5)
df_modified['rounded_start_lon'] = df_modified['start_longitude'].round(5)

# Reference table: among rows that have an end station name and both start
# coordinates, keep the first row per rounded start coordinate together with
# its end-side station/zone attributes.
ref_from_start = df_modified[
    df_modified['end_station_name'].notna() &
    df_modified['start_latitude'].notna() &
    df_modified['start_longitude'].notna()
    ][
    ['rounded_start_lat', 'rounded_start_lon', 
     'end_station_name', 'end_station_id',
     'end_latitude', 'end_longitude',
     'Bikeshare_NAME_end', 'Bikeshare_STATION_TYPE_end',
     'Bikeshare_CAPACITY_end', 'Bikeshare_REGION_ID_end',
     'Bikeshare_REGION_NAME_end', 'Bikeshare_geometry_end',
     'Zone_end_zone_name', 'Zone_end_GIS_ID',
     'Zone_end_OBJECTID', 'Zone_end_geometry',
     'EndStation_NAME']
].drop_duplicates(subset=['rounded_start_lat', 'rounded_start_lon'])

# Index the reference table by the rounded start coordinates.
ref_from_start.set_index(['rounded_start_lat', 'rounded_start_lon'], inplace=True)

# End-side columns that are rewritten through the helper below.
end_cols_to_fill = [
    'end_station_name', 'end_station_id',
    'end_latitude', 'end_longitude',
    'Bikeshare_NAME_end', 'Bikeshare_STATION_TYPE_end',
    'Bikeshare_CAPACITY_end', 'Bikeshare_REGION_ID_end',
    'Bikeshare_REGION_NAME_end', 'Bikeshare_geometry_end',
    'Zone_end_zone_name', 'Zone_end_GIS_ID',
    'Zone_end_OBJECTID', 'Zone_end_geometry',
    'EndStation_NAME'
]

# Each listed column is replaced by the row-wise result of
# hf.fill_from_start_coords(row, col, ref_from_start).
# NOTE(review): the helper is defined in project.functions and is not visible
# here; presumably it keeps existing values and only fills missing ones from
# ref_from_start -- confirm against the helper.
# NOTE(review): this runs one full row-wise apply per column.
for col in end_cols_to_fill:
    df_modified[col] = df_modified.apply(lambda row: hf.fill_from_start_coords(row, col, ref_from_start), axis=1)

# The rounded key columns were only needed for the lookup.
df_modified.drop(columns=['rounded_start_lat', 'rounded_start_lon'], inplace=True)



In [431]:
def assign_size_category(cap, low=None, high=None):
    """Label a station capacity as 'Small', 'Medium' or 'Large'.

    Parameters
    ----------
    cap : number
        Station capacity.
    low, high : number, optional
        Upper bounds (inclusive) of the 'Small' and 'Medium' classes. When
        omitted, the notebook-level ``low_threshold`` / ``high_threshold``
        are used, so existing ``.apply(assign_size_category)`` calls keep
        working; those globals must be defined before such a call.

    Returns
    -------
    str or float
        The size label, or NaN when ``cap`` is missing. A missing capacity
        used to fall through both comparisons and be labelled 'Large'.
    """
    if pd.isna(cap):
        return np.nan
    # Resolve the globals lazily so explicit thresholds never need them.
    if low is None:
        low = low_threshold
    if high is None:
        high = high_threshold
    if cap <= low:
        return 'Small'
    if cap <= high:
        return 'Medium'
    return 'Large'

In [432]:
# Build one (station_id, capacity) table from the start and end sides of the
# trips, classify every station by capacity, and attach the class to each trip
# as start_station_size / end_station_size.
# NOTE(review): capacities are read from df_merged (as loaded), not from
# df_modified with its filled end columns -- confirm this is intended.
start_stations = df_merged[['start_station_id', 'Bikeshare_CAPACITY_start']].rename(
    columns={'start_station_id': 'station_id', 'Bikeshare_CAPACITY_start': 'capacity'}
)

end_stations = df_merged[['end_station_id', 'Bikeshare_CAPACITY_end']].rename(
    columns={'end_station_id': 'station_id', 'Bikeshare_CAPACITY_end': 'capacity'}
)

# Combine both into one list of stations
stations = pd.concat([start_stations, end_stations])

# Remove rows without a capacity or without an id, then keep one row per
# station. Rows with a null station_id are dropped as well because merge()
# matches null keys against each other, which would hand an arbitrary size to
# every trip whose station id is missing.
stations = stations.dropna(subset=['station_id', 'capacity'])
stations = stations.drop_duplicates(subset='station_id')

# Drop size columns left over from a previous run so the cell can be re-run.
df_modified = df_modified.drop(columns=['start_station_size', 'end_station_size'], errors='ignore')

# The thresholds must exist BEFORE assign_size_category is applied; the
# earlier version applied it first and raised NameError on a fresh kernel.
low_threshold = np.percentile(stations['capacity'], 33)
high_threshold = np.percentile(stations['capacity'], 66)

# Classify each station by capacity
stations['station_size'] = stations['capacity'].apply(assign_size_category)

# Merge start station sizes
df_modified = df_modified.merge(
    stations[['station_id', 'station_size']],
    left_on='start_station_id',
    right_on='station_id',
    how='left'
).rename(columns={'station_size': 'start_station_size'}).drop(columns=['station_id'])

# Merge end station sizes
df_modified = df_modified.merge(
    stations[['station_id', 'station_size']],
    left_on='end_station_id',
    right_on='station_id',
    how='left'
).rename(columns={'station_size': 'end_station_size'}).drop(columns=['station_id'])


# Now `stations` has: station_id, capacity, and station_size
print(stations.head())



    station_id  capacity station_size
0      31519.0      15.0        Small
7      31651.0      19.0       Medium
9      31939.0      17.0       Medium
10     30200.0      27.0        Large
11     32265.0      12.0        Small


In [433]:
# Keep only stations with a known capacity, one row per station id.
stations = (
    stations
    .dropna(subset=['capacity'])
    .drop_duplicates(subset='station_id')
)

In [434]:
df_modified.shape

(97, 46)

In [435]:
df_modified

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,member_casual,duration_min,...,Bikeshare_REGION_NAME_end,Bikeshare_geometry_end,StartStation_NAME,EndStation_NAME,start_longitude,start_latitude,end_longitude,end_latitude,start_station_size,end_station_size
0,748A93D7DE8A41CD,classic_bike,2024-01-25 15:49:59,2024-01-25 15:52:35,1st & O St NW,31519,1st & L St NW,31677.0,member,2.600000,...,DCA-CABI,POINT (-77.011987 38.903819),1st & O St NW,1st & L St NW,-77.012365,38.908643,-77.011987,38.903819,Small,Medium
1,75CBFD136F06305B,classic_bike,2024-01-02 16:44:58,2024-01-02 16:53:25,1st & O St NW,31519,4th & College St NW,31138.0,member,8.450000,...,DCA-CABI,POINT (-77.018135 38.921233),1st & O St NW,4th & College St NW,-77.012365,38.908643,-77.018135,38.921233,Small,Small
2,0536C9720F87E04C,classic_bike,2024-01-24 15:40:15,2024-01-24 15:43:55,1st & O St NW,31519,1st & L St NW,31677.0,member,3.666667,...,DCA-CABI,POINT (-77.011987 38.903819),1st & O St NW,1st & L St NW,-77.012365,38.908643,-77.011987,38.903819,Small,Medium
3,9E17390C218783B5,classic_bike,2024-01-04 15:35:00,2024-01-04 15:37:35,1st & O St NW,31519,1st & L St NW,31677.0,member,2.583333,...,DCA-CABI,POINT (-77.011987 38.903819),1st & O St NW,1st & L St NW,-77.012365,38.908643,-77.011987,38.903819,Small,Medium
4,00727D0E773CDFF7,electric_bike,2024-01-05 12:27:58,2024-01-05 12:35:40,1st & O St NW,31519,10th & G St NW,31274.0,casual,7.700000,...,DCA-CABI,POINT (-77.026235 38.898243),1st & O St NW,10th & G St NW,-77.012365,38.908643,-77.026235,38.898243,Small,Large
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,7F6EB9E56D972AD2,classic_bike,2024-01-05 22:58:07,2024-01-05 23:04:08,1st & O St NW,31519,11th & O St NW,31286.0,member,6.016667,...,DCA-CABI,POINT (-77.027088 38.908431),1st & O St NW,11th & O St NW,-77.012365,38.908643,-77.027088,38.908431,Small,Large
93,C6FF417383F01E06,classic_bike,2024-01-13 12:45:26,2024-01-13 12:51:39,Alabama & MLK Ave SE,31800,Alabama Ave & Stanton Rd SE / Shops at Park Vi...,31813.0,casual,6.216667,...,DCA-CABI,POINT (-76.98185 38.84627),Alabama & MLK Ave SE,Alabama Ave & Stanton Rd SE / Shops at Park Vi...,-76.999376,38.843230,-76.981850,38.846270,Small,Medium
94,8DD20FEDB618CE47,classic_bike,2024-01-08 19:11:35,2024-01-08 19:15:42,1st & O St NW,31519,11th & O St NW,31286.0,member,4.116667,...,DCA-CABI,POINT (-77.027088 38.908431),1st & O St NW,11th & O St NW,-77.012365,38.908643,-77.027088,38.908431,Small,Large
95,03A98F39C412ACD9,classic_bike,2024-01-02 17:11:18,2024-01-02 17:23:10,11th & Park Rd NW,31651,11th & O St NW,31286.0,casual,11.866667,...,DCA-CABI,POINT (-77.027088 38.908431),11th & Park Rd NW,11th & O St NW,-77.028247,38.931322,-77.027088,38.908431,Medium,Large


In [436]:
df_modified.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,member_casual,duration_min,...,Bikeshare_REGION_NAME_end,Bikeshare_geometry_end,StartStation_NAME,EndStation_NAME,start_longitude,start_latitude,end_longitude,end_latitude,start_station_size,end_station_size
0,748A93D7DE8A41CD,classic_bike,2024-01-25 15:49:59,2024-01-25 15:52:35,1st & O St NW,31519,1st & L St NW,31677.0,member,2.6,...,DCA-CABI,POINT (-77.011987 38.903819),1st & O St NW,1st & L St NW,-77.012365,38.908643,-77.011987,38.903819,Small,Medium
1,75CBFD136F06305B,classic_bike,2024-01-02 16:44:58,2024-01-02 16:53:25,1st & O St NW,31519,4th & College St NW,31138.0,member,8.45,...,DCA-CABI,POINT (-77.018135 38.921233),1st & O St NW,4th & College St NW,-77.012365,38.908643,-77.018135,38.921233,Small,Small
2,0536C9720F87E04C,classic_bike,2024-01-24 15:40:15,2024-01-24 15:43:55,1st & O St NW,31519,1st & L St NW,31677.0,member,3.666667,...,DCA-CABI,POINT (-77.011987 38.903819),1st & O St NW,1st & L St NW,-77.012365,38.908643,-77.011987,38.903819,Small,Medium
3,9E17390C218783B5,classic_bike,2024-01-04 15:35:00,2024-01-04 15:37:35,1st & O St NW,31519,1st & L St NW,31677.0,member,2.583333,...,DCA-CABI,POINT (-77.011987 38.903819),1st & O St NW,1st & L St NW,-77.012365,38.908643,-77.011987,38.903819,Small,Medium
4,00727D0E773CDFF7,electric_bike,2024-01-05 12:27:58,2024-01-05 12:35:40,1st & O St NW,31519,10th & G St NW,31274.0,casual,7.7,...,DCA-CABI,POINT (-77.026235 38.898243),1st & O St NW,10th & G St NW,-77.012365,38.908643,-77.026235,38.898243,Small,Large


In [437]:
# Reference points for the "near a mall" features, one row per location.
# Later cells pass this array to hf.distance_to_closest_mall together with a
# (latitude, longitude) pair; rows are presumably [latitude, longitude] in
# decimal degrees -- confirm against the helper.
shopping_centers = np.array([
    [38.9009, -77.0260],   # CityCenterDC
    [38.8971, -77.0064],   # Union Station
    [38.9057, -77.0631],   # Georgetown Park
    [38.8631, -77.0599],   # Pentagon City
    [38.8765, -77.0316],   # The Wharf
    [38.8741, -77.0028],   # Capitol Riverfront
    [38.9613, -77.0840],   # Friendship Heights
])

In [438]:
# Distance cut-off for the "near a mall" flags (same unit as the value
# returned by hf.distance_to_closest_mall; presumably metres -- confirm).
proximity = 300

# Flag column -> (latitude column, longitude column) it is derived from.
mall_flag_columns = {
    'start_near_any_mall': ('start_latitude', 'start_longitude'),
    'end_near_any_mall': ('end_latitude', 'end_longitude'),
}

# 1 when the closest reference location is within `proximity`, else 0.
for flag_col, (lat_col, lon_col) in mall_flag_columns.items():
    df_modified[flag_col] = df_modified.apply(
        lambda trip: int(
            hf.distance_to_closest_mall(trip[lat_col], trip[lon_col], shopping_centers) <= proximity
        ),
        axis=1,
    )

In [439]:
# Report how many trips start / end near one of the reference locations.
for flag_col in ('start_near_any_mall', 'end_near_any_mall'):
    count_of_ones = df_modified[flag_col].eq(1).sum()
    print(f"Number of 1s in '{flag_col}': {count_of_ones}")

Number of 1s in 'start_near_any_mall': 2
Number of 1s in 'end_near_any_mall': 17


In [440]:
# Distance from each trip's end station to the closest reference location,
# as returned by hf.distance_to_closest_mall.
end_mall_distances = df_modified.apply(
    lambda trip: hf.distance_to_closest_mall(
        trip['end_latitude'], trip['end_longitude'], shopping_centers
    ),
    axis=1,
)
df_modified['end_distance_to_closest_mall_m'] = end_mall_distances

In [441]:
# Persist the current state of df_modified (row index not written).
# NOTE(review): this runs before the geohash, traffic, weather and price
# columns are added in later cells, so the file does not contain them.
df_modified.to_csv('datasets/modified.csv', index=False)

In [442]:
# Per-column missing-value report: absolute count and share of all rows.
total_rows = len(df_modified)
null_counts = df_modified.isna().sum()
null_percentages = null_counts.div(total_rows).mul(100)

# Most affected columns first.
null_summary_df = pd.DataFrame(
    {'Null Count': null_counts, 'Null Percentage': null_percentages}
).sort_values(by='Null Count', ascending=False)

# Rows that have at least one missing value, kept for inspection.
has_any_null = df_modified.isna().any(axis=1)
rows_with_nulls_df = df_modified[has_any_null]

null_summary_df

Unnamed: 0,Null Count,Null Percentage
Zone_start_OBJECTID,31,31.958763
Zone_start_geometry,31,31.958763
Zone_start_zone_name,31,31.958763
Zone_start_GIS_ID,31,31.958763
Zone_end_geometry,22,22.680412
Zone_end_OBJECTID,22,22.680412
Zone_end_GIS_ID,22,22.680412
Zone_end_zone_name,22,22.680412
Bikeshare_NAME_end,5,5.154639
Bikeshare_STATION_TYPE_end,5,5.154639


In [443]:
# Geohash length used for both the start and the end location.
precision = 6

# (output column, latitude column, longitude column)
geohash_specs = (
    ('start_geohash', 'start_latitude', 'start_longitude'),
    ('end_geohash', 'end_latitude', 'end_longitude'),
)

# Encode each coordinate pair; rows with a missing coordinate stay NaN.
for hash_col, lat_col, lon_col in geohash_specs:
    df_modified[hash_col] = df_modified.apply(
        lambda trip: np.nan
        if np.isnan(trip[lat_col]) or np.isnan(trip[lon_col])
        else pgh.encode(trip[lat_col], trip[lon_col], precision=precision),
        axis=1,
    )

In [444]:
# Trips per (start geohash, date); only days with at least one trip appear.
daily_trip_counts = (
    df_modified
    .groupby(['start_geohash', 'date'])
    .size()
    .reset_index(name='trip_count')
)

# Mean of those daily counts for every start geohash.
avg_trips_per_geohash = (
    daily_trip_counts
    .groupby('start_geohash')['trip_count']
    .mean()
    .reset_index()
    .rename(columns={'trip_count': 'avg_daily_trips'})
)

# Cut points for the traffic classes: 22nd and 88th percentile of the means.
quantiles = avg_trips_per_geohash['avg_daily_trips'].quantile([0.22, 0.88])
low_thresh, high_thresh = quantiles[0.22], quantiles[0.88]

In [445]:
def categorize_traffic(avg, low=None, high=None):
    """Label an average daily trip count as Low / Medium / High Traffic.

    Parameters
    ----------
    avg : number
        Average number of daily trips.
    low, high : number, optional
        Upper bounds (inclusive) of 'Low Traffic' and 'Medium Traffic'. When
        omitted, the notebook-level ``low_thresh`` / ``high_thresh`` are used,
        so existing ``.apply(categorize_traffic)`` calls keep working; those
        globals must be defined before such a call.

    Returns
    -------
    str or float
        The traffic label, or NaN when ``avg`` is missing. A missing value
        used to fall through both comparisons and be labelled 'High Traffic'.
    """
    if pd.isna(avg):
        return np.nan
    # Resolve the globals lazily so explicit thresholds never need them.
    if low is None:
        low = low_thresh
    if high is None:
        high = high_thresh
    if avg <= low:
        return 'Low Traffic'
    if avg <= high:
        return 'Medium Traffic'
    return 'High Traffic'

In [446]:
# Classify every start geohash by its average daily trips and attach the
# class to each trip.
avg_trips_per_geohash['traffic_category'] = avg_trips_per_geohash['avg_daily_trips'].apply(categorize_traffic)

# Drop a traffic_category column left over from a previous run first;
# otherwise re-running this cell yields traffic_category_x / _y. The
# station-size cell uses the same guard for its size columns.
df_modified = df_modified.drop(columns=['traffic_category'], errors='ignore').merge(
    avg_trips_per_geohash[['start_geohash', 'traffic_category']],
    on='start_geohash',
    how='left'
)

In [447]:
df_modified.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'member_casual', 'duration_min', 'geometry_start',
       'geometry_end', 'Zone_start_zone_name', 'Zone_start_GIS_ID',
       'Zone_start_OBJECTID', 'Zone_start_geometry', 'Zone_end_zone_name',
       'Zone_end_GIS_ID', 'Zone_end_OBJECTID', 'Zone_end_geometry', 'date',
       'Weather_temp', 'Weather_windspeedmean', 'Weather_sunrise',
       'Weather_sunset', 'Weather_conditions', 'Bikeshare_NAME_start',
       'Bikeshare_STATION_TYPE_start', 'Bikeshare_CAPACITY_start',
       'Bikeshare_REGION_ID_start', 'Bikeshare_REGION_NAME_start',
       'Bikeshare_geometry_start', 'Bikeshare_NAME_end',
       'Bikeshare_STATION_TYPE_end', 'Bikeshare_CAPACITY_end',
       'Bikeshare_REGION_ID_end', 'Bikeshare_REGION_NAME_end',
       'Bikeshare_geometry_end', 'StartStation_NAME', 'EndStation_NAME',
       'start_longitude', 'start_latitude', 'en

In [448]:
# df_modified.to_csv('datasets/modified.csv', index=False)

In [449]:
def classify_weather(condition):
    """Map a free-text weather description to 'Sunny', 'Rainy' or 'Cloudy'.

    Missing input yields NaN. Matching is case-insensitive and by substring;
    the groups are checked in the order Sunny, Rainy, Cloudy and the first
    hit wins. Text that matches no keyword is labelled 'Cloudy'.
    """
    if pd.isna(condition):
        return np.nan

    text = condition.lower()
    keyword_groups = (
        ('Sunny', ('clear', 'sunny')),
        ('Rainy', ('rain', 'storm', 'shower', 'drizzle')),
        ('Cloudy', ('cloud', 'overcast', 'fog', 'mist')),
    )
    for label, keywords in keyword_groups:
        for keyword in keywords:
            if keyword in text:
                return label
    # Fallback for descriptions without any known keyword.
    return 'Cloudy'

In [450]:
# Raw weather columns that are removed once the category has been derived.
weather_source_columns = [
    'Weather_temp',
    'Weather_windspeedmean',
    'Weather_sunrise',
    'Weather_sunset',
    'Weather_conditions',
]

# Derive the coarse weather class, then drop the raw weather columns.
weather_labels = df_modified['Weather_conditions'].apply(classify_weather)
df_modified['weather_category'] = weather_labels
df_modified.drop(columns=weather_source_columns, inplace=True)

In [451]:
# Raw geometry strings are no longer needed after the numeric
# longitude/latitude columns have been extracted.
raw_geometry_columns = [
    'Bikeshare_geometry_end',
    'Bikeshare_geometry_start',
    'geometry_end',
    'geometry_start',
]
df_modified.drop(columns=raw_geometry_columns, inplace=True)

In [452]:
df_modified

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,member_casual,duration_min,...,end_latitude,start_station_size,end_station_size,start_near_any_mall,end_near_any_mall,end_distance_to_closest_mall_m,start_geohash,end_geohash,traffic_category,weather_category
0,748A93D7DE8A41CD,classic_bike,2024-01-25 15:49:59,2024-01-25 15:52:35,1st & O St NW,31519,1st & L St NW,31677.0,member,2.600000,...,38.903819,Small,Medium,0,0,889.908267,dqcjre,dqcjrd,High Traffic,Rainy
1,75CBFD136F06305B,classic_bike,2024-01-02 16:44:58,2024-01-02 16:53:25,1st & O St NW,31519,4th & College St NW,31138.0,member,8.450000,...,38.921233,Small,Small,0,0,2361.117404,dqcjre,dqcjrm,High Traffic,Cloudy
2,0536C9720F87E04C,classic_bike,2024-01-24 15:40:15,2024-01-24 15:43:55,1st & O St NW,31519,1st & L St NW,31677.0,member,3.666667,...,38.903819,Small,Medium,0,0,889.908267,dqcjre,dqcjrd,High Traffic,Cloudy
3,9E17390C218783B5,classic_bike,2024-01-04 15:35:00,2024-01-04 15:37:35,1st & O St NW,31519,1st & L St NW,31677.0,member,2.583333,...,38.903819,Small,Medium,0,0,889.908267,dqcjre,dqcjrd,High Traffic,Cloudy
4,00727D0E773CDFF7,electric_bike,2024-01-05 12:27:58,2024-01-05 12:35:40,1st & O St NW,31519,10th & G St NW,31274.0,casual,7.700000,...,38.898243,Small,Large,0,1,296.143991,dqcjre,dqcjr1,High Traffic,Cloudy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,7F6EB9E56D972AD2,classic_bike,2024-01-05 22:58:07,2024-01-05 23:04:08,1st & O St NW,31519,11th & O St NW,31286.0,member,6.016667,...,38.908431,Small,Large,0,0,842.684543,dqcjre,dqcjr5,High Traffic,Cloudy
93,C6FF417383F01E06,classic_bike,2024-01-13 12:45:26,2024-01-13 12:51:39,Alabama & MLK Ave SE,31800,Alabama Ave & Stanton Rd SE / Shops at Park Vi...,31813.0,casual,6.216667,...,38.846270,Small,Medium,0,0,3587.020946,dqchzz,dqckbp,Low Traffic,Rainy
94,8DD20FEDB618CE47,classic_bike,2024-01-08 19:11:35,2024-01-08 19:15:42,1st & O St NW,31519,11th & O St NW,31286.0,member,4.116667,...,38.908431,Small,Large,0,0,842.684543,dqcjre,dqcjr5,High Traffic,Cloudy
95,03A98F39C412ACD9,classic_bike,2024-01-02 17:11:18,2024-01-02 17:23:10,11th & Park Rd NW,31651,11th & O St NW,31286.0,casual,11.866667,...,38.908431,Medium,Large,0,0,842.684543,dqcjrp,dqcjr5,Medium Traffic,Cloudy


In [453]:
# Price of every trip, computed row by row with hf.calculate_price.
# NOTE(review): the pricing rule lives in project.functions and is not visible
# here; which columns it reads should be confirmed against the helper.
df_modified['trip_price'] = df_modified.apply(hf.calculate_price, axis=1)

In [454]:
# A trip is billed on the calendar day on which it ends.
df_modified['ended_at'] = pd.to_datetime(df_modified['ended_at'])
df_modified['payment_day'] = df_modified['ended_at'].dt.date

# Revenue per payment day and weather class.
revenue_keys = ['payment_day', 'weather_category']
daily_revenue = df_modified.groupby(revenue_keys)['trip_price'].sum().reset_index()

# Long layout: one row per (day, weather class).
long_format = daily_revenue.copy()

# Wide layout: one row per day, one column per weather class, gaps as 0.
wide_format = (
    daily_revenue
    .pivot(index='payment_day', columns='weather_category', values='trip_price')
    .fillna(0)
    .reset_index()
)
wide_format.columns.name = None  # remove pivoted column name

print("Long Format:", long_format.head(), sep="\n")
print("\nWide Format:", wide_format.head(), sep="\n")

Long Format:
  payment_day weather_category  trip_price
0  2024-01-01            Rainy        3.59
1  2024-01-02           Cloudy        9.18
2  2024-01-03           Cloudy       20.45
3  2024-01-04           Cloudy       11.27
4  2024-01-05           Cloudy        8.68

Wide Format:
  payment_day  Cloudy  Rainy  Sunny
0  2024-01-01    0.00   3.59    0.0
1  2024-01-02    9.18   0.00    0.0
2  2024-01-03   20.45   0.00    0.0
3  2024-01-04   11.27   0.00    0.0
4  2024-01-05    8.68   0.00    0.0


In [484]:
# Figure 1: the five start stations with the most trips.
station_trip_counts = df_modified['start_station_name'].value_counts()
top5_starts = station_trip_counts.nlargest(5).reset_index()
top5_starts.columns = ['start_station_name', 'trip_count']

fig1_labels = {'trip_count': 'Number of Trips', 'start_station_name': 'Start Station'}
fig1 = px.bar(
    top5_starts,
    x='start_station_name',
    y='trip_count',
    color='trip_count',
    color_continuous_scale='Blues',
    labels=fig1_labels,
    title='Top 5 Start Stations by Trip Count',
)
fig1.show()

In [485]:
# Figure 2: number of trips per bike type, split by user type.
bike_subs_dist = (
    df_modified
    .groupby(['rideable_type', 'member_casual'])
    .size()
    .reset_index(name='trip_count')
)

fig2_labels = {
    'rideable_type': 'Bike Type',
    'trip_count': 'Number of Trips',
    'member_casual': 'User Type',
}
fig2 = px.bar(
    bike_subs_dist,
    x='rideable_type',
    y='trip_count',
    color='member_casual',
    barmode='group',
    labels=fig2_labels,
    title='Trips by Bike Type and Subscription Type',
)
fig2.show()

In [486]:
# Figure 3: sunburst of station -> bike type -> user type, restricted to the
# five busiest start stations.
top5_names = top5_starts['start_station_name'].tolist()
is_top5_start = df_modified['start_station_name'].isin(top5_names)
sunburst_df = df_modified[is_top5_start]

sunburst_levels = ['start_station_name', 'rideable_type', 'member_casual']
sunburst_data = sunburst_df.groupby(sunburst_levels).size().reset_index(name='trip_count')

fig3 = px.sunburst(
    sunburst_data,
    path=list(sunburst_levels),
    values='trip_count',
    title='Trip Distribution for Top 5 Start Stations',
)
fig3.show()

In [487]:
# Figure 4a: how many trips start at Small / Medium / Large stations.
size_hist_kwargs = dict(
    x='start_station_size',
    color='start_station_size',
    title='Distribution of Start Station Sizes',
    labels={'start_station_size': 'Station Size'},
)
fig4a = px.histogram(df_modified, **size_hist_kwargs)
fig4a.show()

In [488]:
# 4b. Bar plot: price category vs number of trips
# Price bins. The last bin is open-ended so it matches its '$20+' label; with
# the previous upper edge of 21 every trip above $21 became NaN and silently
# disappeared from the chart. include_lowest=True puts a price of exactly 0
# into the 'Low ($0-4)' bin instead of dropping it.
bins = [0, 4, 7, 20, np.inf]
labels = ['Low ($0-4)', 'Medium ($4-7)', 'High ($7-20)', 'Very High ($20+)']
df_modified['price_category'] = pd.cut(
    df_modified['trip_price'], bins=bins, labels=labels, include_lowest=True
)

# Trips per price category, in bin order.
price_dist = df_modified['price_category'].value_counts().sort_index().reset_index()
price_dist.columns = ['price_category', 'trip_count']
fig4b = px.bar(
    price_dist,
    x='price_category',
    y='trip_count',
    title='Trip Distribution by Price Category',
    labels={'trip_count': 'Number of Trips', 'price_category': 'Price Range'},
    color='price_category'
)

fig4b.show()

In [489]:
# Figure 5: histogram of trip durations in minutes.
duration_hist_kwargs = dict(
    x="duration_min",
    nbins=100,
    title="Distribution of Trip Duration (Minutes)",
    labels={"duration_min": "Duration (minutes)"},
)
fig5 = px.histogram(df_modified, **duration_hist_kwargs)
# Leave a gap between bars; as the last expression this also renders the figure.
fig5.update_layout(bargap=0.2)

In [490]:
# Figure 6: trip duration per bike type, log-scaled y axis.
fig6_labels = {"rideable_type": "Bike Type", "duration_min": "Duration (minutes)"}
fig6 = px.box(
    df_modified,
    x="rideable_type",
    y="duration_min",
    log_y=True,
    labels=fig6_labels,
    title="Trip Duration by Bike Type (Log Scale)",
)
fig6.show()

In [492]:
# Figure 7: trip duration per user type, log-scaled y axis.
fig7_labels = {"member_casual": "User Type", "duration_min": "Duration (minutes)"}
fig7 = px.box(
    df_modified,
    x="member_casual",
    y="duration_min",
    log_y=True,
    labels=fig7_labels,
    title="Trip Duration by Subscription Type",
)
fig7.show()

In [513]:
import folium
from folium.plugins import MarkerCluster

# Map of the stations involved in long trips (as start or end station), with
# one circle per station sized by how many long trips touch it.

# 1. Select long trips. The threshold is in minutes; the filter has always
#    been 25 minutes, so the messages now report that value rather than the
#    incorrect ">24h" wording.
long_trip_threshold_min = 25
long_trips = df_modified[df_modified['duration_min'] > long_trip_threshold_min].copy()
print(f"Found {len(long_trips)} long trips (>{long_trip_threshold_min} min)")

if len(long_trips) == 0:
    print(f"No trips exceed {long_trip_threshold_min} minutes. Adjust your filter or check data.")
else:
    # 2. Station name and coordinates for both ends of each long trip
    start_trips = long_trips[['start_station_name', 'start_latitude', 'start_longitude']]
    end_trips = long_trips[['end_station_name', 'end_latitude', 'end_longitude']]

    # Standardize column names so both sides can be stacked
    start_trips.columns = ['station_name', 'lat', 'lon']
    end_trips.columns = ['station_name', 'lat', 'lon']

    # 3. Combine and drop invalid coordinates
    stations_all = pd.concat([start_trips, end_trips]).dropna(subset=['lat', 'lon'])
    print(f"Stations with valid coords: {len(stations_all)}")

    if len(stations_all) > 0:
        # Count long trips per station
        station_counts = stations_all.groupby(['station_name', 'lat', 'lon']).size().reset_index(name='trip_count')

        # Keep only stations inside the DC bounding box
        station_counts = station_counts[
            (station_counts['lat'].between(38.80, 38.97)) &
            (station_counts['lon'].between(-77.12, -76.90))
        ]
        print("Top stations:\n", station_counts.head())

        # Create the map only if stations remain after the bounding-box filter
        if station_counts.empty:
            print("No long-trip stations fall inside the DC bounding box.")
        else:
            m = folium.Map(location=[38.9072, -77.0369], zoom_start=13)
            for _, row in station_counts.iterrows():
                folium.CircleMarker(
                    location=[row['lat'], row['lon']],
                    radius=row['trip_count'] * 0.5,
                    color='red',
                    fill=True,
                    popup=f"{row['station_name']}: {row['trip_count']} long trips"
                ).add_to(m)
            display(m)  # For Jupyter; use `m.save('map.html')` otherwise
    else:
        print("No stations with valid coordinates found.")

Found 10 long trips (>24h)
Stations with valid coords: 19
Top stations:
                      station_name        lat        lon  trip_count
0                  11th & C St SE  38.885908 -76.991476           2
1             17th & Upshur St NW  38.942146 -77.038684           1
2                  24th & N St NW  38.906600 -77.051520           1
3                   5th & F St NW  38.897222 -77.019347           1
4  8th & Eye St SE / Barracks Row  38.879200 -76.995300           3
