In [2]:
import pandas as pd
import os
import numpy as np
from geopandas import gpd
import pygeohash as pgh

In [3]:
os.chdir('C:/Users/wissam_T/Desktop/5th/2nd semester/dm/h.w 1/project')

In [4]:
df_merged = pd.read_csv('datasets/Merged_trips_with_stations.csv')

In [5]:
df_merged.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'member_casual', 'duration_min', 'geometry_start',
       'geometry_end', 'Zone_start_zone_name', 'Zone_start_GIS_ID',
       'Zone_start_OBJECTID', 'Zone_start_geometry', 'Zone_end_zone_name',
       'Zone_end_GIS_ID', 'Zone_end_OBJECTID', 'Zone_end_geometry', 'date',
       'Weather_temp', 'Weather_windspeedmean', 'Weather_sunrise',
       'Weather_sunset', 'Weather_conditions', 'Bikeshare_NAME_start',
       'Bikeshare_STATION_TYPE_start', 'Bikeshare_CAPACITY_start',
       'Bikeshare_REGION_ID_start', 'Bikeshare_REGION_NAME_start',
       'Bikeshare_geometry_start', 'Bikeshare_NAME_end',
       'Bikeshare_STATION_TYPE_end', 'Bikeshare_CAPACITY_end',
       'Bikeshare_REGION_ID_end', 'Bikeshare_REGION_NAME_end',
       'Bikeshare_geometry_end', 'StartStation_NAME', 'EndStation_NAME'],
      dtype='object')

In [6]:
print(df_merged['Bikeshare_CAPACITY_start'].isna().sum())
print(df_merged['Bikeshare_CAPACITY_end'].isna().sum())

0
28


In [7]:
df_merged.shape

(97, 40)

In [8]:
df_merged_copy = df_merged.copy()

In [9]:
start_stations = df_merged[['start_station_id', 'Bikeshare_CAPACITY_start']].rename(
    columns={'start_station_id': 'station_id', 'Bikeshare_CAPACITY_start': 'capacity'}
)

end_stations = df_merged[['end_station_id', 'Bikeshare_CAPACITY_end']].rename(
    columns={'end_station_id': 'station_id', 'Bikeshare_CAPACITY_end': 'capacity'}
)

# Combine both into one list of stations
stations = pd.concat([start_stations, end_stations])

# Remove nulls and duplicates (in case some stations appear twice)
stations = stations.drop_duplicates(subset='station_id')

# Step 2: Calculate thresholds (33rd and 66th percentiles)
low_threshold = np.percentile(stations['capacity'], 33)
high_threshold = np.percentile(stations['capacity'], 66)

# Step 3: Define a function to assign category
def assign_size_category(cap):
    if cap < low_threshold:
        return 'Small'
    elif cap < high_threshold:
        return 'Medium'
    else:
        return 'Large'

# Step 4: Apply the function to create the new column
stations['station_size'] = stations['capacity'].apply(assign_size_category)

# Now `stations` has: station_id, capacity, and station_size
print(stations.head())


    station_id  capacity station_size
0      31519.0      15.0        Large
7      31651.0      19.0        Large
9      31939.0      17.0        Large
10     30200.0      27.0        Large
11     32265.0      12.0        Large


In [10]:
# Merge back to the original df for start stations
df_with_sizes = df_merged_copy.merge(
    stations[['station_id', 'station_size']],
    left_on='start_station_id',
    right_on='station_id',
    how='left'
).rename(columns={'station_size': 'start_station_size'}).drop(columns=['station_id'])

# Merge back again for end stations
df_with_sizes = df_with_sizes.merge(
    stations[['station_id', 'station_size']],
    left_on='end_station_id',
    right_on='station_id',
    how='left'
).rename(columns={'station_size': 'end_station_size'}).drop(columns=['station_id'])


In [11]:
df_with_sizes.shape

(97, 42)

In [12]:
df_merged_copy.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'member_casual', 'duration_min', 'geometry_start',
       'geometry_end', 'Zone_start_zone_name', 'Zone_start_GIS_ID',
       'Zone_start_OBJECTID', 'Zone_start_geometry', 'Zone_end_zone_name',
       'Zone_end_GIS_ID', 'Zone_end_OBJECTID', 'Zone_end_geometry', 'date',
       'Weather_temp', 'Weather_windspeedmean', 'Weather_sunrise',
       'Weather_sunset', 'Weather_conditions', 'Bikeshare_NAME_start',
       'Bikeshare_STATION_TYPE_start', 'Bikeshare_CAPACITY_start',
       'Bikeshare_REGION_ID_start', 'Bikeshare_REGION_NAME_start',
       'Bikeshare_geometry_start', 'Bikeshare_NAME_end',
       'Bikeshare_STATION_TYPE_end', 'Bikeshare_CAPACITY_end',
       'Bikeshare_REGION_ID_end', 'Bikeshare_REGION_NAME_end',
       'Bikeshare_geometry_end', 'StartStation_NAME', 'EndStation_NAME'],
      dtype='object')

In [13]:
# First, remove the "POINT (" and ")" parts, then split
df_with_coords = pd.DataFrame(df_merged_copy)
df_with_coords['start_longitude'] = df_with_coords['Bikeshare_geometry_start'].str.extract(r'POINT \((-?\d+\.\d+)')[0].astype(float)
df_with_coords['start_latitude'] = df_with_coords['Bikeshare_geometry_start'].str.extract(r'POINT \(-?\d+\.\d+ (\d+\.\d+)\)')[0].astype(float)

df_with_coords['end_longitude'] = df_with_coords['Bikeshare_geometry_end'].str.extract(r'POINT \((-?\d+\.\d+)')[0].astype(float)
df_with_coords['end_latitude'] = df_with_coords['Bikeshare_geometry_end'].str.extract(r'POINT \(-?\d+\.\d+ (\d+\.\d+)\)')[0].astype(float)


In [14]:
df_with_coords.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,member_casual,duration_min,...,Bikeshare_CAPACITY_end,Bikeshare_REGION_ID_end,Bikeshare_REGION_NAME_end,Bikeshare_geometry_end,StartStation_NAME,EndStation_NAME,start_longitude,start_latitude,end_longitude,end_latitude
0,748A93D7DE8A41CD,classic_bike,2024-01-25 15:49:59,2024-01-25 15:52:35,1st & O St NW,31519,1st & L St NW,31677.0,member,2.6,...,19.0,42.0,DCA-CABI,POINT (-77.011987 38.903819),1st & O St NW,1st & L St NW,-77.012365,38.908643,-77.011987,38.903819
1,75CBFD136F06305B,classic_bike,2024-01-02 16:44:58,2024-01-02 16:53:25,1st & O St NW,31519,4th & College St NW,31138.0,member,8.45,...,15.0,42.0,DCA-CABI,POINT (-77.018135 38.921233),1st & O St NW,4th & College St NW,-77.012365,38.908643,-77.018135,38.921233
2,0536C9720F87E04C,classic_bike,2024-01-24 15:40:15,2024-01-24 15:43:55,1st & O St NW,31519,1st & L St NW,31677.0,member,3.666667,...,19.0,42.0,DCA-CABI,POINT (-77.011987 38.903819),1st & O St NW,1st & L St NW,-77.012365,38.908643,-77.011987,38.903819
3,9E17390C218783B5,classic_bike,2024-01-04 15:35:00,2024-01-04 15:37:35,1st & O St NW,31519,1st & L St NW,31677.0,member,2.583333,...,19.0,42.0,DCA-CABI,POINT (-77.011987 38.903819),1st & O St NW,1st & L St NW,-77.012365,38.908643,-77.011987,38.903819
4,00727D0E773CDFF7,electric_bike,2024-01-05 12:27:58,2024-01-05 12:35:40,1st & O St NW,31519,10th & G St NW,31274.0,casual,7.7,...,23.0,42.0,DCA-CABI,POINT (-77.026235 38.898243),1st & O St NW,10th & G St NW,-77.012365,38.908643,-77.026235,38.898243


In [15]:
def haversine_np(lat1, lon1, lat2, lon2):
    R = 6371000
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    
    a = np.sin(delta_phi / 2.0)**2 + \
        np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    
    return R * c

In [16]:
def distance_to_closest_mall(lat, lon):
    if np.isnan(lat) or np.isnan(lon):
        return np.nan
    distances = haversine_np(lat, lon, shopping_centers[:, 0], shopping_centers[:, 1])
    return np.min(distances)

In [17]:
shopping_centers = np.array([
    [38.9009, -77.0260],   # CityCenterDC
    [38.8971, -77.0064],   # Union Station
    [38.9057, -77.0631],   # Georgetown Park
    [38.8631, -77.0599],   # Pentagon City
    [38.8765, -77.0316],   # The Wharf
    [38.8741, -77.0028],   # Capitol Riverfront
    [38.9613, -77.0840],   # Friendship Heights
])

In [18]:
proximity = 300
df_with_coords['start_near_any_mall'] = df_with_coords.apply(
    lambda row: int(distance_to_closest_mall(row['start_latitude'], row['start_longitude']) <= proximity),
    axis=1
)

df_with_coords['end_near_any_mall'] = df_with_coords.apply(
    lambda row: int(distance_to_closest_mall(row['end_latitude'], row['end_longitude']) <= proximity),
    axis=1
)

In [19]:
def closest_mall_info(lat, lon):
    if np.isnan(lat) or np.isnan(lon):
        return (np.nan, np.nan)
    distances = haversine_np(lat, lon, shopping_centers[:, 0], shopping_centers[:, 1])
    idx = np.argmin(distances)
    return distances[idx]

In [20]:
count_of_ones = df_with_coords['start_near_any_mall'].eq(1).sum()

print(f"Number of 1s in '{'start_near_any_mall'}': {count_of_ones}")

count_of_ones = df_with_coords['end_near_any_mall'].eq(1).sum()

print(f"Number of 1s in '{'end_near_any_mall'}': {count_of_ones}")

Number of 1s in 'start_near_any_mall': 2
Number of 1s in 'end_near_any_mall': 9


In [21]:
df_with_coords['end_distance_to_closest_mall_m'] = df_with_coords.apply(
    lambda row: distance_to_closest_mall(row['end_latitude'], row['end_longitude']),
    axis=1
)

In [22]:
df_with_coords.to_csv('datasets/modified.csv', index=False)

In [36]:
precision = 6

df_with_coords['start_geohash'] = df_with_coords.apply(
    lambda row: pgh.encode(row['start_latitude'], row['start_longitude'], precision=precision)
    if not np.isnan(row['start_latitude']) and not np.isnan(row['start_longitude']) else np.nan,
    axis=1
)

df_with_coords['end_geohash'] = df_with_coords.apply(
    lambda row: pgh.encode(row['end_latitude'], row['end_longitude'], precision=precision)
    if not np.isnan(row['end_latitude']) and not np.isnan(row['end_longitude']) else np.nan,
    axis=1
)

In [39]:
# Example: Count daily trips per start_geohash
daily_trip_counts = df_with_coords.groupby(['start_geohash']).size().reset_index(name='trip_count')


In [40]:
daily_trip_counts

Unnamed: 0,start_geohash,trip_count
0,dqchyd,1
1,dqchzz,2
2,dqcj4s,1
3,dqcj71,2
4,dqcjhz,3
5,dqcjj1,5
6,dqcjmb,1
7,dqcjpe,12
8,dqcjps,2
9,dqcjpz,3
