In [96]:
import pandas as pd
import matplotlib.pyplot as plt
from haversine import haversine_vector, Unit, haversine
import osmnx as ox
import pickle
import os
from tqdm import tqdm
import folium
from rtree import index

In [None]:
# Load the bus-stop table; later cells read its bus_stop_id, latitude_current
# and longitude_current columns.
stops_df = pd.read_csv(r"../Dataset/bus_stop.csv")

In [None]:
# Load the raw GPS pings; later cells read DEVICE_ID, IST_DATE, LAT and LONGITUDE.
gps_df = pd.read_csv(r"../Dataset/GPS.csv")

In [None]:
print(len(gps_df))
gps_df.head()

In [None]:
# Drop repeated pings: rows that share DEVICE_ID, IST_DATE, LAT and LONGITUDE
# are collapsed to a single row (drop_duplicates keeps the first occurrence by default).
gps_df = gps_df.drop_duplicates(subset=['DEVICE_ID','IST_DATE','LAT','LONGITUDE'])

In [None]:
# Keep only the stops and pings that fall strictly inside the study-area bounding box.
# Both frames use the same limits; the ping filter used to accept latitudes down to 12
# while the stop filter started at 12.8265.
LAT_MIN, LAT_MAX = 12.8265, 13.2694
LON_MIN, LON_MAX = 77.3740, 77.8313

stops_in_box = (
    (stops_df['latitude_current'] > LAT_MIN) & (stops_df['latitude_current'] < LAT_MAX)
    & (stops_df['longitude_current'] > LON_MIN) & (stops_df['longitude_current'] < LON_MAX)
)
pings_in_box = (
    (gps_df['LAT'] > LAT_MIN) & (gps_df['LAT'] < LAT_MAX)
    & (gps_df['LONGITUDE'] > LON_MIN) & (gps_df['LONGITUDE'] < LON_MAX)
)

# .copy() yields independent frames, so the column assignments made in later cells
# do not write to a slice of the unfiltered data (SettingWithCopyWarning).
stops_df = stops_df[stops_in_box].copy()
gps_df = gps_df[pings_in_box].copy()
print(len(gps_df))
gps_df.head()

In [None]:
# Parse the ping timestamp and derive the calendar columns used for grouping:
# YEAR, DAY (day of week, Monday = 0), DATE and TIME.
ist_timestamp = pd.to_datetime(gps_df['IST_DATE'])
gps_df['IST_DATE'] = ist_timestamp
ist_parts = ist_timestamp.dt
gps_df['YEAR'] = ist_parts.year
gps_df['DAY'] = ist_parts.dayofweek
gps_df['DATE'] = ist_parts.date
gps_df['TIME'] = ist_parts.time

In [None]:
gps_df['DATE'].value_counts()

In [None]:
# Keep only the pings recorded in 2019
recorded_in_2019 = gps_df['YEAR'].eq(2019)
gps_df = gps_df.loc[recorded_in_2019]

In [None]:
# Sort the dataframe by DEVICE_ID and then by IST_DATE, so that within each device
# consecutive rows are consecutive pings (the next cell relies on this row order
# when it shifts values by one row per device).
gps_df = gps_df.sort_values(by=['DEVICE_ID','IST_DATE'])
gps_df.head()

In [None]:
# For every ping, attach the position and timestamp of the same device's previous row.
# The first row of each device has no predecessor and receives NaN / NaT.
pings_by_device = gps_df.groupby('DEVICE_ID')
gps_df['PREV_LAT'] = pings_by_device['LAT'].shift()
gps_df['PREV_LONGITUDE'] = pings_by_device['LONGITUDE'].shift()
gps_df['PREV_IST_DATE'] = pings_by_device['IST_DATE'].shift()
gps_df.head()

In [None]:
# Drop the na values. This removes the first ping of each device (its PREV_* columns
# are empty after the shift above).
# NOTE(review): dropna() without `subset` also removes rows that have a missing value
# in ANY other column — confirm that is intended.
gps_df = gps_df.dropna()

In [None]:
# Calculate the distance between the current row and the previous row using haversine formula.
# Coordinates are passed as (latitude, longitude) pairs and the result is in metres (Unit.METERS).
gps_df['DISTANCE'] = haversine_vector(gps_df[['LAT','LONGITUDE']],gps_df[['PREV_LAT','PREV_LONGITUDE']],Unit.METERS)

In [None]:
# Seconds elapsed between each ping and the same device's previous ping
current_ping_time = pd.to_datetime(gps_df['IST_DATE'])
previous_ping_time = pd.to_datetime(gps_df['PREV_IST_DATE'])
elapsed = current_ping_time - previous_ping_time
gps_df['TIME_DIFF'] = elapsed.dt.total_seconds()

In [None]:
print(len(gps_df))
gps_df.head()

In [None]:
# Scatter of the distance covered between consecutive pings against the time between them
fig, ax = plt.subplots()
ax.scatter(gps_df['DISTANCE'], gps_df['TIME_DIFF'])
ax.set_xlabel('Distance')
ax.set_ylabel('Time Difference')
plt.show()

In [None]:
# Speed between consecutive pings. DISTANCE is in metres and TIME_DIFF in seconds, so the
# 18/5 factor converts m/s to km/h; the 0.0001 added to the denominator avoids dividing
# by zero when two pings share a timestamp.
metres_per_second = gps_df['DISTANCE'] / (gps_df['TIME_DIFF'] + 0.0001)
gps_df['SPEED'] = metres_per_second * 18 / 5

# Speed of every ping against its row index, clipped to the 0-200 range on the y axis
fig, ax = plt.subplots()
ax.scatter(gps_df.index, gps_df['SPEED'])
ax.set_ylim(0, 200)
ax.set_xlabel('Index')
ax.set_ylabel('Speed')
plt.show()

In [None]:
# Number of pings whose speed lies strictly between 0 and 1
slow_but_moving = (gps_df['SPEED'] > 0) & (gps_df['SPEED'] < 1)
gps_df.loc[slow_but_moving, 'SPEED'].count()

In [None]:
gps_df['SPEED'].min()

In [None]:
# Checkpoint of the ping table; the next cell starts from this copy so it can be re-run.
gps_df_copy = gps_df.copy()

In [None]:
# Start again from the checkpoint, then keep only pings that follow the previous ping
# by less than 70 seconds and whose speed is below 100.
gps_df = gps_df_copy.copy()
short_gap = gps_df["TIME_DIFF"] < 70
plausible_speed = gps_df["SPEED"] < 100
gps_df = gps_df.loc[short_gap & plausible_speed, :]

In [None]:
# reset the index so rows are numbered 0..n-1 after the filtering above
gps_df = gps_df.reset_index(drop=True)

# Add column stop_time and initialize it with 0
gps_df['STOP_TIME'] = 0

# Take the rows treated as stationary: speed of at most 1 (not strictly 0)
gps_df0 = gps_df[gps_df['SPEED'] <= 1]

# NOTE(review): the accumulation loop below is commented out, so STOP_TIME stays 0 for
# every row and gps_df0 is not used by any active code in this cell.
# # # Iterate over the dataframe with speed 0 and add the time_diff with the previous rows STOP_TIME to the current row STOP_TIME
# prev_index = 0
# for index,row in tqdm(gps_df0.iterrows(),total=len(gps_df0)):
#     if index > 0 and row['DEVICE_ID'] == gps_df0.loc[prev_index,'DEVICE_ID']:
#         gps_df.loc[index,'STOP_TIME'] = gps_df.loc[index-1,'STOP_TIME'] + row['TIME_DIFF']
#     else:
#         gps_df.loc[index,'STOP_TIME'] = row['TIME_DIFF']
#     prev_index = index
    


In [95]:
# Replace the in-memory ping table with one read from disk. The cells below read
# STOP_TIME, SPEED, DISTANCE, LAT, LONGITUDE and DEVICE_ID from it.
# NOTE(review): presumably an export of the table built above with STOP_TIME filled in;
# the cell that writes this file is not visible here — confirm.
gps_df = pd.read_csv(r"../Dataset/gps_point1.csv")

In [None]:
# Checkpoint of the reloaded ping table; the following cells restart from this copy.
gps_df_copy = gps_df.copy()

In [None]:
# Threshold on STOP_TIME above which a ping counts as a long stop
stop_time = 120
gps_df = gps_df_copy.copy()

# Distinct (LAT, LONGITUDE) pairs at which at least one long stop was recorded
is_long_stop = gps_df['STOP_TIME'] > stop_time
long_stop_pings = gps_df.loc[is_long_stop, ['LAT','LONGITUDE']]
st = long_stop_pings.groupby(['LAT','LONGITUDE']).count().reset_index()
len(st)

In [None]:
# Keep pings with STOP_TIME below `stop_time`, then drop the slow pings (SPEED < 10)
# recorded at a coordinate pair listed in `st`.
gps_df = gps_df_copy.copy()
gps_df = gps_df[gps_df['STOP_TIME'] < stop_time]

# Compare (LAT, LONGITUDE) as a pair. Testing each column with its own isin() would also
# match a ping whose latitude comes from one long-stop location and whose longitude
# comes from a different one.
ping_coords = pd.MultiIndex.from_frame(gps_df[['LAT', 'LONGITUDE']])
long_stop_coords = pd.MultiIndex.from_frame(st[['LAT', 'LONGITUDE']])
at_long_stop = ping_coords.isin(long_stop_coords)

gps_df = gps_df[~(at_long_stop & (gps_df['SPEED'] < 10))]

len(gps_df)

In [None]:
# # Plot the st and stops_df on folium plot in different colours and as circular markers
# m = folium.Map(location=[12.9716,77.5946],zoom_start=12)
# for index,row in st.iterrows():
#     folium.CircleMarker([row['LAT'],row['LONGITUDE']],radius=0.01,color='red',fill=True).add_to(m)
# for index,row in stops_df.iterrows():
#     folium.CircleMarker([row['latitude_current'],row['longitude_current']],radius=0.01,color='blue',fill=True).add_to(m)
# m.save('stops.html')

In [None]:
gps_df_bus1 = gps_df.loc[gps_df['DEVICE_ID'] == 150218177,:]
len(gps_df_bus1)

In [None]:
# Number of pings per DEVICE_ID, busiest device first
pings_per_device = gps_df.groupby('DEVICE_ID').size().reset_index(name='count')
pings_per_device.sort_values(by='count', ascending=False)

In [None]:
# Create one map per device (first five device ids only) with its pings as circular markers.
# The output folder is created first so m.save() does not fail when ./maps is missing.
os.makedirs('./maps', exist_ok=True)
for device_id in gps_df['DEVICE_ID'].unique()[:5]:
    # A dedicated name keeps gps_df_bus1 (selected in an earlier cell) intact, and no loop
    # variable is called `index`, which would shadow the imported rtree `index` module.
    device_pings = gps_df.loc[gps_df['DEVICE_ID'] == device_id, ['LAT', 'LONGITUDE']]
    m = folium.Map(location=[12.9716,77.5946],zoom_start=12)
    for lat, lon in zip(device_pings['LAT'], device_pings['LONGITUDE']):
        folium.CircleMarker([lat, lon],radius=0.01,color='blue',fill=True).add_to(m)
    m.save(f'./maps/bus_{device_id}.html')

In [None]:
# Filter the gps data with respect to speed less than 120, then print the mean speed
# of the remaining pings
gps_df = gps_df[gps_df['SPEED'] < 120]
print(gps_df.SPEED.mean())

In [None]:
# Checkpoint of the single-bus ping table before the map-matching cells add columns to it.
gps_df_bus1_copy = gps_df_bus1.copy()

In [None]:
import osmnx as ox
import pandas as pd
from shapely.geometry import LineString

# The road network is downloaded around each ping inside map_to_nearest_segment,
# so no fixed coordinates or bounding box are configured in this cell.


# Input: gps_df_bus1, whose 'LAT' and 'LONGITUDE' columns hold the raw GPS pings
# of a single device.

# Function to map each GPS point to the nearest road segment
def map_to_nearest_segment(row, graph=None, dist=500):
    """Match one GPS ping to its nearest drivable road node and road edge.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'LAT' and 'LONGITUDE' (e.g. a DataFrame row).
    graph : networkx.MultiDiGraph, optional
        Road network to search. When None (the default) a network is downloaded
        around the ping on every call, which is slow when applied row by row;
        pass a pre-built graph to reuse it.
    dist : number, optional
        Distance in metres around the ping used when the network is downloaded.

    Returns
    -------
    tuple
        (nearest_node, nearest_edge, edge_coordinates). nearest_edge is the
        (u, v, key) tuple returned by osmnx. edge_coordinates is a list of
        (lat, lon) tuples taken from the edge's 'geometry' attribute, or None
        when the edge has no such attribute. (None, None, None) is returned
        when the network cannot be built or the lookup fails.
    """
    lat, lon = row['LAT'], row['LONGITUDE']
    try:
        # The download is inside the try block too: a ping with no drivable road nearby
        # should yield an empty match rather than abort the whole apply().
        G = graph if graph is not None else ox.graph_from_point((lat, lon), dist=dist, network_type='drive')
        # osmnx expects X = longitude and Y = latitude
        nearest_node = ox.distance.nearest_nodes(G, lon, lat)
        edge = ox.distance.nearest_edges(G, lon, lat)
        # Pass the edge key as well: on a multigraph get_edge_data(u, v) returns a dict
        # keyed by edge key, in which the 'geometry' attribute is never found.
        edge_data = G.get_edge_data(*edge)
    except Exception:
        # Exception (not a bare except) lets KeyboardInterrupt stop a long-running apply().
        return None, None, None

    geometry = edge_data.get('geometry') if edge_data else None
    if geometry is None:
        return nearest_node, edge, None
    # Geometry coordinates are (x, y) = (lon, lat); return them as (lat, lon)
    return nearest_node, edge, [(point[1], point[0]) for point in geometry.coords]

# Work on an independent copy of the first 100 pings, so the new columns below are
# written to this frame and not to a slice of a larger one (SettingWithCopyWarning).
gps_df_bus1 = gps_df_bus1.iloc[:100,:].copy()

# Apply the mapping function to each row, with a progress bar
tqdm.pandas(desc="Mapping to nearest segment", total=len(gps_df_bus1))
gps_df_bus1[['nearest_node', 'nearest_edge', 'edge_coordinates']] = gps_df_bus1.progress_apply(map_to_nearest_segment, axis=1, result_type='expand')

# Display the resulting DataFrame
print(gps_df_bus1)


In [None]:
import osmnx as ox
import geopandas as gpd
from shapely.geometry import Point

# Build the road network around the pings being matched. The previous centre point
# (40.748817, -73.985428) lies far outside the 12.8-13.3 N / 77.4-77.8 E box the data
# is filtered to, so no ping could ever be matched against that network.
# NOTE(review): dist=5000 assumes the pings stay within about 5 km of their mean
# position; enlarge it for longer traces.
network_center = (gps_df_bus1['LAT'].mean(), gps_df_bus1['LONGITUDE'].mean())
G = ox.graph_from_point(network_center, dist=5000, network_type='drive')

# Convert the pings to a GeoDataFrame. Shapely points are (x, y) = (longitude, latitude).
geometry = [Point(lon, lat) for lon, lat in zip(gps_df_bus1['LONGITUDE'], gps_df_bus1['LAT'])]
gps_gdf = gpd.GeoDataFrame(gps_df_bus1, geometry=geometry, crs='EPSG:4326')

# Road segments as a GeoDataFrame; reset_index() turns the (u, v, key) edge identifiers
# into ordinary columns so 'key' can be selected after the join.
segments = ox.graph_to_gdfs(G, nodes=False, edges=True).reset_index()

# A GPS point almost never lies exactly on a road line, so an 'intersects' join leaves
# nearly every ping unmatched. Join each ping to its nearest segment instead, in a
# projected (UTM) CRS so the reported distance is in metres.
metric_crs = gps_gdf.estimate_utm_crs()
nearest_segments = gpd.sjoin_nearest(
    gps_gdf.to_crs(metric_crs),
    segments.to_crs(metric_crs),
    how='left',
    distance_col='segment_distance_m',
)

# Display the resulting DataFrame with road segments
print(nearest_segments[['LAT', 'LONGITUDE', 'key', 'highway']])


In [None]:
# Road segments of the current graph G; reset_index() turns the (u, v, key) edge
# identifiers into ordinary columns so 'key' can be selected after the join.
segments = ox.graph_to_gdfs(G, nodes=False, edges=True).reset_index()

# Join each ping to its nearest road segment. An 'intersects' join only matches points
# lying exactly on a road line (and its `op=` keyword is deprecated in favour of
# `predicate=`), so a nearest join in a projected (UTM) CRS is used instead.
metric_crs = gps_gdf.estimate_utm_crs()
nearest_segments = gpd.sjoin_nearest(
    gps_gdf.to_crs(metric_crs),
    segments.to_crs(metric_crs),
    how='left',
    distance_col='segment_distance_m',
)

# Display the resulting DataFrame with road segments
print(nearest_segments[['LAT', 'LONGITUDE', 'key', 'highway']])

In [None]:
# Straight-line distance in metres between the first and the last ping of gps_df_bus1
bus1_positions = gps_df_bus1[['LAT','LONGITUDE']]
haversine(bus1_positions.iloc[0], bus1_positions.iloc[-1], Unit.METERS)

In [None]:
# Value counts of speed
print(len(gps_df))
gps_df['SPEED'].value_counts()

In [None]:
# Get the number of unique gps based on LAT and LONGITUDE
gps_df.groupby(['LAT','LONGITUDE']).size().reset_index().rename(columns={0:'count'}).sort_values(by='count',ascending=False)

In [None]:
# Load the route-point table (read from route_point.csv). Later cells use its route_id,
# route_order and bus_stop_id columns to build stop-to-stop segments.
stop_times_df = pd.read_csv(r"../Dataset/route_point.csv")
stop_times_df.head()

In [None]:
def check_ping(point1,point2,ping):
    """Tell whether `ping` lies inside the axis-aligned box spanned by two points.

    All three arguments are indexable pairs. The box edges count as inside.
    Returns True or False.
    """
    first_low, first_high = min(point1[0], point2[0]), max(point1[0], point2[0])
    if not (ping[0] >= first_low and ping[0] <= first_high):
        return False

    second_low, second_high = min(point1[1], point2[1]), max(point1[1], point2[1])
    if not (ping[1] >= second_low and ping[1] <= second_high):
        return False

    return True

In [None]:
# Preview the first 100 pings whose DISTANCE from the previous ping is more than 100
gps_df.loc[gps_df['DISTANCE'] > 100,:].head(100)

In [None]:
gps_df_bus1 = gps_df.loc[gps_df['DEVICE_ID'] == 150218177,:]
gps_df_bus1.head(100)

In [None]:
# Attach each route point's stop coordinates by joining on bus_stop_id (inner join)
stop_coordinates = stops_df[['bus_stop_id','latitude_current','longitude_current']]
mergerd_busstops_routepoints = stop_times_df.merge(stop_coordinates, on='bus_stop_id')
mergerd_busstops_routepoints.head(5)

In [None]:
# Sort by route_id and then by route_order, so consecutive rows of a route are
# consecutive stops (the destination columns built below rely on this order).
mergerd_busstops_routepoints = mergerd_busstops_routepoints.sort_values(by=['route_id','route_order'])
mergerd_busstops_routepoints.head(5)

In [None]:
print(len(mergerd_busstops_routepoints))
print(mergerd_busstops_routepoints.route_id.nunique())

In [None]:
# Rename latitude_current and longitude_current to origin_lat and origin_long:
# each row's stop becomes the origin of a stop-to-stop segment.
mergerd_busstops_routepoints = mergerd_busstops_routepoints.rename(columns={'latitude_current':'origin_lat','longitude_current':'origin_long'})


In [None]:
# The destination of each segment is the next stop of the same route: take the following
# row's origin coordinates within each route_id.
stops_by_route = mergerd_busstops_routepoints.groupby('route_id')
mergerd_busstops_routepoints['destination_lat'] = stops_by_route['origin_lat'].shift(-1)
mergerd_busstops_routepoints['destination_long'] = stops_by_route['origin_long'].shift(-1)

# The last stop of a route has no following stop, so its destination is missing; drop it
destination_columns = ['destination_lat','destination_long']
mergerd_busstops_routepoints = mergerd_busstops_routepoints.dropna(subset=destination_columns)
mergerd_busstops_routepoints.head(5)


In [None]:
# print the length of the dataframe
print(len(mergerd_busstops_routepoints))

In [None]:
# Keep one row per distinct origin/destination coordinate pair and renumber the rows
# 0..n-1, then report how many segments remain.
mergerd_busstops_routepoints = (
    mergerd_busstops_routepoints
    .drop_duplicates(subset=['origin_lat','origin_long','destination_lat','destination_long'])
    .reset_index(drop=True)
)
print(len(mergerd_busstops_routepoints))

In [None]:
# Create length column: the haversine (straight-line) distance in metres between each
# segment's origin and destination, both given as (latitude, longitude).
mergerd_busstops_routepoints['length'] = haversine_vector(mergerd_busstops_routepoints[['origin_lat','origin_long']],mergerd_busstops_routepoints[['destination_lat','destination_long']],Unit.METERS)


In [None]:
# Scatter plot of the length column against the row index. Both axes are labelled and
# the figure is shown explicitly, matching the length plot drawn later in this notebook.
plt.scatter(mergerd_busstops_routepoints.index,mergerd_busstops_routepoints['length'])
plt.xlabel('Index')
plt.ylabel('Length')
plt.show()

In [None]:
# Checkpoint of the segment table; the next cell restores from this copy.
mergerd_busstops_routepoints_copy = mergerd_busstops_routepoints.copy()

In [None]:
# Restore the segment table from its checkpoint so this cell can be re-run
mergerd_busstops_routepoints = mergerd_busstops_routepoints_copy.copy()

# Optional length filter, currently disabled: it would keep only segments shorter than 2000
# mergerd_busstops_routepoints = mergerd_busstops_routepoints[mergerd_busstops_routepoints['length'] < 2000]

# Plot the scatter plot of length column
plt.scatter(mergerd_busstops_routepoints.index,mergerd_busstops_routepoints['length'])
plt.xlabel('Index')
plt.ylabel('Length')
plt.show()

In [None]:
from rtree import index
# Spatial index holding the bounding box of every stop-to-stop segment
idx = index.Index()

# Boxes are stored as (min_lat, min_lon, max_lat, max_lon); queries against `idx` must use
# the same axis order. The row label of each segment is used as its id in the index.
for i, row in mergerd_busstops_routepoints.iterrows():
    lat_a, lon_a = row['origin_lat'], row['origin_long']
    lat_b, lon_b = row['destination_lat'], row['destination_long']
    south, north = min(lat_a, lat_b), max(lat_a, lat_b)
    west, east = min(lon_a, lon_b), max(lon_a, lon_b)
    idx.insert(i, (south, west, north, east))

In [None]:
# Function to find the stop segment for a given GPS ping
def find_stop_segment(gps_lat, gps_lon, tolerance=1.02):
    """Return the id of a stop-to-stop segment that the ping lies on, or None.

    Candidate segments are those whose bounding box in the R-tree `idx` contains
    the ping. A candidate is accepted when going origin -> ping -> destination is
    at most `tolerance` times as long as going origin -> destination directly.

    Parameters
    ----------
    gps_lat, gps_lon : float
        Ping position. The index is queried with (lat, lon, lat, lon), matching
        the axis order used when the boxes were inserted.
    tolerance : float, optional
        Maximum allowed ratio between the detour through the ping and the
        segment's straight-line length. Defaults to 1.02.

    Returns
    -------
    The row label in `mergerd_busstops_routepoints` of the first accepted
    candidate, in the order the index yields them, or None when the ping does
    not belong to any stop segment.
    """
    ping = (gps_lat, gps_lon)
    endpoint_columns = ['origin_lat', 'origin_long', 'destination_lat', 'destination_long']
    for segment_id in idx.intersection((gps_lat, gps_lon, gps_lat, gps_lon)):
        start_lat, start_lon, end_lat, end_lon = mergerd_busstops_routepoints.loc[segment_id, endpoint_columns]
        inside_lat = min(start_lat, end_lat) <= gps_lat <= max(start_lat, end_lat)
        inside_lon = min(start_lon, end_lon) <= gps_lon <= max(start_lon, end_lon)
        if not (inside_lat and inside_lon):
            continue
        start, end = (start_lat, start_lon), (end_lat, end_lon)
        via_ping = haversine(ping, start, Unit.METERS) + haversine(ping, end, Unit.METERS)
        direct = haversine(start, end, Unit.METERS)
        if via_ping <= tolerance * direct:
            return segment_id
    return None  # GPS ping does not belong to any stop segment

In [None]:
import osmnx as ox

G = ox.graph_from_point((13.172057, 77.633171), dist=1000, network_type='drive')

# nearest_edges takes X = longitude and Y = latitude, and it only reports a distance when
# return_dist=True; without it the third value of the returned (u, v, key) tuple is the
# edge key, not a distance.
# NOTE(review): G is unprojected, so the distance is presumably in degrees — project the
# graph first if metres are needed.
(org, dest, _edge_key), distance = ox.distance.nearest_edges(G, X=77.633171, Y=13.172057, return_dist=True)
# nodes_gdf, edges_gdf = ox.graph_to_gdfs(G)

# # Ensure that the 'osmid' column is present in the nodes_gdf dataframe
# if 'osmid' not in nodes_gdf.columns:
#     nodes_gdf['osmid'] = nodes_gdf.index.astype(str)

# org_coordinates = nodes_gdf[nodes_gdf['osmid'] == str(org)][['y', 'x']]
# dest_coordinates = nodes_gdf[nodes_gdf['osmid'] == str(dest)][['geometry', 'y', 'x']]


In [None]:
# Match every ping of gps_df_bus1 to its nearest drivable road edge and store the edge's
# two end nodes ('start', 'end') and the ping-to-edge distance ('distance').

# Independent copy, so the new columns are not written to a slice of gps_df
gps_df_bus1 = gps_df_bus1.copy()

# Download the road network once for the whole trace instead of once per ping. The graph
# is centred on the middle of the trace's bounding box and reaches 1000 m beyond its
# farthest corner, so it contains the 1000 m neighbourhood of every ping.
lat_min, lat_max = gps_df_bus1['LAT'].min(), gps_df_bus1['LAT'].max()
lon_min, lon_max = gps_df_bus1['LONGITUDE'].min(), gps_df_bus1['LONGITUDE'].max()
trace_center = ((lat_min + lat_max) / 2, (lon_min + lon_max) / 2)
search_distance_m = haversine(trace_center, (lat_max, lon_max), Unit.METERS) + 1000
G = ox.graph_from_point(trace_center, dist=search_distance_m, network_type='drive')

# nearest_edges takes X = longitude and Y = latitude. With return_dist=True it returns the
# (u, v, key) edges together with the distances; without it the third unpacked value
# would be the edge key rather than a distance.
# NOTE(review): G is unprojected, so the distances are presumably in degrees — project
# the graph first if metres are needed.
nearest_edge_list, nearest_edge_dist = ox.distance.nearest_edges(
    G,
    X=gps_df_bus1['LONGITUDE'].to_numpy(),
    Y=gps_df_bus1['LAT'].to_numpy(),
    return_dist=True,
)
gps_df_bus1['start'] = [edge[0] for edge in nearest_edge_list]
gps_df_bus1['end'] = [edge[1] for edge in nearest_edge_list]
gps_df_bus1['distance'] = list(nearest_edge_dist)



In [None]:
gps_df_bus1.distance.value_counts()

In [None]:
mergerd_busstops_routepoints.head(5)

In [None]:
import osmnx as ox
import networkx as nx
import matplotlib.pyplot as plt

# Nodes to highlight: the end nodes of the road edges matched to the pings, taken from
# the 'start' and 'end' columns of gps_df_bus1 (this replaces the template placeholders).
node_ids = pd.unique(gps_df_bus1[['start', 'end']].to_numpy().ravel()).tolist()

# Road network of the study area.
# NOTE(review): the place name is inferred from the map centre used elsewhere in this
# notebook (12.9716, 77.5946) — confirm it, and note that a city-wide download is large.
place_name = "Bengaluru, Karnataka, India"
G = ox.graph_from_place(place_name, network_type='drive')

# Create a subgraph using the specified nodes (ids missing from G are ignored)
subgraph = nx.subgraph(G, node_ids)

# Draw the full network first and the highlighted subgraph on top of it; the reverse
# order would paint the black network over the blue highlight.
fig, ax = ox.plot_graph(G, node_size=0, edge_color='k', bgcolor='w', show=False, close=False)
if subgraph.number_of_edges() > 0:
    ox.plot_graph(subgraph, node_size=0, edge_color='b', ax=ax, bgcolor='w', show=False, close=False)

# Display the plot
plt.show()


In [None]:
# Create a new column segment_id by calling find_stop_segment on each ping's LAT and
# LONGITUDE; pings that match no stop segment get a missing value (the function returns None).
gps_df_bus1['segment_id'] = gps_df_bus1.apply(lambda x: find_stop_segment(x['LAT'],x['LONGITUDE']),axis=1)
gps_df_bus1.head(100)

In [None]:
# Load the waybill table (the next cell keeps its ID, DEVICE_ID and ROUTE_ID columns).
# NOTE(review): this path uses "../Datasets/" while the other reads visible in this
# notebook use "../Dataset/" — confirm the directory name.
waybill_df = pd.read_csv(r"../Datasets/waybill.csv")
waybill_df.head()

In [None]:
# Keep only the identifier columns that link a device to a route
columns = ['ID','DEVICE_ID','ROUTE_ID']
waybill_df = waybill_df.loc[:, columns]

In [None]:
# Number of distinct waybill ROUTE_ID values that also occur as a route_id in the segment table
has_segments = waybill_df['ROUTE_ID'].isin(mergerd_busstops_routepoints['route_id'])
waybill_df.loc[has_segments, 'ROUTE_ID'].nunique()

In [None]:
# Distinct routes recorded in the waybills for DEVICE_ID == 150218177
is_bus1 = waybill_df['DEVICE_ID'] == 150218177
routes = waybill_df.loc[is_bus1, 'ROUTE_ID'].unique()

In [None]:
# Map the stop-to-stop segments of the routes in `routes` (red stop markers joined by
# blue lines) together with the pings of gps_df_bus1 (blue circles).
m = folium.Map(location=[12.9716,77.5946],zoom_start=12)

# Select the matching segments once instead of testing every row inside the loop
on_selected_routes = mergerd_busstops_routepoints['route_id'].isin(routes)
for _, row in mergerd_busstops_routepoints[on_selected_routes].iterrows():
    origin = [row['origin_lat'],row['origin_long']]
    destination = [row['destination_lat'],row['destination_long']]
    # folium.Marker ignores radius/color/fill (those are CircleMarker options), so the
    # red colour is set through the marker icon instead.
    folium.Marker(origin,icon=folium.Icon(color='red')).add_to(m)
    folium.PolyLine([origin,destination],color='blue').add_to(m)
    folium.Marker(destination,icon=folium.Icon(color='red')).add_to(m)

# No loop variable is called `index`, which would shadow the imported rtree `index` module
for lat, lon in zip(gps_df_bus1['LAT'], gps_df_bus1['LONGITUDE']):
    folium.CircleMarker([lat,lon],radius=0.01,color='blue',fill=True).add_to(m)
m.save('route.html')

In [None]:
routes

In [None]:
# Map the pings of gps_df_bus1 as blue circles and, in red, every stop segment that at
# least one of those pings was matched to (markers at both ends joined by a line).
m = folium.Map(location=[12.9716,77.5946],zoom_start=12)

# Build the set of matched segment ids once; calling unique() inside the loop repeated
# the same scan for every segment row. Pings without a match are dropped here.
matched_segment_ids = set(gps_df_bus1['segment_id'].dropna().unique())
for idex,row in mergerd_busstops_routepoints.iterrows():
    if idex in matched_segment_ids:
        folium.Marker([row['origin_lat'],row['origin_long']],popup=row['route_id'],icon=folium.Icon(color='red')).add_to(m)
        folium.Marker([row['destination_lat'],row['destination_long']],popup=row['route_id'],icon=folium.Icon(color='red')).add_to(m)
        folium.PolyLine([[row['origin_lat'],row['origin_long']],[row['destination_lat'],row['destination_long']]],color='red').add_to(m)

# No loop variable is called `index`, which would shadow the imported rtree `index` module
for lat, lon in zip(gps_df_bus1['LAT'], gps_df_bus1['LONGITUDE']):
    folium.CircleMarker([lat,lon],radius=0.01,color='blue',fill=True).add_to(m)

m.save('bus1.html')
    

In [None]:
# Get the stops corresponding to route_id == 1078, in travel order. They are sorted by
# route_order because the next cell joins consecutive rows with a line; unsorted rows
# would draw the path in file order rather than stop order.
route_points = stop_times_df.loc[stop_times_df['route_id'] == 1078, ['bus_stop_id', 'route_order']]
stops = route_points.sort_values(by='route_order')[['bus_stop_id']]
# Merge with stops_df on bus_stop_id to attach the coordinates (an inner merge keeps the
# order of the left-hand keys)
stops = pd.merge(stops,stops_df[['bus_stop_id','latitude_current','longitude_current']],on='bus_stop_id')

In [None]:
# Plot the stop_lat_long using Folium
import folium
m = folium.Map(location=[12.9716,77.5946],zoom_start=12)
lat_list = stops['latitude_current'].tolist()
long_list = stops['longitude_current'].tolist()
stop_points = [[lat, lon] for lat, lon in zip(lat_list, long_list)]

# One marker per stop. Looping over range(len - 1) left the final stop without a marker.
for stop_point in stop_points:
    folium.Marker(stop_point,icon=folium.Icon(color='blue')).add_to(m)

# A single line through all stops in order; it needs at least two points
if len(stop_points) > 1:
    folium.PolyLine(stop_points,color='red').add_to(m)
# Save the map as html file
m.save('stops_full.html')

In [None]:
# Plot the gps pings of DEVICE_ID == 150218012, one marker per ping with the device id as popup.
# NOTE(review): this comment previously named device 150211826, which is also the device
# counted in the next cell — confirm which device is intended here.
m = folium.Map(location=[12.9716,77.5946],zoom_start=12)
for index,row in gps_df.loc[gps_df['DEVICE_ID'] == 150218012,:].iterrows():
    folium.Marker([row['LAT'],row['LONGITUDE']],popup=row['DEVICE_ID']).add_to(m)

m.save('gps.html')

In [None]:
gps_df.loc[gps_df['DEVICE_ID'] == 150211826,:].count()

In [None]:
gps_df.DEVICE_ID.unique()