In [96]:
import pandas as pd
import matplotlib.pyplot as plt
from haversine import haversine_vector, Unit, haversine
import osmnx as ox
import pickle
import os
from tqdm import tqdm
import folium
from rtree import index

In [None]:
stops_df = pd.read_csv(r"../Dataset/bus_stop.csv")

In [None]:
gps_df = pd.read_csv(r"../Dataset/GPS.csv")

In [None]:
print(len(gps_df))
gps_df.head()

In [None]:
# Drop duplicate based on DEVICE_ID, IST_DATE, LAT and LONGITUDE
gps_df = gps_df.drop_duplicates(subset=['DEVICE_ID','IST_DATE','LAT','LONGITUDE'])

In [None]:
# Filter the rows which LAT and LONGITUDE are between 12 and 14 and 77 and 79
stops_df = stops_df[(stops_df['latitude_current'] > 12.8265) & (stops_df['latitude_current'] < 13.2694) & (stops_df['longitude_current'] > 77.3740) & (stops_df['longitude_current'] < 77.8313)]
gps_df = gps_df[(gps_df['LAT'] > 12) & (gps_df['LAT'] < 13.2694) & (gps_df['LONGITUDE'] > 77.3740) & (gps_df['LONGITUDE'] < 77.8313)]
print(len(gps_df))
gps_df.head()

In [None]:
# Plot the data point based on year, Day and date
gps_df['IST_DATE'] = pd.to_datetime(gps_df['IST_DATE'])
gps_df['YEAR'] = gps_df['IST_DATE'].dt.year
gps_df['DAY'] = gps_df['IST_DATE'].dt.dayofweek
gps_df['DATE'] = gps_df['IST_DATE'].dt.date
gps_df['TIME'] = gps_df['IST_DATE'].dt.time

In [None]:
gps_df['DATE'].value_counts()

In [None]:
# Remove the data points which are not in 2019
gps_df = gps_df[gps_df['YEAR'] == 2019]

In [None]:
# Sort the dataframe by DEVICE_ID and then by IST_DATE
gps_df = gps_df.sort_values(by=['DEVICE_ID','IST_DATE'])
gps_df.head()

In [None]:
# Shift the rows by 1 and get the previous row's LAT and LONGITUDE and IST_DATE
gps_df['PREV_LAT'] = gps_df.groupby(['DEVICE_ID'])['LAT'].shift(1)
gps_df['PREV_LONGITUDE'] = gps_df.groupby(['DEVICE_ID'])['LONGITUDE'].shift(1)
gps_df['PREV_IST_DATE'] = gps_df.groupby(['DEVICE_ID'])['IST_DATE'].shift(1)
gps_df.head()

In [None]:
# Drop the na values
gps_df = gps_df.dropna()

In [None]:
# Calculate the distance between the current row and the previous row using haversine formula
gps_df['DISTANCE'] = haversine_vector(gps_df[['LAT','LONGITUDE']],gps_df[['PREV_LAT','PREV_LONGITUDE']],Unit.METERS)

In [None]:
# Calculate the time difference between the current row and the previous row
gps_df['TIME_DIFF'] = (pd.to_datetime(gps_df['IST_DATE']) - pd.to_datetime(gps_df['PREV_IST_DATE'])).dt.total_seconds()

In [None]:
print(len(gps_df))
gps_df.head()

In [None]:
# PLot the distance and time difference
plt.scatter(gps_df['DISTANCE'],gps_df['TIME_DIFF'])
plt.xlabel('Distance')
plt.ylabel('Time Difference')
plt.show()

In [None]:
gps_df['SPEED'] = gps_df['DISTANCE'] / (gps_df['TIME_DIFF'] +0.0001) *18/5

# Plot the scatter plot speed with respect index
plt.scatter(gps_df.index,gps_df['SPEED'])
plt.ylim(0,200)
plt.xlabel('Index')
plt.ylabel('Speed')
plt.show()

In [None]:
# Value counts of speed between 0 and 1
gps_df[(gps_df['SPEED'] > 0) & (gps_df['SPEED'] < 1)]['SPEED'].count()

In [None]:
gps_df['SPEED'].min()

In [None]:
gps_df_copy = gps_df.copy()

In [None]:
gps_df = gps_df_copy.copy()
gps_df = gps_df.loc[(gps_df["TIME_DIFF"] < 70) & (gps_df["SPEED"] < 100),:]

In [None]:
# reset the index
gps_df = gps_df.reset_index(drop=True)

# Add column stop_time and initialize it with 0
gps_df['STOP_TIME'] = 0

# Take the dataframe with speed 0
gps_df0 = gps_df[gps_df['SPEED'] <= 1]

# # # Iterate over the dataframe with speed 0 and add the time_diff with the previous rows STOP_TIME to the current row STOP_TIME
# prev_index = 0
# for index,row in tqdm(gps_df0.iterrows(),total=len(gps_df0)):
#     if index > 0 and row['DEVICE_ID'] == gps_df0.loc[prev_index,'DEVICE_ID']:
#         gps_df.loc[index,'STOP_TIME'] = gps_df.loc[index-1,'STOP_TIME'] + row['TIME_DIFF']
#     else:
#         gps_df.loc[index,'STOP_TIME'] = row['TIME_DIFF']
#     prev_index = index
    


In [169]:
gps_df = pd.read_csv(r"../Dataset/gps_point1.csv")

In [170]:
gps_df_copy = gps_df.copy()

In [171]:
stop_time = 120
gps_df = gps_df_copy.copy()
st = gps_df.loc[gps_df['STOP_TIME'] > stop_time,['LAT','LONGITUDE']].groupby(['LAT','LONGITUDE']).count()
st = st.reset_index()
len(st)

530190

In [172]:
# Filter the gps_df with respect to STOP_TIME less than 600 and LAT and LONGITUDE are not in st
gps_df = gps_df_copy.copy()
gps_df = gps_df[gps_df['STOP_TIME'] < stop_time]

gps_df = gps_df[~((gps_df['LAT'].isin(st['LAT'])) & (gps_df['LONGITUDE'].isin(st['LONGITUDE'])) & (gps_df['SPEED'] < 10))]

len(gps_df)

6177166

In [173]:
gps_df_copy = gps_df.copy()

In [176]:
# Filter the gps_df with repect to DEVICE_ID where total distance greater than 10000
gps_df = gps_df_copy.copy()
gps_df = gps_df.groupby(['DEVICE_ID']).filter(lambda x: x['DISTANCE'].sum() > 50000)
len(gps_df)

6133697

In [178]:
gps_df.DEVICE_ID.value_counts().mean()

2382.9436674436674

In [181]:
# Filter the gps_df with repect to DEVICE_ID where number of pings greater than 1000
gps_df2 = gps_df.groupby(['DEVICE_ID']).filter(lambda x: len(x) > 2000)
len(gps_df2)

4857906

In [130]:
gps_df_bus1 = gps_df.loc[gps_df['DEVICE_ID'] == 150218177,:]
len(gps_df_bus1)

3876

In [None]:
# Create a map for each Device ID and plot the gps pings as circular markers
for device_id in gps_df['DEVICE_ID'].unique()[:5]:
    gps_df_bus1 = gps_df.loc[gps_df['DEVICE_ID'] == device_id,:]
    m = folium.Map(location=[12.9716,77.5946],zoom_start=12)
    for index,row in gps_df_bus1.iterrows():
        folium.CircleMarker([row['LAT'],row['LONGITUDE']],radius=0.01,color='blue',fill=True).add_to(m)
    m.save(f'./maps/bus_{device_id}.html')

In [None]:
# Filter the gps data with respect to speed less than 60
gps_df = gps_df[gps_df['SPEED'] < 120]
print(gps_df.SPEED.mean())

In [None]:
gps_df_bus1_copy = gps_df_bus1.copy()

In [None]:
gps_df_bus1 = gps_df.loc[gps_df['DEVICE_ID'] == 150218177,:]
gps_df_bus1.head(100)

In [108]:
G = ox.graph_from_bbox(12.8265, 13.2694, 77.3740, 77.8313, network_type='drive')

# edges from G
edges = ox.graph_to_gdfs(G, nodes=False, edges=True)

In [187]:
edges_df = pd.DataFrame(edges)

In [188]:
# edges_df reset index
edges_df = edges_df.reset_index(drop=False)
# Filter the edges_df based on length
edges_df = edges_df[(edges_df['length'] > 20) & (edges_df['length'] < 200)]

# Consider only the columns osmid, highway, length, geometry,reversed, oneway, u,v
edges_df = edges_df[['osmid','highway','length','geometry','reversed','oneway','u','v']]

# Create lat_u, long_u, lat_v, long_v columns and assign from geometry
edges_df['lat_u'] = edges_df['geometry'].apply(lambda x: x.coords[0][1])
edges_df['long_u'] = edges_df['geometry'].apply(lambda x: x.coords[0][0])
edges_df['lat_v'] = edges_df['geometry'].apply(lambda x: x.coords[-1][1])
edges_df['long_v'] = edges_df['geometry'].apply(lambda x: x.coords[-1][0])

In [189]:
edges_df.head(5)

Unnamed: 0,osmid,highway,length,geometry,reversed,oneway,u,v,lat_u,long_u,lat_v,long_v
2,1166304119,secondary,29.724,"LINESTRING (77.59872 12.91054, 77.59899 12.91056)",False,True,17327095,305154531,12.910542,77.598721,12.91056,77.598994
3,45443685,residential,122.135,"LINESTRING (77.57933 12.98596, 77.57841 12.98659)",False,False,17327433,578121591,12.985958,77.57933,12.986587,77.578405
4,148319042,tertiary,33.303,"LINESTRING (77.57933 12.98596, 77.57951 12.98620)",False,False,17327433,10279595643,12.985958,77.57933,12.986202,77.579507
5,148319042,tertiary,84.413,"LINESTRING (77.57933 12.98596, 77.57921 12.985...",True,False,17327433,429286716,12.985958,77.57933,12.985321,77.578905
6,23029696,secondary,107.589,"LINESTRING (77.60480 12.97090, 77.60476 12.970...",False,True,60952254,423784673,12.970899,77.6048,12.971792,77.604417


In [190]:
from rtree import index
# Create an R-tree index
idx = index.Index()
 
# Populate the R-tree index with stop segments
for i, row in tqdm(edges_df.iterrows(),total=len(edges_df)):
    start_lat, start_lon, end_lat, end_lon = row['lat_u'], row['long_u'], row['lat_v'], row['long_v']
    idx.insert(i, (min(start_lat, end_lat), min(start_lon, end_lon), max(start_lat, end_lat), max(start_lon, end_lon)))

100%|██████████| 479348/479348 [01:14<00:00, 6445.68it/s]


In [191]:
# Function to find the stop segment for a given GPS ping
def find_stop_segment(gps_lat, gps_lon):
    for segment_id in idx.intersection((gps_lat, gps_lon, gps_lat, gps_lon)):
        start_lat, start_lon, end_lat, end_lon = edges_df.loc[segment_id, ['lat_u', 'long_u', 'lat_v', 'long_v']]
        if min(start_lat, end_lat) <= gps_lat <= max(start_lat, end_lat) and min(start_lon, end_lon) <= gps_lon <= max(start_lon, end_lon):
            l1 = haversine((gps_lat, gps_lon), (start_lat, start_lon), Unit.METERS)
            l2 = haversine((gps_lat, gps_lon), (end_lat, end_lon), Unit.METERS)
            l3 = haversine((start_lat, start_lon), (end_lat, end_lon), Unit.METERS)
            if l1 + l2 <= 1.02 * l3:
                return segment_id
    return None  # GPS ping does not belong to any stop segment

In [192]:
# create a new column segment id and find it using find_stop_segment function using tqdm
print("Mapping to nearest segment...")
length = len(gps_df)
segment_list = []
for index, row in tqdm(gps_df.iterrows(),total=length):
    segment_list.append(find_stop_segment(row['LAT'], row['LONGITUDE']))

Mapping to nearest segment...


100%|██████████| 6133697/6133697 [39:49<00:00, 2567.35it/s]


In [193]:
gps_df['segment_id'] = segment_list

In [195]:
# Save the gps_df to csv
gps_df.to_csv(r"../Dataset/gps_point2.csv",index=False)

In [196]:
# Save the edges_df to csv
edges_df.to_csv(r"../Dataset/edges.csv",index=False)

In [197]:
gps_df.segment_id.notna().sum()

2002917

In [None]:
gps_df.segment_id.value_counts().min()

In [199]:
gps_df_1 = gps_df.loc[gps_df.segment_id.notna(),:]
segments = gps_df_1.segment_id.unique().tolist()

segments_df = edges_df.loc[edges_df.index.isin(segments),['osmid','lat_u','long_u','lat_v','long_v']]
# create folium map and plot the gps pings as circular markers and segments as lines and markers
m = folium.Map(location=[12.9716,77.5946],zoom_start=12)
for index,row in tqdm(gps_df_1.iterrows()):
    folium.CircleMarker([row['LAT'],row['LONGITUDE']],radius=0.01,color='blue',fill=True).add_to(m)
for index,row in tqdm(segments_df.iterrows()):
    folium.PolyLine([[row['lat_u'],row['long_u']],[row['lat_v'],row['long_v']]],color='red').add_to(m)
    folium.Marker([row['lat_u'],row['long_u']],icon=folium.Icon(color='red')).add_to(m)
    folium.Marker([row['lat_v'],row['long_v']],icon=folium.Icon(color='red')).add_to(m)
m.save('bus1.html')

2002917it [02:51, 11710.84it/s]
31434it [00:06, 4562.48it/s]


In [149]:
len(gps_df_bus1)

3876

In [157]:
gps_df_bus2 = gps_df_bus1.loc[gps_df_bus1.segment_id.notna(),:]
segments = gps_df_bus2.segment_id.unique().tolist()

segments_df = edges_df.loc[edges_df.index.isin(segments),['osmid','lat_u','long_u','lat_v','long_v']]
# create folium map and plot the gps pings as circular markers and segments as lines and markers
m = folium.Map(location=[12.9716,77.5946],zoom_start=12)
for index,row in gps_df_bus2.iterrows():
    folium.CircleMarker([row['LAT'],row['LONGITUDE']],radius=0.01,color='blue',fill=True).add_to(m)
for index,row in segments_df.iterrows():
    folium.PolyLine([[row['lat_u'],row['long_u']],[row['lat_v'],row['long_v']]],color='red').add_to(m)
    folium.Marker([row['lat_u'],row['long_u']],icon=folium.Icon(color='red')).add_to(m)
    folium.Marker([row['lat_v'],row['long_v']],icon=folium.Icon(color='red')).add_to(m)
m.save('bus1.html')

In [154]:
gps_df_bus1[gps_df_bus1['segment_id'].isna()].SPEED.value_counts()

SPEED
0.000000     55
0.080060      4
0.040030      4
23.737688     2
0.560418      2
             ..
39.293305     1
27.122865     1
11.618949     1
27.401550     1
84.389435     1
Name: count, Length: 2079, dtype: int64