In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ast
import json

In [10]:
def extract_clean_data(request_entry):
    """Decode the doubly JSON-encoded "data" payload from one raw request cell.

    The expected layout, as read by the code below, is:

    * ``request_entry`` is the string repr of a Python list,
    * whose first element has a ``["body"]["body"]`` entry containing JSON text,
    * and that JSON object has a ``"data"`` key whose value is JSON text again.

    Parameters
    ----------
    request_entry : str
        Raw cell value, e.g. ``"[{'url': ..., 'body': {'body': '{...}'}}]"``.

    Returns
    -------
    object or None
        The decoded ``"data"`` payload, or ``None`` when the cell is missing,
        malformed, or does not follow the layout above.
    """
    try:
        # The cell stores the repr of a Python list, not JSON, hence literal_eval.
        request_list = ast.literal_eval(request_entry)

        # JSON text of the response body.
        body_content = request_list[0]["body"]["body"]
        body_dict = json.loads(body_content)

        # The "data" field is itself JSON text and needs a second decode.
        return json.loads(body_dict["data"])

    # TypeError covers wrong-typed levels (e.g. a None body, or "body" being a
    # plain string), so one odd row yields None instead of aborting .apply().
    except (SyntaxError, ValueError, KeyError, IndexError, TypeError):
        return None

def is_valid_bus_stop(cleaned_data):
    """Tell whether the first record of a decoded payload sits at distance <= 0.

    Parameters
    ----------
    cleaned_data : list[dict] or None
        Decoded payload; only ``cleaned_data[0]['distance']`` is inspected.

    Returns
    -------
    True or None
        ``True`` when the first record's distance converts to a float that is
        ``<= 0.0``. ``None`` in every other case (empty or missing payload,
        malformed first record, or a positive distance). ``None`` rather than
        ``False`` is kept so null-counting on the resulting column still works.
    """
    if not cleaned_data:
        return None

    try:
        distance = float(cleaned_data[0]['distance'])
    except (TypeError, KeyError, IndexError, ValueError):
        # Truthy but unusable input (e.g. NaN, a record without 'distance',
        # a non-numeric distance) is treated as "not a valid stop".
        return None

    return True if distance <= 0.0 else None

In [11]:
# Load the raw scraped requests into `df` and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path, so the cell will not run
# unchanged on another machine; consider a path relative to a configurable data directory.
file_path = "/Users/suraj/Library/CloudStorage/OneDrive-PlakshaUniversity/Classes/Sem4/ILGC04/ILGC04_Code/scarping_scripts/scarped_data/raw_request.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Name,Request
0,10/16 ART GALLERY,[{'url': 'https://ctumobileapi.amnex.com/Searc...
1,12/11 (Rally Chowk),[{'url': 'https://ctumobileapi.amnex.com/Searc...
2,16/10 ROSE GARDEN,[{'url': 'https://ctumobileapi.amnex.com/Searc...
3,17 chowk,[{'url': 'https://ctumobileapi.amnex.com/Searc...
4,39 Grain Market,[{'url': 'https://ctumobileapi.amnex.com/Searc...


In [12]:
# Apply extraction function to each row
df["Cleaned_Data"] = df["Request"].apply(extract_clean_data)
print(df[["Name", "Cleaned_Data"]].head())

                  Name                                       Cleaned_Data
0    10/16 ART GALLERY  [{'rowno': 1, 'routeid': 5470, 'stationid': 60...
1  12/11 (Rally Chowk)  [{'rowno': 1, 'routeid': 5570, 'stationid': 11...
2    16/10 ROSE GARDEN  [{'rowno': 1, 'routeid': 5472, 'stationid': 60...
3             17 chowk  [{'rowno': 1, 'routeid': 6999, 'stationid': 61...
4      39 Grain Market  [{'rowno': 1, 'routeid': 5470, 'stationid': 71...


In [13]:
df["Cleaned_Data"].iloc[0]

[{'rowno': 1,
  'routeid': 5470,
  'stationid': 605,
  'stationname': '10/16  ART GALLERY',
  'stationname_m': '10/16  ART GALLERY',
  'center_lat': 30.74824,
  'center_lon': 76.78451,
  'routenames': 'Towards ISBT 43',
  'routenames_m': 'Towards आई.एस.बी.टी. 43',
  'routeno': '4C',
  'distance': '0.00',
  'totalminute': '0',
  'combinestationroute': '10/16  ART GALLERY  : Towards ISBT 43',
  'totalstops': '10'},
 {'rowno': 2,
  'routeid': 5472,
  'stationid': 604,
  'stationname': '16/10  ROSE GARDEN',
  'stationname_m': '16/10 रोज़ गार्डन',
  'center_lat': 30.74819,
  'center_lon': 76.78425,
  'routenames': 'Towards Ram Darbar',
  'routenames_m': 'Towards राम दरबार',
  'routeno': '5A',
  'distance': '0.03',
  'totalminute': '0',
  'combinestationroute': '16/10  ROSE GARDEN  : Towards Ram Darbar',
  'totalstops': '10'},
 {'rowno': 3,
  'routeid': 5469,
  'stationid': 674,
  'stationname': 'Sector 17/16 near Matka Chowk',
  'stationname_m': 'सैक्टर 17/16',
  'center_lat': 30.7458,
  'ce

In [14]:
# Flag rows whose first record reports a distance <= 0 (is_valid_bus_stop returns True),
# then count the rows that were NOT flagged (the function returns None for those).
df["Is_Valid_Stop"] = df["Cleaned_Data"].apply(is_valid_bus_stop)
df["Is_Valid_Stop"].isnull().sum()

732

In [15]:
df.shape

(864, 4)

In [16]:
# Cross-check: count rows whose first record is not at distance 0, treating rows
# whose payload is missing or malformed as "not a bus stop" as well.
not_bus_stop_counter = 0

for cleaned_data in df["Cleaned_Data"]:
    try:
        dist = float(cleaned_data[0]['distance'])
    except (TypeError, KeyError, IndexError, ValueError):
        # Only data-shape problems are swallowed; a bare `except:` would also
        # hide KeyboardInterrupt/SystemExit and genuine programming errors.
        not_bus_stop_counter += 1
        continue

    if dist != 0:
        not_bus_stop_counter += 1

not_bus_stop_counter

732

In [17]:
def nearest_distance(cleaned_data):
    """Return the distance of the first record in a decoded payload as a float.

    Parameters
    ----------
    cleaned_data : list[dict] or None
        Decoded payload; only ``cleaned_data[0]['distance']`` is read.

    Returns
    -------
    float or None
        The converted distance, or ``None`` when the payload is missing, empty,
        lacks a ``'distance'`` entry, or the value is not numeric.
    """
    try:
        return float(cleaned_data[0]['distance'])
    # An explicit except is used instead of `return` inside `finally`: the latter
    # silently discards *every* in-flight exception, including KeyboardInterrupt.
    except (TypeError, KeyError, IndexError, ValueError):
        return None

# Store the distance reported by the first record of each payload (missing when
# nearest_distance could not read one) and show it next to the stop name.
df["Nearest_Distance"] = df["Cleaned_Data"].apply(nearest_distance)
df[["Name", "Nearest_Distance"]]

Unnamed: 0,Name,Nearest_Distance
0,10/16 ART GALLERY,0.00
1,12/11 (Rally Chowk),0.00
2,16/10 ROSE GARDEN,0.00
3,17 chowk,0.00
4,39 Grain Market,0.00
...,...,...
859,YPS/Sector 61/52,1.47
860,Zirakpur Bus Stand,3.61
861,Zirakpur (Down),3.55
862,Zirakpur/S/Vihar,3.71


In [21]:
# Number of rows whose nearest distance is not exactly 0. NaN != 0 evaluates to True,
# so rows with a missing distance are included in this count as well.
df[df["Nearest_Distance"] != 0].shape[0]

732

In [22]:
df.columns

Index(['Name', 'Request', 'Cleaned_Data', 'Is_Valid_Stop', 'Nearest_Distance'], dtype='object')

In [23]:
df["Is_Valid_Stop"].isna().sum()

732

In [24]:
def extract_coords(row):
    """Return ``(center_lat, center_lon)`` of a row's first record, or ``None``.

    ``row`` is indexed by the keys "Is_Valid_Stop" and "Cleaned_Data". A tuple is
    returned only when "Is_Valid_Stop" is truthy and "Cleaned_Data" is a
    non-empty list; otherwise the result is ``None``.
    """
    # Rows not flagged as valid stops never contribute coordinates.
    if not row["Is_Valid_Stop"]:
        return None

    records = row["Cleaned_Data"]
    if not isinstance(records, list) or len(records) == 0:
        return None

    nearest = records[0]
    return (nearest["center_lat"], nearest["center_lon"])

# Apply extract_coords row-wise (axis=1) to create the "Coords" column:
# a (lat, lon) tuple where the function finds one, None otherwise.
df["Coords"] = df.apply(extract_coords, axis=1)

In [25]:
df["Coords"].isna().sum()

732

In [26]:
# Split the (lat, lon) tuples into two separate columns; rows whose "Coords" is
# empty/None get None instead.
df["lat"] = df["Coords"].apply(lambda coords: coords[0] if coords else None)
df["long"] = df["Coords"].apply(lambda coords: coords[1] if coords else None)

In [27]:
# Persist the enriched frame to CSV.
# NOTE(review): with the default index=True the row index is written as an extra first
# column, and the list-valued "Cleaned_Data" column is stored as text, so it has to be
# re-parsed (e.g. with ast.literal_eval) after the file is read back.
df.to_csv("/Users/suraj/Library/CloudStorage/OneDrive-PlakshaUniversity/Classes/Sem4/ILGC04/ILGC04_Code/scarping_scripts/scarped_data/cleaned_bus_stops_locations.csv")

^ The problem with this approach is that some stops get clubbed together: for example, Sector 21/22 Aroma and 22/21 Aroma are both clubbed as Aroma in the 0th row of the request.

I'll try to go through every row instead, and whenever I see a bus stop I haven't seen before, I'll add it to the table.

In [28]:
# Column-oriented accumulator for the stop table: one independent, initially
# empty list per column.
bus_stops = {column: [] for column in ("name", "lat", "long")}

In [30]:
# Reload the frame from the CSV file, rebinding `df` (the in-memory frame is discarded).
# After this round trip "Cleaned_Data" holds text rather than Python lists, which is why
# later cells parse it with ast.literal_eval.
df = pd.read_csv("/Users/suraj/Library/CloudStorage/OneDrive-PlakshaUniversity/Classes/Sem4/ILGC04/ILGC04_Code/scarping_scripts/scarped_data/cleaned_bus_stops_locations.csv")

In [37]:
ast.literal_eval(df["Cleaned_Data"].iloc[0])[0]

{'rowno': 1,
 'routeid': 5470,
 'stationid': 605,
 'stationname': '10/16  ART GALLERY',
 'stationname_m': '10/16  ART GALLERY',
 'center_lat': 30.74824,
 'center_lon': 76.78451,
 'routenames': 'Towards ISBT 43',
 'routenames_m': 'Towards आई.एस.बी.टी. 43',
 'routeno': '4C',
 'distance': '0.00',
 'totalminute': '0',
 'combinestationroute': '10/16  ART GALLERY  : Towards ISBT 43',
 'totalstops': '10'}

In [41]:
# Walk every record of every request and add each station name not seen before,
# together with its coordinates, to `bus_stops`.
# The set gives O(1) membership tests; seeding it from the names already collected
# keeps the cell idempotent when it is re-run.
seen_names = set(bus_stops['name'])

for cleaned_request in df["Cleaned_Data"]:
    try:
        # The column holds the string repr of a list of dicts.
        records = ast.literal_eval(cleaned_request)
        for row in records:
            name = row['stationname']
            if name in seen_names:
                continue
            # Read both coordinates BEFORE appending anything, so a record with a
            # missing field can never leave the three lists with different lengths.
            lat, lon = row['center_lat'], row['center_lon']
            seen_names.add(name)
            bus_stops['name'].append(name)
            bus_stops['lat'].append(lat)
            bus_stops['long'].append(lon)
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Unparseable cells (e.g. NaN for rows without a payload) or malformed
        # records: report the raw cell and move on. Unlike a bare `except:`,
        # this lets KeyboardInterrupt and unrelated errors surface.
        print(cleaned_request)

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


In [45]:
print(len(bus_stops["name"]))
print(len(bus_stops["lat"]))
print(len(bus_stops["long"]))

530
530
530


In [46]:
# Turn the accumulated name/lat/long lists into a DataFrame and write it to CSV.
# NOTE(review): with the default index=True the row index is written as an extra,
# unnamed first column; the output path is absolute and machine-specific.
bus_stops_df = pd.DataFrame(bus_stops)
bus_stops_df.to_csv("/Users/suraj/Library/CloudStorage/OneDrive-PlakshaUniversity/Classes/Sem4/ILGC04/ILGC04_Code/scarping_scripts/scarped_data/final_locations.csv")