In [2]:
import json

input_file = "../Data/irctc/schedules.json"        # Your original file with errors
output_file = "../Data/irctc/schedules_array.json"   # New file to contain valid JSON objects
error_log_file = "error_log.txt"       # File to log errors

valid_objects = []
error_lines = []

# Open the input file and process each line
with open(input_file, "r", encoding="utf-8") as infile:
    for line_number, line in enumerate(infile, start=1):
        line = line.strip()
        if not line:
            continue  # skip blank lines
        try:
            obj = json.loads(line)
            valid_objects.append(obj)
        except json.JSONDecodeError as e:
            error_message = f"Line {line_number}: {e}"
            error_lines.append(error_message)
            # Optionally print the error message
            # print(error_message)

# Write valid objects into a JSON array
with open(output_file, "w", encoding="utf-8") as outfile:
    json.dump(valid_objects, outfile, indent=2)

# Optionally, write errors to a log file for further review
if error_lines:
    with open(error_log_file, "w", encoding="utf-8") as log_file:
        for error in error_lines:
            log_file.write(error + "\n")

# print(f"Processing complete. {len(valid_objects)} valid objects written to {output_file}.")
# if error_lines:
#     print(f"{len(error_lines)} errors logged to {error_log_file}.")


In [3]:
import json
import pandas as pd

# 1. Load the JSON array from your file.
with open('../Data/irctc/schedules.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# 2. Flatten the JSON: each row will be a stop from a train schedule.
df = pd.json_normalize(
    data,
    record_path=['stationList'],
    meta=['trainNumber',
          'trainRunsOnMon', 'trainRunsOnTue', 'trainRunsOnWed',
          'trainRunsOnThu', 'trainRunsOnFri', 'trainRunsOnSat',
          'trainRunsOnSun']
)

# 3. Convert run-day flags (which are "Y" or "N") to numeric values.
run_flags = ['trainRunsOnMon', 'trainRunsOnTue', 'trainRunsOnWed',
             'trainRunsOnThu', 'trainRunsOnFri', 'trainRunsOnSat', 'trainRunsOnSun']

for flag in run_flags:
    df[flag] = df[flag].map({'Y': 1, 'N': 0})

# 4. Calculate runs per week for each schedule (each row gets the same weekly value).
df['runsPerWeek'] = df[run_flags].sum(axis=1)

# 5. Group by station code and sum up the weekly stops.
busiest_stations = df.groupby('stationCode')['runsPerWeek'].sum().sort_values(ascending=False)

# 6. Display the top 10 busiest station codes.
print(busiest_stations.head(10))


stationCode
NDLS    1199.0
CNB     1132.0
BZA     1012.0
HWH      964.0
ET       960.0
KYN      933.0
ST       929.0
BRC      887.0
BSL      860.0
UMB      843.0
Name: runsPerWeek, dtype: float64


In [4]:
# Assuming you have already computed busiest_stations as follows:
# busiest_stations = df.groupby('stationCode')['runsPerWeek'].sum().sort_values(ascending=False)

# Save the Series to a CSV file
busiest_stations.to_csv("../Data/precomputes/weekly_traffic_stations.csv", header=["weeklyStops"])
