In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re

In [2]:
def get_data(url):
    r = requests.get(url)
    return r.text

In [3]:
url = "https://www.confirmtkt.com/train-running-status/14722"
html_data = get_data(url)
soup= BeautifulSoup(html_data, 'html.parser')

print(soup)

<!DOCTYPE html>

<html lang="en">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="index, follow" name="robots">
<title> 14722 Train Running Status | Spot Your Train 14722 | Live Train Status 14722</title>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="14722 Train Running Status or Live Train Status of ABS JU EXPRESS                                  and Spot Your Train ABS JU EXPRESS                                  accurately in Indian Railways" name="description"/>
<meta content="train running status 14722,spot your train 14722, live train status ,train status14722" name="keywords"/>
<link href="/favicon.ico" rel="icon"/>
<link href="/manifest.json" rel="manifest"/>
<link href="https://www.confirmtkt.com/train-running-status/14722" rel="canonical">
<link href="android-app://com.confirmtkt.lite/http/confirmtkt.com/train-running-status/14722" rel="alternate">
<link hre

In [4]:
import requests
from bs4 import BeautifulSoup
import json
import re
from datetime import datetime, timedelta

def get_data(url):
    r = requests.get(url)
    return r.text

def extract_train_data(html_data):
    soup = BeautifulSoup(html_data, 'html.parser')
    
    # Extracting the script containing the data
    script_tag = soup.find('script', string=re.compile("var data ="))
    if not script_tag:
        return None
    
    script_content = script_tag.string
    
    # Extracting JSON-like data using regex
    pattern = re.compile(r"var data = ({.*?});", re.DOTALL)
    match = pattern.search(script_content)
    if not match:
        return None
    
    json_data = match.group(1)
    
    # Parsing JSON data
    data = json.loads(json_data)
    return data

def transform_data(data):
    # Example: Fetch current date for start date and notification date
    current_date = datetime.now()
    train_start_date = current_date.strftime("%Y-%m-%d")
    notification_date = (current_date + timedelta(days=1)).strftime("%Y-%m-%d")

    # Extracting schedule times
    std = extract_schedule_time(data.get("Schedule", []), "DepartureTime")
    update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S +0530")
    
    # Calculate total distance and current station details
    total_distance = calculate_total_distance(data.get("Schedule", []))
    current_station_info = extract_current_station_info(data.get("Schedule", []))
    
    transformed_data = {
        "success": True,
        "user_id": 0,
        "train_number": data.get("TrainNumberString", ""),
        "train_name": data.get("TrainName", ""),
        "gps_unable": True,
        "train_start_date": train_start_date,
        "notification_date": notification_date,
        "at_src_dstn": False,
        "at_src": False,
        "at_dstn": False,
        "is_run_day": True,
        "source": data.get("SourceCode", ""),
        "destination": data.get("DestinationCode", ""),
        "run_days": ",".join([day[:3].upper() for day, run in data.get("DaysOfRun", {}).items() if run]),
        "journey_time": calculate_journey_time(data.get("TotalDuration", "")),
        "std": std,
        "data_from": "usergps",
        "new_alert_id": 0,
        "new_alert_msg": "",
        "diverted_stations": None,
        "instance_alert": 0,
        "related_alert": 0,
        "late_update": False,
        "is_ry_eta": True,
        "update_time": update_time,
        "distance_from_source": 0,
        "total_distance": total_distance,
        "avg_speed": 0,
        "si_no": 213,
        "current_station_code": current_station_info.get("current_station_code", ""),
        "current_station_name": current_station_info.get("current_station_name", ""),
        "status": "T",
        "eta": current_station_info.get("eta", ""),
        "etd": current_station_info.get("etd", ""),
        "delay": current_station_info.get("delay", 0),
        "ahead_distance": current_station_info.get("ahead_distance", 0),
        "ahead_distance_text": f"{current_station_info.get('ahead_distance', 0)} km ahead",
        "status_as_of": "As of 1 min ago",
        "platform_number": current_station_info.get("platform_number", 0),
        "cur_stn_sta": "21:22",
        "cur_stn_std": "21:22",
        "stoppage_number": 0,
        "a_day": 1,
        "status_as_of_min": 1,
        "dfp_carousel": {},
        "upcoming_stations": transform_stations(data.get("Schedule", []), current_station_info["current_station_code"], upcoming=True),
        "previous_stations": transform_stations(data.get("Schedule", []), current_station_info["current_station_code"], upcoming=False),
        "current_location_info": get_current_location_info(current_station_info, data.get("Schedule", []))
    }
    return transformed_data

def calculate_journey_time(duration_str):
    if not duration_str:
        return 0
    parts = duration_str.split(':')
    return int(parts[0]) * 60 + int(parts[1])

def calculate_total_distance(schedule):
    if not schedule:
        return 0
    return int(float(schedule[-1].get("Distance", "0") or "0"))

def extract_schedule_time(schedule, time_type):
    if not schedule:
        return ""
    for stop in schedule:
        if stop.get(time_type):
            return stop.get(time_type)
    return ""

def extract_current_station_info(schedule):
    if not schedule:
        return {"current_station_code": "", "current_station_name": "", "eta": "", "etd": "", "platform_number": 0}
    
    now = datetime.now()
    current_station = None

    for stop in schedule:
        arrival_time_str = stop.get("ArrivalTime", "")
        if arrival_time_str:
            arrival_time = datetime.strptime(arrival_time_str, "%H:%M")
            if arrival_time.time() <= now.time():
                current_station = stop
            else:
                break
    
    if current_station is None:
        current_station = schedule[-1]

    delay = calculate_delay(current_station, now)
    ahead_distance = calculate_ahead_distance(schedule, current_station)

    return {
        "current_station_code": current_station.get("StationCode", ""),
        "current_station_name": current_station.get("StationName", ""),
        "eta": current_station.get("ArrivalTime", ""),
        "etd": current_station.get("DepartureTime", ""),
        "platform_number": current_station.get("ExpectedPlatformNo", 0),
        "delay": delay,
        "ahead_distance": ahead_distance
    }

def calculate_delay(current_station, current_time):
    arrival_time_str = current_station.get("ArrivalTime", "")
    if not arrival_time_str:
        return 0
    arrival_time = datetime.strptime(arrival_time_str, "%H:%M")
    delay = (current_time - arrival_time.replace(year=current_time.year, month=current_time.month, day=current_time.day)).total_seconds() // 60
    return int(delay)

def calculate_ahead_distance(schedule, current_station):
    current_distance = int(float(current_station.get("Distance", "0") or "0"))
    total_distance = int(float(schedule[-1].get("Distance", "0") or "0"))
    return total_distance - current_distance

def transform_stations(schedule, current_station_code, upcoming=True):
    stations = []
    current_distance = 0
    
    # Find the current station distance
    for stop in schedule:
        if stop.get("StationCode") == current_station_code:
            current_distance = int(float(stop.get("Distance", "0") or "0"))
            break
    
    si_no = 0
    for stop in schedule:
        si_no += 1
        station_distance = int(float(stop.get("Distance", "0") or "0"))
        distance_from_current_station = abs(station_distance - current_distance)
        station = {
            "si_no": si_no,
            "station_code": stop.get("StationCode", ""),
            "station_name": stop.get("StationName", ""),
            "distance_from_source": station_distance,
            "distance_from_current_station": distance_from_current_station,
            "distance_from_current_station_txt": f"{distance_from_current_station} km",
            "sta": stop.get("ArrivalTime", ""),
            "eta": stop.get("ArrivalTime", ""),
            "etd": stop.get("DepartureTime", ""),
            "halt": int(stop.get("HaltMinutes", "0m").replace("m", "") or 0),
            "a_day": stop.get("Day", 1),
            "arrival_delay": 0,  # Calculate if needed
            "platform_number": stop.get("ExpectedPlatformNo", 0),
            "on_time_rating": 0,  # Calculate if needed
            "station_lat": stop.get("Latitude", 0),
            "station_lng": stop.get("Longitude", 0),
            "non_stops": ["Array"]
        }
        if upcoming:
            stations.append(station)
        else:
            stations.insert(0, station)
    return stations

def get_current_location_info(current_station_info, schedule):
    current_station_name = current_station_info.get("current_station_name", "")
    eta = current_station_info.get("eta", "")
    delay = current_station_info.get("delay", 0)
    ahead_distance = current_station_info.get("ahead_distance", 0)
    total_distance = calculate_total_distance(schedule)
    distance_from_source = total_distance - ahead_distance

    # Ensure delay is non-negative
    delay_hours, delay_minutes = divmod(max(delay, 0), 60)

    current_location_info = [
        {
            "type": 1,
            "deeplink": "",
            "img_url": "",
            "label": "As of 1 min ago",
            "message": f"Crossed {current_station_name}. at {eta}",
            "readable_message": f"Crossed {current_station_name.lower()} at {eta}",
            "hint": f"Delay {delay_hours}h:{delay_minutes}m"
        },
        {
            "type": 2,
            "deeplink": "",
            "img_url": "",
            "label": "As of 1 min ago",
            "message": f"{ahead_distance} kms to {current_station_name}",
            "readable_message": f"{ahead_distance} kilometer to {current_station_name.lower()}",
            "hint": f"{distance_from_source} kms Covered so far"
        }
    ]

    # Check for multiple types or conditions
    for stop in schedule:
        if stop.get("StationCode") == current_station_info.get("current_station_code"):
            stop_type = stop.get("StopType", "")
            if stop_type:
                current_location_info.append({
                    "type": 3,
                    "deeplink": "",
                    "img_url": "",
                    "label": f"Special stop - {stop_type}",
                    "message": f"Special stop at {stop.get('StationName')} ({stop.get('StationCode')})",
                    "readable_message": f"Special stop at {stop.get('StationName').lower()} ({stop.get('StationCode').lower()})",
                    "hint": f"Distance from source: {stop.get('Distance', '0')} km"
                })

    return current_location_info

url = "https://www.confirmtkt.com/train-running-status/12953"
html_data = get_data(url)
train_data = extract_train_data(html_data)

if train_data:
    transformed_data = transform_data(train_data)
    print(json.dumps(transformed_data, indent=2))
else:
    print("Failed to extract train data.")


{
  "success": true,
  "user_id": 0,
  "train_number": "12953",
  "train_name": "AK TEJAS RAJ EX                              ",
  "gps_unable": true,
  "train_start_date": "2024-07-09",
  "notification_date": "2024-07-10",
  "at_src_dstn": false,
  "at_src": false,
  "at_dstn": false,
  "is_run_day": true,
  "source": "MMCT",
  "destination": "NZM",
  "run_days": "SUN,MON,TUE,WED,THU,FRI,SAT",
  "journey_time": 993,
  "std": "17:10",
  "data_from": "usergps",
  "new_alert_id": 0,
  "new_alert_msg": "",
  "diverted_stations": null,
  "instance_alert": 0,
  "related_alert": 0,
  "late_update": false,
  "is_ry_eta": true,
  "update_time": "2024-07-09 17:31:44 +0530",
  "distance_from_source": 0,
  "total_distance": 1378,
  "avg_speed": 0,
  "si_no": 213,
  "current_station_code": "NZM",
  "current_station_name": "Hazrat Nizamuddin",
  "status": "T",
  "eta": "09:43",
  "etd": "",
  "delay": 468,
  "ahead_distance": 0,
  "ahead_distance_text": "0 km ahead",
  "status_as_of": "As of 1 min 

In [7]:
import requests
from bs4 import BeautifulSoup
import json
import re
from datetime import datetime, timedelta

def get_data(train_number, date):
    url = f"https://www.confirmtkt.com/train-running-status/{train_number}?date={date}"
    r = requests.get(url)
    return r.text

def extract_train_data(html_data):
    soup = BeautifulSoup(html_data, 'html.parser')
    
    # Extracting the script containing the data
    script_tag = soup.find('script', string=re.compile("var data ="))
    if not script_tag:
        return None
    
    script_content = script_tag.string
    
    # Extracting JSON-like data using regex
    pattern = re.compile(r"var data = ({.*?});", re.DOTALL)
    match = pattern.search(script_content)
    if not match:
        return None
    
    json_data = match.group(1)
    
    # Parsing JSON data
    data = json.loads(json_data)
    return data

def transform_data(data, provided_date):
    # Convert provided_date to datetime object
    provided_date = datetime.strptime(provided_date, "%Y-%m-%d")
    train_start_date = provided_date.strftime("%Y-%m-%d")
    notification_date = (provided_date + timedelta(days=1)).strftime("%Y-%m-%d")

    # Extracting schedule times
    std = extract_schedule_time(data.get("Schedule", []), "DepartureTime")
    update_time = provided_date.strftime("%Y-%m-%d %H:%M:%S +0530")
    
    # Calculate total distance and current station details
    total_distance = calculate_total_distance(data.get("Schedule", []))
    current_station_info = extract_current_station_info(data.get("Schedule", []), provided_date)
    
    transformed_data = {
        "success": True,
        "user_id": 0,
        "train_number": data.get("TrainNumberString", ""),
        "train_name": data.get("TrainName", ""),
        "gps_unable": True,
        "train_start_date": train_start_date,
        "notification_date": notification_date,
        "at_src_dstn": False,
        "at_src": False,
        "at_dstn": False,
        "is_run_day": True,
        "source": data.get("SourceCode", ""),
        "destination": data.get("DestinationCode", ""),
        "run_days": ",".join([day[:3].upper() for day, run in data.get("DaysOfRun", {}).items() if run]),
        "journey_time": calculate_journey_time(data.get("TotalDuration", "")),
        "std": std,
        "data_from": "usergps",
        "new_alert_id": 0,
        "new_alert_msg": "",
        "diverted_stations": None,
        "instance_alert": 0,
        "related_alert": 0,
        "late_update": False,
        "is_ry_eta": True,
        "update_time": update_time,
        "distance_from_source": 0,
        "total_distance": total_distance,
        "avg_speed": 0,
        "si_no": 213,
        "current_station_code": current_station_info.get("current_station_code", ""),
        "current_station_name": current_station_info.get("current_station_name", ""),
        "status": "T",
        "eta": current_station_info.get("eta", ""),
        "etd": current_station_info.get("etd", ""),
        "delay": current_station_info.get("delay", 0),
        "ahead_distance": current_station_info.get("ahead_distance", 0),
        "ahead_distance_text": f"{current_station_info.get('ahead_distance', 0)} km ahead",
        "status_as_of": "As of 1 min ago",
        "platform_number": current_station_info.get("platform_number", 0),
        "cur_stn_sta": "21:22",
        "cur_stn_std": "21:22",
        "stoppage_number": 0,
        "a_day": 1,
        "status_as_of_min": 1,
        "dfp_carousel": {},
        "upcoming_stations": transform_stations(data.get("Schedule", []), current_station_info["current_station_code"], upcoming=True),
        "previous_stations": transform_stations(data.get("Schedule", []), current_station_info["current_station_code"], upcoming=False),
        "current_location_info": get_current_location_info(current_station_info, data.get("Schedule", []))
    }
    return transformed_data

def calculate_journey_time(duration_str):
    if not duration_str:
        return 0
    parts = duration_str.split(':')
    return int(parts[0]) * 60 + int(parts[1])

def calculate_total_distance(schedule):
    if not schedule:
        return 0
    return int(float(schedule[-1].get("Distance", "0") or "0"))

def extract_schedule_time(schedule, time_type):
    if not schedule:
        return ""
    for stop in schedule:
        if stop.get(time_type):
            return stop.get(time_type)
    return ""

def extract_current_station_info(schedule, provided_date):
    if not schedule:
        return {"current_station_code": "", "current_station_name": "", "eta": "", "etd": "", "platform_number": 0}
    
    now = provided_date
    current_station = None

    for stop in schedule:
        arrival_time_str = stop.get("ArrivalTime", "")
        if arrival_time_str:
            arrival_time = datetime.strptime(arrival_time_str, "%H:%M")
            if arrival_time.time() <= now.time():
                current_station = stop
            else:
                break
    
    if current_station is None:
        current_station = schedule[-1]

    delay = calculate_delay(current_station, now)
    ahead_distance = calculate_ahead_distance(schedule, current_station)

    return {
        "current_station_code": current_station.get("StationCode", ""),
        "current_station_name": current_station.get("StationName", ""),
        "eta": current_station.get("ArrivalTime", ""),
        "etd": current_station.get("DepartureTime", ""),
        "platform_number": current_station.get("ExpectedPlatformNo", 0),
        "delay": delay,
        "ahead_distance": ahead_distance
    }

def calculate_delay(current_station, current_time):
    arrival_time_str = current_station.get("ArrivalTime", "")
    if not arrival_time_str:
        return 0
    arrival_time = datetime.strptime(arrival_time_str, "%H:%M")
    delay = (current_time - arrival_time.replace(year=current_time.year, month=current_time.month, day=current_time.day)).total_seconds() // 60
    return int(delay)

def calculate_ahead_distance(schedule, current_station):
    current_distance = int(float(current_station.get("Distance", "0") or "0"))
    total_distance = int(float(schedule[-1].get("Distance", "0") or "0"))
    return total_distance - current_distance

def transform_stations(schedule, current_station_code, upcoming=True):
    stations = []
    current_distance = 0
    
    # Find the current station distance
    for stop in schedule:
        if stop.get("StationCode") == current_station_code:
            current_distance = int(float(stop.get("Distance", "0") or "0"))
            break
    
    si_no = 0
    for stop in schedule:
        si_no += 1
        station_distance = int(float(stop.get("Distance", "0") or "0"))
        distance_from_current_station = abs(station_distance - current_distance)
        station = {
            "si_no": si_no,
            "station_code": stop.get("StationCode", ""),
            "station_name": stop.get("StationName", ""),
            "distance_from_source": station_distance,
            "distance_from_current_station": distance_from_current_station,
            "distance_from_current_station_txt": f"{distance_from_current_station} km",
            "sta": stop.get("ArrivalTime", ""),
            "eta": stop.get("ArrivalTime", ""),
            "etd": stop.get("DepartureTime", ""),
            "halt": int(stop.get("HaltMinutes", "0m").replace("m", "") or 0),
            "a_day": stop.get("Day", 1),
            "arrival_delay": 0,  # Calculate if needed
            "platform_number": stop.get("ExpectedPlatformNo", 0),
            "on_time_rating": 0,  # Calculate if needed
            "station_lat": stop.get("Latitude", 0),
            "station_lng": stop.get("Longitude", 0),
            "non_stops": ["Array"]
        }
        if upcoming:
            stations.append(station)
        else:
            stations.insert(0, station)
    return stations

def get_current_location_info(current_station_info, schedule):
    current_station_name = current_station_info.get("current_station_name", "")
    eta = current_station_info.get("eta", "")
    delay = current_station_info.get("delay", 0)
    ahead_distance = current_station_info.get("ahead_distance", 0)
    total_distance = calculate_total_distance(schedule)
    distance_from_source = total_distance - ahead_distance

    # Ensure delay is non-negative
    delay_hours, delay_minutes = divmod(max(delay, 0), 60)

    current_location_info = [
        {
            "type": 1,
            "deeplink": "",
            "img_url": "",
            "label": "As of 1 min ago",
            "message": f"Crossed {current_station_name}. at {eta}",
            "readable_message": f"Crossed {current_station_name.lower()} at {eta}",
            "hint": f"Delay {delay_hours}h:{delay_minutes}m"
        },
        {
            "type": 2,
            "deeplink": "",
            "img_url": "",
            "label": "As of 1 min ago",
            "message": f"{ahead_distance} kms to {current_station_name}",
            "readable_message": f"{ahead_distance} kilometer to {current_station_name.lower()}",
            "hint": f"{distance_from_source} kms Covered so far"
        }
    ]

    # Check for multiple types or conditions
    for stop in schedule:
        if stop.get("StationCode") == current_station_info.get("current_station_code"):
            stop_type = stop.get("StopType", "")
            if stop_type:
                current_location_info.append({
                    "type": 3,
                    "deeplink": "",
                    "img_url": "",
                    "label": f"Special stop - {stop_type}",
                    "message": f"Special stop at {stop.get('StationName')} ({stop.get('StationCode')})",
                    "readable_message": f"Special stop at {stop.get('StationName').lower()} ({stop.get('StationCode').lower()})",
                    "hint": f"Distance from source: {stop.get('Distance', '0')} km"
                })

    return current_location_info

train_number = "12953"
date = "2024-06-14"
html_data = get_data(train_number, date)
train_data = extract_train_data(html_data)

if train_data:
    transformed_data = transform_data(train_data, date)
    print(json.dumps(transformed_data, indent=2))
else:
    print("Failed to extract train data.")


{
  "success": true,
  "user_id": 0,
  "train_number": "12953",
  "train_name": "AK TEJAS RAJ EX",
  "gps_unable": true,
  "train_start_date": "2024-06-14",
  "notification_date": "2024-06-15",
  "at_src_dstn": false,
  "at_src": false,
  "at_dstn": false,
  "is_run_day": true,
  "source": "MMCT",
  "destination": "NZM",
  "run_days": "SUN,MON,TUE,WED,THU,FRI,SAT",
  "journey_time": 993,
  "std": "17:10",
  "data_from": "usergps",
  "new_alert_id": 0,
  "new_alert_msg": "",
  "diverted_stations": null,
  "instance_alert": 0,
  "related_alert": 0,
  "late_update": false,
  "is_ry_eta": true,
  "update_time": "2024-06-14 00:00:00 +0530",
  "distance_from_source": 0,
  "total_distance": 1378,
  "avg_speed": 0,
  "si_no": 213,
  "current_station_code": "NZM",
  "current_station_name": "Hazrat Nizamuddin",
  "status": "T",
  "eta": "09:43",
  "etd": "",
  "delay": -583,
  "ahead_distance": 0,
  "ahead_distance_text": "0 km ahead",
  "status_as_of": "As of 1 min ago",
  "platform_number": "7