In [2]:
import requests
import json
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType, StructField, StringType, DoubleType, IntegerType,
    TimestampType, ArrayType
)
from pyspark.sql.functions import col, to_timestamp, lit, coalesce
import psycopg2
from pprint import pprint
from pyspark.sql import Row
from concurrent.futures import ThreadPoolExecutor


In [3]:
# fetch-train-status endpoint of the easy-rail service.
BASE_URL = "https://easy-rail.onrender.com/fetch-train-status"

# Current local date rendered as dd-mm-yyyy (the format track_train validates).
date_str = f"{datetime.now():%d-%m-%Y}"
# To look at the previous day instead, uncomment:
# yesterday = (datetime.now() - timedelta(days=1)).strftime("%d-%m-%Y")

print(date_str)

14-08-2025


In [1]:
import re

def track_train(train_number: str, date_str: str, timeout: float = 30.0):
    """Fetch the running status of a train from the easy-rail service.

    Validation, HTTP/network and JSON-decoding failures are reported through
    the returned dict instead of being raised.

    Args:
        train_number: Train number as a 5-character string (e.g. "12608").
        date_str: Journey date in dd-mm-yyyy format.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot block the caller indefinitely.

    Returns:
        dict: ``{"success": True, "data": <decoded JSON payload>}`` when the
        service answered with a non-empty payload, otherwise
        ``{"success": False, "error": <message>}``.
    """

    def failure(message):
        return {"success": False, "error": message}

    # Validate train number
    if not isinstance(train_number, str) or len(train_number) != 5:
        return failure("Invalid train number. It must be a 5-character string.")

    # Validate date format: dd-mm-yyyy (non-string input is rejected here
    # rather than raising TypeError inside the regex engine).
    if not isinstance(date_str, str) or not re.fullmatch(r"\d{2}-\d{2}-\d{4}", date_str):
        return failure("Invalid date format. Please use dd-mm-yyyy format.")

    # Parse and validate date; the round trip rejects anything strptime
    # accepted but would render differently from the input.
    try:
        parsed_date = datetime.strptime(date_str, "%d-%m-%Y")
        if parsed_date.strftime("%d-%m-%Y") != date_str:
            raise ValueError("Invalid date values")
    except ValueError:
        return failure("Invalid date. Please check the day, month, and year values.")

    # Make POST request
    url = "https://easy-rail.onrender.com/fetch-train-status"
    headers = {
        "Content-Type": "application/json",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    payload = {
        "trainNumber": train_number,
        "dates": date_str
    }

    try:
        response = requests.post(url, json=payload, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except requests.RequestException as e:
        return failure(str(e) or "Failed to track train")
    except ValueError:
        # Older requests versions raise a plain ValueError for a non-JSON body.
        return failure("Failed to fetch train status: response was not valid JSON")

    # A dict payload carrying an "error" entry is a failure reported by the
    # service; surface its message instead of passing it on as train data.
    if isinstance(data, dict) and data.get("error"):
        return failure(str(data["error"]))

    if not data:
        return failure("Failed to fetch train status")

    return {
        "success": True,
        "data": data
    }



def parse_train_status(train_number, api_data):
    """Summarise a track_train result as a single Row.

    Walks the station entries in order, remembering the most recent one whose
    status is "crossed" and stopping at the first one whose status is
    "upcoming"; the delay is the first run of digits found in that upcoming
    entry's "delay" value.

    Args:
        train_number: Train number stored on the resulting Row.
        api_data: Dict with a truthy "success" entry and a "data" entry
            holding a sequence of station dicts (keys read here: "status",
            "station", "delay").

    Returns:
        Row(train_number, last_crossed, next_upcoming, delay_minutes), or
        None when the train should be skipped: the result is missing or not
        successful, or its "data" entry is not a list of stations.
    """
    if not isinstance(api_data, dict) or not api_data.get("success"):
        print(f"[WARN] Skipping train {train_number}, API did not return success")
        return None  # Return None to skip

    stations = api_data.get("data", [])
    if not isinstance(stations, (list, tuple)):
        # e.g. an error dict or null payload: skip this train instead of
        # crashing the caller while iterating it.
        print(f"[WARN] Skipping train {train_number}, unexpected payload type {type(stations).__name__}")
        return None

    last_crossed = None
    next_upcoming = None
    delay_minutes = 0

    for s in stations:
        if not isinstance(s, dict):
            continue  # ignore malformed entries rather than failing on .get
        status = s.get("status")
        if status == "crossed":
            last_crossed = s.get("station")
        elif status == "upcoming":
            next_upcoming = s.get("station")
            # Delay values look like " 15 min"; anything without digits counts as 0.
            match = re.search(r"\d+", str(s.get("delay", "")))
            delay_minutes = int(match.group()) if match else 0
            break

    print(f"[OK] Processed train {train_number} | Last crossed: {last_crossed} | Next: {next_upcoming} | Delay: {delay_minutes}m")
    return Row(
        train_number=train_number,
        last_crossed=last_crossed,
        next_upcoming=next_upcoming,
        delay_minutes=delay_minutes
    )

def process_partition(partition):
    """Look up every train in a partition, fetching statuses on a thread pool.

    Each element of ``partition`` must expose a ``train_number`` attribute.
    All lookups use the current local date (dd-mm-yyyy), taken once per call.
    The pool size comes from the module-level ``MAX_WORKERS``
    (NOTE(review): not defined in the visible cells - confirm it is set
    before this function runs).

    Returns:
        An iterator over the truthy parse_train_status results, in input
        order; skipped trains (None) are dropped.
    """
    today = datetime.now().strftime("%d-%m-%Y")

    def fetch_and_parse(record):
        number = str(record.train_number)
        return parse_train_status(number, track_train(number, today))

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Drain the map inside the context so every lookup completes here.
        statuses = [status for status in pool.map(fetch_and_parse, partition)]

    # Drop the None placeholders left by skipped trains.
    return filter(None, statuses)

In [5]:
# Smoke test: look up train 12608 for date_str via track_train (a live HTTP
# request) and pretty-print the returned success/data dict.
data = track_train("12608", date_str)

pprint(data)

{'data': [{'arr': '',
           'current': 'false',
           'delay': ' 15 min',
           'dep': '06:40',
           'index': 0,
           'station': 'KSR Bengaluru',
           'status': 'crossed'},
          {'arr': '06:51',
           'current': 'false',
           'delay': ' 15 min',
           'dep': '06:52',
           'index': 1,
           'station': 'Bengaluru Cant',
           'status': 'crossed'},
          {'arr': '07:03',
           'current': 'false',
           'delay': ' 13 min',
           'dep': '07:04',
           'index': 2,
           'station': 'Krishnarajapurm',
           'status': 'crossed'},
          {'arr': '07:42',
           'current': 'false',
           'delay': ' 14 min',
           'dep': '07:43',
           'index': 3,
           'station': 'Bangarapet',
           'status': 'crossed'},
          {'arr': '08:09',
           'current': 'false',
           'delay': ' 18 min',
           'dep': '08:13',
           'index': 4,
           'station': 

In [3]:
# Local Spark session using all cores (local[*]); the PostgreSQL JDBC driver
# is requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("IndianRailwayTrainTracker")
    .master("local[*]")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.7.4")
    .getOrCreate()
)

In [6]:
# Sanity check of the Spark session: build a two-column DataFrame from the
# dict's (key, value) pairs and print it.
# NOTE(review): this rebinds `data`, which the track_train cell also assigns.
data = {"a": 10, "b": 20, "c": 30}

df = spark.createDataFrame(data.items(), ["key", "value"])
df.show()


+---+-----+
|key|value|
+---+-----+
|  a|   10|
|  b|   20|
|  c|   30|
+---+-----+

