In [None]:
!pip install psycopg2-binary

In [1]:
import requests
import json
from datetime import datetime, timedelta
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType, StructField, StringType, DoubleType, IntegerType,
    TimestampType, ArrayType
)
from pyspark.sql.functions import col, to_timestamp, lit, coalesce
import psycopg2
from pprint import pprint
import re
from concurrent.futures import ThreadPoolExecutor, as_completed


In [2]:
# Number of worker threads process_partition uses for concurrent API calls.
MAX_WORKERS = 10
# Request timeout setting (presumably seconds — TODO confirm).
# NOTE(review): not referenced in the visible cells; track_train passes timeout=30 directly.
API_TIMEOUT = 8

In [3]:
# Local Spark session. The extra JVM packages are the PostgreSQL JDBC driver
# and the Spark/Kafka connector, passed as one comma-separated coordinate list.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("TrainNumbersPushToKafka")
    .config(
        "spark.jars.packages",
        ",".join([
            "org.postgresql:postgresql:42.7.4",
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0",
        ]),
    )
    .getOrCreate()
)

In [4]:
# Endpoint of the train-status API.
BASE_URL = "https://easy-rail.onrender.com/fetch-train-status"

# Today's date as dd-mm-yyyy, plus the same date with underscores as separators.
date_str = f"{datetime.now():%d-%m-%Y}"
date_str_api = date_str
date_str_table = "_".join(date_str.split("-"))
print(date_str, date_str_table)

19-08-2025 19_08_2025


In [5]:
import os

# Connection settings used by the spark.read.jdbc calls and get_postgres_tables.
JDBC_URL = "jdbc:postgresql://postgres:5432/railway"
DB_PROPERTIES = {
    "user": "postgres",
    # The password comes from the environment so that no secret is stored in
    # the notebook. NOTE(review): the literal that used to be committed here
    # should be treated as leaked and rotated.
    "password": os.environ.get("POSTGRES_PASSWORD", ""),
    "driver": "org.postgresql.Driver",
}
if not DB_PROPERTIES["password"]:
    print("[WARN] POSTGRES_PASSWORD is not set; database connections may be rejected")

In [None]:
def get_postgres_tables(jdbc_url, db_properties):
    """List the base tables in the ``public`` schema of a PostgreSQL database.

    Args:
        jdbc_url: JDBC URL of the form ``jdbc:postgresql://host[:port]/dbname``.
            Host, port and database name are taken from it; any part that is
            missing (or a URL that cannot be parsed at all) falls back to
            ``postgres`` / ``5432`` / ``railway``.
        db_properties: Mapping that provides at least ``"user"`` and
            ``"password"``.

    Returns:
        list[str]: Table names in the order the server returned them.

    Raises:
        Whatever psycopg2 raises on connection or query failure. The cursor
        and connection are closed before the exception propagates.
    """
    import re
    import psycopg2

    parsed = re.match(
        r"^jdbc:postgresql://([^:/?#,\[\]]+)(?::(\d+))?(?:/([^?#]*))?(?:[?#].*)?$",
        str(jdbc_url or ""),
    )
    host, port, dbname = parsed.groups() if parsed else (None, None, None)

    conn = psycopg2.connect(
        host=host or "postgres",
        port=int(port) if port else 5432,
        dbname=dbname or "railway",
        user=db_properties["user"],
        password=db_properties["password"]
    )
    # psycopg2's own `with conn:` only scopes a transaction and does not close
    # the connection, so close explicitly in `finally` blocks.
    try:
        cur = conn.cursor()
        try:
            cur.execute("""
        SELECT table_name 
        FROM information_schema.tables 
        WHERE table_schema='public' AND table_type='BASE TABLE';
    """)
            return [row[0] for row in cur.fetchall()]
        finally:
            cur.close()
    finally:
        conn.close()

In [None]:
# Look up the base tables available in the database and show them.
tables = get_postgres_tables(
    jdbc_url=JDBC_URL,
    db_properties=DB_PROPERTIES,
)
print("Found tables:", tables)

In [None]:
def track_train(train_number: str, date_str: str):
    """Fetch the live running status of one train from the easy-rail API.

    Args:
        train_number: Train number; must be a 5-character string.
        date_str: Date in ``dd-mm-yyyy`` format; must be a real calendar date.

    Returns:
        dict: ``{"success": True, "data": <decoded JSON>}`` when the API
        returns a non-empty payload without an ``"error"`` entry, otherwise
        ``{"success": False, "error": <message>}``. Failures cover invalid
        arguments, HTTP/network errors, a body that is not JSON, an empty
        payload, and an error object returned by the API. Never raises for
        these cases.
    """
    if not train_number or not isinstance(train_number, str) or len(train_number) != 5:
        return {"success": False, "error": "Invalid train number. It must be a 5-character string."}

    if not isinstance(date_str, str) or not re.fullmatch(r"\d{2}-\d{2}-\d{4}", date_str):
        return {"success": False, "error": "Invalid date format. Please use dd-mm-yyyy format."}

    try:
        parsed_date = datetime.strptime(date_str, "%d-%m-%Y")
        # Round-trip check: reject anything strptime accepted leniently.
        if parsed_date.strftime("%d-%m-%Y") != date_str:
            raise ValueError
    except ValueError:
        return {"success": False, "error": "Invalid date. Please check the day, month, and year values."}

    url = "https://easy-rail.onrender.com/fetch-train-status"
    headers = {
        "Content-Type": "application/json",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    payload = {"trainNumber": train_number, "dates": date_str}

    try:
        response = requests.post(url, json=payload, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException.
        return {"success": False, "error": str(e)}

    # An error object from the API is a failure, not station data.
    if isinstance(data, dict) and data.get("error"):
        return {"success": False, "error": data["error"]}
    # Empty payloads ([], {}, None) have no .get(), so use a fixed message.
    if not data:
        return {"success": False, "error": "Failed to fetch train status"}
    return {"success": True, "data": data}

def parse_train_status(train_number, api_data):
    """Summarise a track_train() result as a single Row.

    Walks the station entries in ``api_data["data"]`` in order. The last entry
    with status ``"crossed"`` seen before the first ``"upcoming"`` entry
    becomes ``last_crossed``; the first ``"upcoming"`` entry becomes
    ``next_upcoming`` and the first run of digits in its ``"delay"`` value
    becomes ``delay_minutes`` (0 when there are none).

    Args:
        train_number: Train number to put on the Row and in log lines.
        api_data: Dict as returned by track_train().

    Returns:
        Row(train_number, last_crossed, next_upcoming, delay_minutes), or
        None when the API call did not succeed or the payload is not a list
        of stations. Entries that are not dicts are ignored.
    """
    if not api_data or not api_data.get("success"):
        print(f"[WARN] Skipping train {train_number}, API did not return success")
        return None

    # `or []` also covers an explicit "data": None.
    stations = api_data.get("data") or []
    if not isinstance(stations, (list, tuple)):
        # e.g. an error object; iterating it would yield keys, not stations.
        print(f"[WARN] Skipping train {train_number}, unexpected station payload")
        return None

    last_crossed = None
    next_upcoming = None
    delay_minutes = 0

    for s in stations:
        if not isinstance(s, dict):
            continue
        status = s.get("status")
        if status == "crossed":
            last_crossed = s.get("station")
        elif status == "upcoming":
            next_upcoming = s.get("station")
            match = re.search(r"\d+", str(s.get("delay", "")))
            delay_minutes = int(match.group()) if match else 0
            # Only the first upcoming station matters.
            break

    print(f"[OK] Processed train {train_number} | Last crossed: {last_crossed} | Next: {next_upcoming} | Delay: {delay_minutes}m")
    return Row(train_number=train_number, last_crossed=last_crossed, next_upcoming=next_upcoming, delay_minutes=delay_minutes)

def process_partition(partition):
    """Run process_train for every row of ``partition`` on a thread pool.

    All rows are submitted with today's date (dd-mm-yyyy) to a pool of
    MAX_WORKERS threads. Truthy results are collected in completion order, so
    the output order is not tied to the input order. An exception raised by
    any task propagates to the caller.

    Args:
        partition: Iterable of rows accepted by process_train.

    Returns:
        list: The truthy results of process_train.
    """
    today = datetime.now().strftime("%d-%m-%Y")

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = {}
        for record in partition:
            pending[pool.submit(process_train, record, today)] = record
        outcomes = (done.result() for done in as_completed(pending))
        collected = [outcome for outcome in outcomes if outcome]

    return collected

def process_train(row, date_str):
    """Fetch and summarise the live status for one row.

    Every non-digit character is stripped from ``str(row.train_number)``
    before the number is passed to track_train; the response is then passed to
    parse_train_status, whose result is returned unchanged.
    """
    digits_only = re.sub(r"\D+", "", str(row.train_number))
    return parse_train_status(digits_only, track_train(digits_only, date_str))

In [6]:
# Load the "trains" table over JDBC.
trains_df = spark.read.jdbc(
    url=JDBC_URL,
    table="trains",
    properties=DB_PROPERTIES,
)

In [None]:
# Load the "stations" table over JDBC.
stations_df = spark.read.jdbc(
    url=JDBC_URL,
    table="stations",
    properties=DB_PROPERTIES,
)

In [None]:
# Load the "schedules" table over JDBC.
schedules_df = spark.read.jdbc(
    url=JDBC_URL,
    table="schedules",
    properties=DB_PROPERTIES,
)

In [None]:
# Load the "live_status" table over JDBC.
live_status_trains_df = spark.read.jdbc(
    url=JDBC_URL,
    table="live_status",
    properties=DB_PROPERTIES,
)

In [None]:
# train_status_df = trains_df.rdd.mapPartitions(process_partition).toDF()

In [None]:
# Display the version of the running Spark session.
spark.version

In [7]:
# Reduce the frame to one string column named "value" holding the train number.
# NOTE: this rebinds trains_df, so re-running the cell requires reloading the
# table first (the "train_number" column no longer exists afterwards).
trains_df = trains_df.select(col("train_number").cast("string").alias("value"))

In [8]:
# Batch-write trains_df to the "train-numbers" Kafka topic.
(
    trains_df.write
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("topic", "train-numbers")
    .save()
)