## NDOV Data importing
#### This notebook imports the raw KV6 log files, filters and cleans them, and saves the result as CSV files.

In [None]:
from pathlib import Path
import pandas as pd
import time

def print_bad_line(bad_line):
    """Report a malformed input line and signal that it should be skipped.

    Shaped like a pandas ``on_bad_lines`` callback: it receives the offending
    line, prints it for inspection, and yields ``None`` so the line is
    dropped instead of being repaired.
    """
    print("Bad line encountered:", bad_line)
    # No explicit return: falling off the end yields None, i.e. skip the line.

# Column names for the semicolon-separated KV6 log files, in file order.
# They are passed to read_csv explicitly, so the first line of every file is
# treated as data rather than as a header.
names = [
    "receive", "message", "vehicle", "messagetype", "operatingday",
    "dataownercode", "lineplanningnumber", "journeynumber",
    "reinforcementnumber", "userstopcode", "passagesequencenumber",
    "distancesincelastuserstop", "punctuality", "rd_x", "rd_y", "blockcode",
    "vehiclenumber", "wheelchairaccessible", "source", "numberofcoaches",
]
dirp = Path('/run/media/borek/KINGSTON/ndov/kv6')  # directory path
files = [p for p in dirp.iterdir() if p.is_file() and p.name.endswith('.log')]  # only .log files
files_sorted = sorted(files, key=lambda p: p.name)  # Sort files by name

# Load stops to keep (ZOB region).
# The codes are read as text: with dtype inference a single blank cell turns
# the column into floats ('12345' -> '12345.0') and leading zeros are lost,
# after which no message would match. Missing codes are dropped so they can
# never match a message that itself has no stop code.
stops_df = pd.read_csv('./data/stops.csv', dtype={'UserStopCode': 'string'})
stops_to_keep = stops_df['UserStopCode'].dropna().astype(str).tolist()

# Message types to keep
message_types = ['DEPARTURE', 'ARRIVAL']

# Output directory for cleaned files
output_dir = Path('./data/converted')
output_dir.mkdir(parents=True, exist_ok=True)

# Columns removed from the cleaned output
dropColumns = ['dataownercode', 'reinforcementnumber', 'passagesequencenumber', 'distancesincelastuserstop', 'blockcode', 'wheelchairaccessible', 'source', 'numberofcoaches']

# Detect already processed files so an interrupted run can be resumed
processed_files = {f.name for f in output_dir.glob('*_cleaned.csv')}

total_files = len(files_sorted)

print(' Starting file processing loop')
for idx, file in enumerate(files_sorted, 1):
    out_filename = f'{file.stem}_cleaned.csv'
    if out_filename in processed_files:
        continue
    start_time = time.time()
    print(f'Processing {file.name} ({idx}/{total_files})')
    # userstopcode is read as text so it can be compared with stops_to_keep;
    # undecodable bytes are replaced and malformed rows are skipped with a warning.
    df = pd.read_csv(file, sep=';', names=names, dtype={'userstopcode': 'string'}, encoding_errors='replace', on_bad_lines='warn')
    # Filter step by step so the string conversion below only runs on the
    # rows that survived the cheaper filters.
    filtered = df[df['dataownercode'] == 'CXX']
    filtered = filtered[filtered['messagetype'].isin(message_types)]
    filtered = filtered[filtered['userstopcode'].astype(str).isin(stops_to_keep)]
    filtered = filtered.drop(columns=dropColumns)
    filtered = filtered.reset_index(drop=True)
    # Save to CSV. Write to a temporary name first and rename afterwards: a
    # run interrupted mid-write must not leave a partial '*_cleaned.csv'
    # behind, because the resume check above would then skip the file forever.
    out_path = output_dir / out_filename
    tmp_path = out_path.with_name(out_filename + '.tmp')
    filtered.to_csv(tmp_path, index=False)
    tmp_path.replace(out_path)
    elapsed = time.time() - start_time
    percent = (idx / total_files) * 100
    print(f'Saved {out_path} | {percent:.1f}% complete | {elapsed:.2f} seconds')

print('Finished file processing loop')
# Optionally, preview the first processed file
if files_sorted:
    preview_df = pd.read_csv(output_dir / f'{files_sorted[0].stem}_cleaned.csv')
    # A bare expression inside an `if` block is never displayed, so print it.
    print(preview_df.head(5))


#### Get the userstopcodes for the ZOB region    
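Run this cell only if the stops have not yet been extracted from the original NeTEx XML file to a CSV file. The import cell above reads `./data/stops.csv`, so this cell must have been run at least once before it.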

In [4]:
import csv
import xml.etree.ElementTree as ET

def extract_stop_assignments(xml_file, csv_file):
    """Export every PassengerStopAssignment in a NeTEx XML file to a CSV file.

    The CSV gets a ``StopName,UserStopCode`` header followed by one row per
    ``PassengerStopAssignment`` element, in document order.

    Args:
        xml_file: Path (or file object) of the XML document to parse.
        csv_file: Path of the CSV file to create or overwrite.

    The stop name is the text of the assignment's ``Name`` child (empty when
    absent). The user stop code is taken from the ``ref`` attribute of its
    ``QuayRef`` child: when the part after the last ``:`` consists only of
    digits that part is used, otherwise the whole reference is kept. A
    missing ``QuayRef`` or a missing ``ref`` attribute yields an empty code.
    """
    root = ET.parse(xml_file).getroot()

    # ElementTree spells a namespaced tag as '{uri}local'. Only build a
    # namespace mapping when the root really has one; otherwise the bare tag
    # name would be mistaken for a namespace URI and nothing would match.
    if root.tag.startswith('{'):
        ns = {'ns': root.tag[1:].partition('}')[0]}
        prefix = 'ns:'
    else:
        ns = {}
        prefix = ''

    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['StopName', 'UserStopCode'])

        for psa in root.findall(f'.//{prefix}PassengerStopAssignment', ns):
            name = psa.findtext(f'{prefix}Name', namespaces=ns)
            quay_ref_elem = psa.find(f'{prefix}QuayRef', ns)
            # A QuayRef without a 'ref' attribute gives None; normalise it to
            # '' so the string handling below cannot raise a TypeError.
            quay_ref = quay_ref_elem.get('ref') or '' if quay_ref_elem is not None else ''
            # Part after the last colon (the whole string when there is none).
            tail = quay_ref.rpartition(':')[2]
            quay_ref_filtered = tail if tail.isdigit() else quay_ref
            writer.writerow([name, quay_ref_filtered])

# Usage: convert the NeTEx baseline export into the stops CSV that the
# filtering cells read.
extract_stop_assignments(
    xml_file='../data/NeTEx_CXX_SRE_20250917_2025-09-21_202500022_baseline.xml',
    csv_file='./data/stops.csv',
)

#### Filter the data

In [None]:
# NOTE(review): this cell expects a DataFrame named `main_df` holding the raw
# messages. It is not defined in the visible part of this notebook, so it has
# to be created before this cell is run.

# Message types to keep
message_types = ['DEPARTURE', 'ARRIVAL']

# Stops to keep (ZOB region).
# The codes are read as text: with dtype inference a single blank cell turns
# the column into floats ('12345' -> '12345.0') and leading zeros are lost,
# after which no message would match. Missing codes are dropped so they can
# never match a message that itself has no stop code.
stops_df = pd.read_csv('./data/stops.csv', dtype={'UserStopCode': 'string'})
stops_to_keep = stops_df['UserStopCode'].dropna().astype(str).tolist()

# Filter for dataownercode == 'CXX', messagetype in message_types and userstopcode in stops_to_keep
filtered = main_df[main_df['dataownercode'] == 'CXX']
filtered = filtered[filtered['messagetype'].isin(message_types)]
filtered = filtered[filtered['userstopcode'].astype(str).isin(stops_to_keep)]

# Print version before dropping columns
filtered.info()

# Drop columns not needed
# Some columns are always null like distancesincelastuserstop, blockcode, wheelchairaccessible and numberofcoaches
# Passagesequencenumber is always 0.0, so we can drop it as well
# Others are not relevant for our analysis
dropColumns = ['dataownercode', 'reinforcementnumber', 'passagesequencenumber', 'distancesincelastuserstop', 'blockcode', 'wheelchairaccessible', 'source', 'numberofcoaches']
filtered = filtered.drop(columns=dropColumns)

# Reset index after filtering
filtered = filtered.reset_index(drop=True)

# Diagnostic: show which Python types occur in the userstopcode column
print(filtered['userstopcode'].apply(type).value_counts())


# Save to CSV and show a preview
filtered.to_csv('./data/cxx_messages.csv', index=False)
filtered.head(5)
