##### Part 1: Generate dataset without weather features

In [None]:
import numpy as np
import pandas as pd
import os
import datetime


# Directory holding the converted CSV exports.
csv_dir = "../data/converted/"
files = [os.path.join(csv_dir, f) for f in os.listdir(csv_dir) if f.endswith('.csv')]
# The 62 paths that sort last lexicographically.
# NOTE(review): presumably file names sort chronologically, making these the most recent 62 — confirm.
last62files = sorted(files)[-62:]

# Exactly one dtype per loaded column. The previous dict listed 'userstopcode'
# twice (np.int32, then 'string'); in a dict literal the last entry wins, so
# 'string' was the effective dtype and is the one kept here. Entries for
# columns that were never loaded (rd_x, rd_y, vehiclenumber) are dropped.
cxx_dtypes = {
    'messagetype': 'category',
    'operatingday': 'string',
    'journeynumber': np.uint16,
    'userstopcode': 'string',
    # NOTE(review): float16 cannot represent every integer beyond +/-2048;
    # confirm the punctuality range before relying on exact large values.
    'punctuality': np.float16,
    'lineplanningnumber': 'string',
}

# Read only the needed columns from each file and stack them into one frame.
cxx_df = pd.concat(
    [pd.read_csv(f, usecols=list(cxx_dtypes), dtype=cxx_dtypes) for f in last62files],
    ignore_index=True,
)

# Keep only DEPARTURE messages that belong to line L011.
is_line_011 = cxx_df['lineplanningnumber'] == 'L011'
is_departure = cxx_df['messagetype'] == 'DEPARTURE'
filtered_df = cxx_df[is_line_011 & is_departure]

# A journey is identified by the (journeynumber, operatingday) pair.
journey_keys = filtered_df[['journeynumber', 'operatingday']].drop_duplicates()
print(len(journey_keys), 'unique journeys in dataset.')

# Distinct stop codes, in order of first appearance in the filtered data.
unique_stops_array = filtered_df['userstopcode'].unique()
amount_of_stops = len(unique_stops_array)
print('Unique stops on line 011:', amount_of_stops)

# Split the stop list around its midpoint; the midpoint stop is part of both halves.
# NOTE(review): presumably the two halves correspond to the two travel directions — confirm.
midpoint = amount_of_stops // 2
to_station = unique_stops_array[:midpoint + 1]
from_station = unique_stops_array[midpoint:]

# Map each stop code to its position inside its half.
to_station_indexes_map = dict(zip(to_station, range(len(to_station))))
from_station_indexes_map = dict(zip(from_station, range(len(from_station))))
print('To station indexes map:', to_station_indexes_map)
print('From station indexes map:', from_station_indexes_map)

# Add day_type column:
def get_day_type(date_str: str) -> int:
    """Return the weekday number of a 'YYYY-MM-DD' date string.

    Args:
        date_str: Date formatted as '%Y-%m-%d', e.g. '2025-06-08'.

    Returns:
        The weekday as an integer, Monday = 0 through Sunday = 6.

    Raises:
        ValueError: If ``date_str`` does not match the '%Y-%m-%d' format.
    """
    # Import locally under a private alias: another cell of this notebook runs
    # `from datetime import datetime`, which rebinds the global name `datetime`
    # from the module to the class and would break `datetime.datetime.strptime`.
    from datetime import datetime as _datetime

    return _datetime.strptime(date_str, '%Y-%m-%d').weekday()

# Add the weekday feature. filtered_df comes from boolean indexing on cxx_df,
# so writing a column into it in place can trigger SettingWithCopyWarning;
# assign() returns a new frame with the extra column instead.
filtered_df = filtered_df.assign(day_type=filtered_df['operatingday'].apply(get_day_type))

# Per-journey results, concatenated after the loop.
all_combinations = []
all_messages = []

# Use both operatingday and journeynumber as unique journey identifier.
# Retained as a module-level name; the loop below groups on the same key pair.
unique_journeys = filtered_df[['operatingday', 'journeynumber']].drop_duplicates()

# Loop-invariant lookups, built once instead of once per journey.
stops_011_order = {stop: idx for idx, stop in enumerate(unique_stops_array)}
first_from_station_stop = next(iter(from_station_indexes_map), None)

# A single groupby pass replaces re-filtering the whole frame for every journey.
# sort=False visits journeys in order of first appearance, the same order
# drop_duplicates() yields, so the exported row order is unchanged.
journey_groups = filtered_df.groupby(['operatingday', 'journeynumber'], sort=False)
for (operatingday, journeynumber), df_journey in journey_groups:
    # Order the messages by the position of their stop in unique_stops_array.
    df_journey = df_journey.sort_values(by='userstopcode', key=lambda x: x.map(stops_011_order))
    # Positional index, so list positions below equal the frame's row labels.
    df_journey = df_journey.reset_index(drop=True)

    # Plain Python lists avoid building a Series per row as iterrows() does.
    stops = df_journey['userstopcode'].tolist()
    delays = df_journey['punctuality'].tolist()
    day_types = df_journey['day_type'].tolist()

    # Direction flag: 0 when the journey's first (sorted) stop is the first
    # key of from_station_indexes_map, otherwise 1.
    direction = 0 if stops[0] == first_from_station_stop else 1

    # Per-journey stop index: position of the first occurrence of each stop.
    stop_index_map = {}
    for pos, stop in enumerate(stops):
        stop_index_map.setdefault(stop, pos)

    # Emit every forward pair (current message i, later message j > i),
    # skipping pairs where both messages refer to the same stop.
    rows = []
    for i, current_stop in enumerate(stops):
        for j in range(i + 1, len(stops)):
            target_stop = stops[j]
            if current_stop == target_stop:
                continue
            rows.append({
                'to_station': direction,
                'operatingday': operatingday,
                'day_type': day_types[i],
                'journeynumber': journeynumber,
                'current_stop': current_stop,
                'current_stop_index': stop_index_map[current_stop],
                'current_delay': delays[i],
                'target_stop_index': stop_index_map[target_stop],
                'target_stop': target_stop,
                'target_delay': delays[j]
            })

    combinations_df = pd.DataFrame(rows)
    all_combinations.append(combinations_df)
    all_messages.append(df_journey)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('./input', exist_ok=True)

# Write all stop-pair combinations, if any journey produced a frame.
if all_combinations:
    final_combinations_df = pd.concat(all_combinations, ignore_index=True)
    final_combinations_df.to_csv('./input/line011_combinations_exp.csv', index=False)

# Write the sorted per-journey messages the combinations were built from.
if all_messages:
    final_messages_df = pd.concat(all_messages, ignore_index=True)
    final_messages_df.to_csv('./input/line011_messages_exp.csv', index=False)

In [None]:
# Re-read the exported combinations CSV and report how many rows it contains.
import pandas as pd
final_combinations_df = pd.read_csv('./input/line011_combinations_exp.csv')
exported_row_count = len(final_combinations_df)
print('Total combinations in exported CSV:', exported_row_count)

##### Part 2.1: Parse/Prepare weather data to csv

In [None]:
import pandas as pd
from datetime import datetime

# Read the raw weather file; skipinitialspace drops blanks that follow each delimiter.
df = pd.read_csv('../../ndov-dataset/data/etmgeg_370.txt', skipinitialspace=True)

# Source column -> descriptive label, in the order the columns are kept.
# NOTE(review): the 'FR' prefix on the FG column and the 'Â°' sequence look
# like a typo / encoding artifact; left untouched because these labels become
# the header of weather.csv — confirm before renaming.
column_labels = {
    'YYYYMMDD': 'date',
    'FG': 'FR/windspeed (0.1 m/s)',
    'TG': 'TG/temperature (0.1 Â°C)',
    'RH': 'RH/precipitation (0.1mm)',
}
weather_df = df[list(column_labels)].rename(columns=column_labels)

# Parse YYYYMMDD values into datetimes; unparseable entries become NaT.
weather_df = weather_df.assign(
    date=pd.to_datetime(weather_df['date'], format='%Y%m%d', errors='coerce')
)

# Keep 2025-06-08 through 2025-08-12, both ends included.
start_date = datetime(2025, 6, 8)
end_date = datetime(2025, 8, 12)
in_period = weather_df['date'].between(start_date, end_date)
filtered_weather_df = weather_df[in_period]

print(filtered_weather_df.head())
filtered_weather_df.to_csv('./input/weather.csv', index=False)

In [None]:
# Summary statistics of the precipitation column over the filtered period.
rh_values = filtered_weather_df['RH/precipitation (0.1mm)']
print('RH (precipitation) statistics for selected period:')
rh_summary = (
    ('Min:', rh_values.min()),
    ('Max:', rh_values.max()),
    ('Mean:', rh_values.mean()),
    ('Std (spread):', rh_values.std()),
)
for label, value in rh_summary:
    print(label, value)

##### Part 2.2: Generate set with weather features added

In [None]:
import pandas as pd

# Inputs: the exported stop-pair combinations and the prepared weather table.
combinations_df = pd.read_csv('./input/line011_combinations_exp.csv')
weather_df = pd.read_csv('./input/weather.csv')

# Compare the join keys as text on both sides.
combinations_df = combinations_df.astype({'operatingday': str})
weather_df = weather_df.astype({'date': str})

# Left join keeps every combination row; the weather columns are attached
# where operatingday equals date, then the duplicate key column is dropped.
merged_df = (
    combinations_df
    .merge(weather_df, how='left', left_on='operatingday', right_on='date')
    .drop(columns=['date'])
)

# Persist the enriched dataset.
merged_df.to_csv('./input/line011_combinations_with_weather.csv', index=False)

# Show the last rows as the cell output.
merged_df.tail(20)