In [None]:
# Import Required Libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split

In [None]:
# Load and preprocess data ("exp" file: trip features plus day_type, no weather columns).
# NOTE: the following data-loading cell reassigns combinations_df, X and y, so on a
# top-to-bottom run the model is trained on that later dataset rather than this one.
exp_dtypes = {
    'to_station': 'bool',
    'current_stop_index': np.int8,
    'current_delay': np.float64,
    'target_stop_index': np.int8,
    # float32 instead of float16: half precision tops out at 65504 and cannot
    # represent every integer above 2048, which would silently quantize the
    # regression target before the model ever sees it.
    'target_delay': np.float32,
    'day_type': np.int8,
}
combinations_df = pd.read_csv(
    '../input/line401_combinations_exp.csv',
    usecols=list(exp_dtypes),
    dtype=exp_dtypes,
)

# Every loaded column except the target is a model feature (order follows exp_dtypes).
X = combinations_df[[column for column in exp_dtypes if column != 'target_delay']]
y = combinations_df['target_delay']


In [None]:
# Load and preprocess data (with day_type and weather).
# This cell (re)assigns combinations_df, X and y, so the cells below use this
# weather-enriched dataset when the notebook is run top to bottom.
weather_dtypes = {
    'to_station': 'bool',
    'current_stop_index': np.int8,
    'current_delay': np.float64,
    'target_stop_index': np.int8,
    # float32 instead of float16 for the target and the weather readings: half
    # precision tops out at 65504 and cannot represent every integer above 2048,
    # and float16 summary statistics overflow easily on large frames.
    'target_delay': np.float32,
    'day_type': np.int8,
    'FR/windspeed (0.1 m/s)': np.float32,
    # NOTE(review): the degree sign in this column name looks mis-encoded; it is
    # kept exactly as-is because it must match the CSV header - confirm against the file.
    'TG/temperature (0.1 째C)': np.float32,
    'RH/precipitation (0.1mm)': np.float32,
}
combinations_df = pd.read_csv(
    '../input/line401_combinations_with_weather.csv',
    usecols=list(weather_dtypes),
    dtype=weather_dtypes,
)

# Every loaded column except the target is a model feature (order follows weather_dtypes).
X = combinations_df[[column for column in weather_dtypes if column != 'target_delay']]
y = combinations_df['target_delay']


In [None]:
# Hold out 20% of the rows for evaluation (fixed seed so the split is
# reproducible), then fit an ordinary least-squares model on the remainder.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

model = LinearRegression()
# Left as the final expression so the notebook displays the fitted estimator.
model.fit(X_train, y_train)

#### Evaluation

In [None]:
# Evaluate the fitted model on the held-out test split.
# The sklearn metric functions used here come from the notebook's import cell.
y_pred = model.predict(X_test).flatten()

# Run all metric arithmetic on a float64 copy of the targets so results do not
# inherit a narrow storage dtype (e.g. float16, where a max-min range above
# 65504 overflows to inf and would turn NRMSE into 0).
y_true = np.asarray(y_test, dtype=np.float64)
abs_error = np.abs(y_true - y_pred)

mse = mean_squared_error(y_true, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_true, y_pred)
# sklearn returns MAPE as a fraction, not a percentage.
mape = mean_absolute_percentage_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

# NRMSE: RMSE normalized by the range of the test targets. A constant target
# has zero range, so report NaN instead of dividing by zero.
target_range = y_true.max() - y_true.min()
nrmse = rmse / target_range if target_range > 0 else np.nan

# SMAPE (Symmetric Mean Absolute Percentage Error), in percent; the small
# epsilon keeps the ratio finite when both target and prediction are 0.
smape = 100 * np.mean(2 * abs_error / (np.abs(y_true) + np.abs(y_pred) + 1e-8))

# 90th Percentile Absolute Error (P90 AE)
p90_ae = np.percentile(abs_error, 90)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R^2 Score: {r2:.2f}")
# mape and smape are computed but intentionally not printed.
# NOTE(review): presumably because targets at or near zero inflate
# percentage-based errors - confirm before re-enabling them.
print(f"Normalized RMSE (NRMSE): {nrmse:.4f}")
print(f"90th Percentile Absolute Error (P90 AE): {p90_ae:.2f}")


In [None]:
# Inspect the distinct current_delay values in the test split and their dtype.
test_current_delay = X_test['current_delay']
print(test_current_delay.unique())
print(test_current_delay.dtype)

In [None]:
# Sanity checks on predictions and test features: preview the first ten
# targets/predictions, look for non-finite predictions, and compare value ranges.
print('Sample y_test:', y_test.iloc[:10].values)
print('Sample y_pred:', y_pred[:10])

has_inf = np.isinf(y_pred).any()
has_nan = np.isnan(y_pred).any()
print('Any inf in y_pred?', has_inf)
print('Any NaN in y_pred?', has_nan)

print('Max/min y_pred:', y_pred.max(), y_pred.min())
print('Max/min y_test:', y_test.max(), y_test.min())

# Heading and summary table are emitted on separate lines.
print('X_test describe:', X_test.describe(), sep='\n')

#### Coefficients and Feature Importance

In [None]:
# Label each fitted coefficient with its feature name and list them from the
# most positive to the most negative. The features are not standardized in
# this notebook, so magnitudes reflect each feature's units.
coefficients = pd.Series(data=model.coef_, index=X_train.columns)
ranked_coefficients = coefficients.sort_values(ascending=False)
print(ranked_coefficients)

#### Example predictions

In [None]:
# Example predictions: Glaspoort to Piazza and Evoluon to Piazza
def build_trip_features(current_stop_index, current_delay, target_stop_index, to_station=1):
    """Return a one-row feature frame for a single prediction.

    The frame has exactly the training columns (X_train.columns). The four trip
    features are set from the arguments; every other column is left at 0.
    """
    trip = pd.DataFrame(0, index=[0], columns=X_train.columns)
    trip['current_stop_index'] = current_stop_index
    trip['current_delay'] = current_delay
    trip['target_stop_index'] = target_stop_index
    trip['to_station'] = to_station
    return trip


# Glaspoort (stop index 13), currently at a delay of -60, heading to Piazza (index 15).
X_new = build_trip_features(current_stop_index=13, current_delay=-60, target_stop_index=15)
y_new_pred = model.predict(X_new)
print(f'Glaspoort to Piazza delay (LR): {y_new_pred[0]:.2f}')

# Evoluon (stop index 10), currently at a delay of 120, heading to Piazza (index 15).
X_evoluon = build_trip_features(current_stop_index=10, current_delay=120, target_stop_index=15)
y_evoluon_pred = model.predict(X_evoluon)
print(f'Evoluon to Piazza delay (LR): {y_evoluon_pred[0]:.2f}')

In [None]:
# Calculate and print predicted delays for all downstream stops starting from
# stop index 10 (Evoluon) up to and including the final stop.
start_stop = 10  # Evoluon
final_stop = 16  # Eindhoven central station
current_delay = 120  # Current delay at Evoluon

# Feature values held constant for every downstream prediction.
scenario = {
    'current_stop_index': start_stop,
    'current_delay': current_delay,
    'to_station': 1,
    'day_type': 0,  # monday
    'FR/windspeed (0.1 m/s)': 0,
    'TG/temperature (0.1 째C)': 200,  # 200 in the column's 0.1-degree units
    'RH/precipitation (0.1mm)': 0,
}

# Only features the model was trained on may be set: assigning a column that is
# not in X_train.columns would add an extra column and make model.predict fail
# on a feature mismatch (e.g. when the model was fit without weather columns).
ignored_features = [name for name in scenario if name not in X_train.columns]
if ignored_features:
    print(f"Ignoring scenario features the model was not trained on: {ignored_features}")

downstream_results = []
for target_stop in range(start_stop + 1, final_stop + 1):
    # One-row frame with the training columns, all defaulting to 0.
    X_downstream = pd.DataFrame(0, index=[0], columns=X_train.columns)
    X_downstream['target_stop_index'] = target_stop
    for name, value in scenario.items():
        if name in X_downstream.columns:
            X_downstream[name] = value
    y_pred_downstream = model.predict(X_downstream)
    downstream_results.append((target_stop, y_pred_downstream[0]))
    print(f"Predicted delay from stop {start_stop} to stop {target_stop} (LR): {y_pred_downstream[0]:.2f}")

# Summary table of all downstream predictions
df_downstream = pd.DataFrame(downstream_results, columns=["target_stop_index", "predicted_delay"])
display(df_downstream)