In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from pmdarima import auto_arima

In [2]:
# Load data
timeseries_data = pd.read_csv('/pfs/work9/workspace/scratch/ma_tofuchs-GraphWave-Seminar/Datasets/Mannheim/imputed_dataset_grin_full.csv', index_col=0)

In [3]:
# Compute split indices
num_samples = len(timeseries_data)
train_size = int(num_samples * 0.7)
val_size = int(num_samples * 0.1)

# Split the data
train_data = timeseries_data.iloc[:train_size]
val_data = timeseries_data.iloc[train_size: train_size + val_size]
test_data = timeseries_data.iloc[train_size + val_size:]

# Combine train_data and val_data
#train_data = pd.concat([train_data, val_data], ignore_index=False)

In [None]:
# Function to process each sensor's data
def process_sensor(sensor_id, sensor_series):
    try:
        print(f'Starting with sensor {sensor_id}')
        
        # Ensure it's a time series
        sensor_series = sensor_series.asfreq('h').interpolate()  # Ensure hourly frequency
        
        # Train-test split
        train, test = sensor_series[:len(train_data)], sensor_series[len(train_data):]

        # Apply auto_arima
        model = auto_arima(
            train, 
            start_p=1, start_q=1,
            max_p=3, max_q=3, 
            d=1,
            seasonal=True,
            m=24,
            start_P=0, start_Q=0, 
            max_P=3, max_Q=3, 
            D=1,
            trace=False,
            error_action='ignore', 
            suppress_warnings=True,
            stepwise=True
        )

        # Forecast using best model
        forecast = model.predict(n_periods=len(test))

        # Compute error metrics
        mae = mean_absolute_error(test, forecast)
        mape = np.mean(np.abs((test.values - forecast) / test.values)) * 100
        rmse = np.sqrt(mean_squared_error(test, forecast))

    except Exception as e:
        print(f"Error processing sensor {sensor_id}: {e}")
        return {'sensor_id': sensor_id, 'MAE': None, 'MAPE(%)': None, 'RMSE': None, 'Best_ARIMA_Order': None, 'Best_Seasonal_Order': None}
    
    # Return results
    return {
        'sensor_id': sensor_id,
        'MAE': mae,
        'MAPE(%)': mape,
        'RMSE': rmse,
        'Best_ARIMA_Order': model.order,
        'Best_Seasonal_Order': model.seasonal_order
    }

In [5]:
# Process all sensors
results = []
for sensor_id in timeseries_data.columns:
    sensor_series = timeseries_data[sensor_id]
    result = process_sensor(sensor_id, sensor_series)
    results.append(result)

# Convert results to DataFrame and save
results_df = pd.DataFrame(results)
results_df.to_csv('auto-arima_results.csv', index=False)
print("Processing complete. Results saved to auto-arima_results.csv")

Starting with sensor  21649702                
Error processing sensor  21649702                : Input contains NaN.
Starting with sensor  21649767                
Error processing sensor  21649767                : Input contains NaN.
Starting with sensor  21673581               
Error processing sensor  21673581               : Input contains NaN.
Starting with sensor  21732938                
Error processing sensor  21732938                : Input contains NaN.
Starting with sensor  24555545                
Error processing sensor  24555545                : Input contains NaN.
Starting with sensor  25117313                
Error processing sensor  25117313                : Input contains NaN.
Starting with sensor  25117359                
Error processing sensor  25117359                : Input contains NaN.
Starting with sensor  27088492                
Error processing sensor  27088492                : Input contains NaN.
Starting with sensor  27428132               
Error proces



Error processing sensor  268591025                : Input contains NaN.
Starting with sensor  282260231                
Error processing sensor  282260231                : Input contains NaN.
Starting with sensor  382656460                
Error processing sensor  382656460                : Input contains NaN.
Starting with sensor  883463116                
Error processing sensor  883463116                : Input contains NaN.
Starting with sensor  988747917                
Error processing sensor  988747917                : Input contains NaN.
Starting with sensor  1176328666                
Error processing sensor  1176328666                : Input contains NaN.
Starting with sensor  1184512491                
Error processing sensor  1184512491                : Input contains NaN.
Starting with sensor  1633105688                
Error processing sensor  1633105688                : Input contains NaN.
Starting with sensor  1652156206                
Error processing sensor  16521562



In [6]:
# Check if any NaN values exist
print(test_data.isna().any().any())  # Returns True if there are NaNs

False
