In [17]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import glob
import numpy as np

In [23]:
import os
import glob
import pandas as pd
import numpy as np
from scipy.stats import zscore

DATASET_DIR = "/home/ujx4ab/ondemand/dissecting_dist_inf/WF_Data/EDP/EDP_Model_Testing/EDP_filtered"
RESULTS = {}

for file_path in glob.glob(os.path.join(DATASET_DIR, "*.csv")):
    file_name = os.path.basename(file_path)
    df = pd.read_csv(file_path, index_col=0)  # Fix index column
    
    # Initialize results storage for this file
    file_results = {}
    
    # ---------------------
    # 1. Data Quality Checks
    # ---------------------
    file_results['missing_values'] = df.isnull().sum().to_dict()
    file_results['duplicate_rows'] = df.duplicated().sum()
    
    # ---------------------
    # 2. Temporal Consistency
    # ---------------------
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    time_diffs = df['Timestamp'].diff().dt.total_seconds() / 60  # Convert to minutes
    expected_interval = 10  # minutes
    file_results['incorrect_intervals'] = {
        'num_entries': len(time_diffs) - 1,
        'num_incorrect': (np.abs(time_diffs - expected_interval) > 1e-3).sum(),
        'max_interval': time_diffs.max(),
        'min_interval': time_diffs.min()
    }
    
    # ---------------------
    # 3. Statistical Summary
    # ---------------------
    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
    file_results['statistics'] = df[numeric_cols].describe().to_dict()
    
    # ---------------------
    # 4. Physical Plausibility
    # ---------------------
    range_checks = {
        'Power_negative': (df['Power'] < 0).sum(),
        'Wind_speed_negative': (df['Wind_speed'] < 0).sum(),
        'Gen_speed_negative': (df['Gen_speed'] < 0).sum(),
        'Temp_impossible': (
            (df['Amb_temp'] < -40) | 
            (df['Amb_temp'] > 60) |
            (df['Gear_bear_temp'] > 120)
        ).sum()
    }
    file_results['range_violations'] = range_checks
    
    # ---------------------
    # 5. Variable Relationships
    # ---------------------
    correlation_matrix = df[['Wind_speed', 'Power', 'Gen_speed', 'Rotor_speed']].corr()
    file_results['correlations'] = correlation_matrix.to_dict()
    
    # ---------------------
    # 6. Outlier Detection
    # ---------------------
    z_scores = df[numeric_cols].apply(zscore)
    file_results['outliers'] = {
        'zscore_3': (np.abs(z_scores) > 3).sum().to_dict(),
        'zscore_5': (np.abs(z_scores) > 5).sum().to_dict()
    }
    
    RESULTS[file_name] = file_results

# Optional: Save results for analysis
import json
with open("data_quality_report.json", "w") as f:
    json.dump(RESULTS, f, indent=2)

TypeError: Object of type int64 is not JSON serializable

In [16]:
def train_and_evaluate(df, file_path):
    """Train and evaluate a Random Forest model on given DataFrame."""
    if 'Power' not in df.columns:
        raise ValueError(f"'Power' column not found in dataset")
    
    X = df.drop(columns=['Power'])
    y = df['Power']
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    print(f"\nProcessing: {os.path.basename(file_path)}")
    print("Data shapes:")
    print(f"  X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"  X_test:  {X_test.shape}, y_test:  {y_test.shape}")
    
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)  # Changed line
    rmse = np.sqrt(mse)  # Changed line
    
    return {'MAE': mae, 'RMSE': rmse}

In [18]:
# Configuration
DATASET_DIR = "/home/ujx4ab/ondemand/dissecting_dist_inf/WF_Data/EDP/EDP_Model_Testing/EDP_filtered"
RESULTS = {}

# Process all CSV files in directory
for file_path in glob.glob(os.path.join(DATASET_DIR, "*.csv")):
    try:
        # Load and preprocess data
        df = pd.read_csv(file_path)
        
        # Remove unnecessary columns if present
        df.drop(
            columns=[c for c in ['Unnamed: 0', 'Timestamp'] if c in df.columns],
            inplace=True
        )
        
        # Train model and store results
        metrics = train_and_evaluate(df, file_path)
        RESULTS[os.path.basename(file_path)] = metrics
        print(f"  Metrics: MAE={metrics['MAE']:.4f}, RMSE={metrics['RMSE']:.4f}")
        
    except ValueError as e:
        print(f"  Error: {str(e)} - Skipping file")
    except Exception as e:
        print(f"  Unexpected error processing file: {str(e)}")

# Display final results
print("\nModel Performance Summary:")
for dataset, metrics in RESULTS.items():
    print(f"{dataset}:")
    print(f"  MAE:  {metrics['MAE']:.4f}")
    print(f"  RMSE: {metrics['RMSE']:.4f}\n")


Processing: WT_06.csv
Data shapes:
  X_train: (82336, 22), y_train: (82336,)
  X_test:  (20585, 22), y_test:  (20585,)
  Metrics: MAE=2330.4144, RMSE=4850.9446

Processing: WT_01.csv
Data shapes:
  X_train: (83746, 22), y_train: (83746,)
  X_test:  (20937, 22), y_test:  (20937,)
  Metrics: MAE=2398.7296, RMSE=4419.0471

Processing: WT_11.csv
Data shapes:
  X_train: (83838, 22), y_train: (83838,)
  X_test:  (20960, 22), y_test:  (20960,)
  Metrics: MAE=2925.2698, RMSE=4887.1763

Processing: WT_07.csv
Data shapes:
  X_train: (83791, 22), y_train: (83791,)
  X_test:  (20948, 22), y_test:  (20948,)
  Metrics: MAE=2499.3922, RMSE=4691.9689

Model Performance Summary:
WT_06.csv:
  MAE:  2330.4144
  RMSE: 4850.9446

WT_01.csv:
  MAE:  2398.7296
  RMSE: 4419.0471

WT_11.csv:
  MAE:  2925.2698
  RMSE: 4887.1763

WT_07.csv:
  MAE:  2499.3922
  RMSE: 4691.9689

