In [None]:
# Complete Improved Code for Time Series Forecasting Models
# Fixed all issues and optimized for CPU Usage prediction

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
from datetime import datetime

# ML and Time Series Libraries
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor

# Deep Learning Libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Suppress every warning for the rest of the session so the printed report
# stays compact. NOTE(review): this also hides warnings raised by the
# modelling libraries, which may be worth seeing while debugging.
warnings.filterwarnings('ignore')
# Reset matplotlib to its built-in default style for all figures drawn below.
plt.style.use('default')

# Banner with a start timestamp, printed once when this module/cell is executed.
print("=== Azure Cloud Resource Forecasting Analysis ===")
print(f"Analysis started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*60)

# ==========================================
# UTILITY FUNCTIONS
# ==========================================

def evaluate_metrics(y_true, y_pred, model_name="Model"):
    """Compute and print RMSE, MAE and MAPE for a set of predictions.

    Both inputs are flattened to 1-D float arrays and compared position by
    position. This keeps all three metrics consistent when the inputs are
    pandas objects with different indexes, or a column vector paired with a
    flat array (both of which would otherwise be aligned/broadcast by the
    element-wise arithmetic used for MAPE).

    Args:
        y_true: Observed values (list, NumPy array or pandas Series).
        y_pred: Predicted values with the same number of elements.
        model_name: Heading used for the printed report.

    Returns:
        Tuple ``(rmse, mae, mape)`` of floats; ``mape`` is in percent.

    Raises:
        ValueError: If the inputs are empty, differ in length, or contain
            NaN / infinite values.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.size == 0:
        raise ValueError("evaluate_metrics requires at least one observation")
    if actual.size != predicted.size:
        raise ValueError(
            f"y_true and y_pred differ in length: {actual.size} vs {predicted.size}"
        )
    if not (np.isfinite(actual).all() and np.isfinite(predicted).all()):
        raise ValueError("y_true and y_pred must not contain NaN or infinite values")

    errors = actual - predicted
    rmse = float(np.sqrt(np.mean(errors ** 2)))
    mae = float(np.mean(np.abs(errors)))

    # Where the observed value is 0 the relative error is undefined; a
    # denominator of 1 is substituted so those points contribute their
    # absolute error instead of producing a division by zero.
    denominators = np.where(actual != 0, actual, 1)
    mape = float(np.mean(np.abs(errors / denominators)) * 100)

    print(f"{model_name} Metrics:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE:  {mae:.4f}")
    print(f"  MAPE: {mape:.2f}%")

    return rmse, mae, mape

def plot_predictions_with_dates(dates, y_true, y_pred, title, save_path=None):
    """Draw actual versus predicted CPU usage on a date axis.

    Args:
        dates: X-axis values; converted with ``pd.to_datetime`` unless they
            already are a ``pd.DatetimeIndex``.
        y_true: Observed values, drawn as a solid line.
        y_pred: Predicted values, drawn as a dashed line.
        title: Figure title.
        save_path: Optional file path; when truthy the figure is also saved
            there at 300 dpi before being shown.
    """
    plt.figure(figsize=(14, 7))

    # Coerce anything that is not already a DatetimeIndex
    x_values = dates if isinstance(dates, pd.DatetimeIndex) else pd.to_datetime(dates)

    actual_style = dict(label='Actual CPU Usage', color='#2E86AB', linewidth=2.5, alpha=0.8)
    predicted_style = dict(
        label='Predicted CPU Usage', color='#F24236', linestyle='--', linewidth=2.5, alpha=0.9
    )
    plt.plot(x_values, y_true, **actual_style)
    plt.plot(x_values, y_pred, **predicted_style)

    # Titles, axis labels, legend and grid
    plt.title(title, fontsize=16, fontweight='bold', pad=20)
    for set_axis_label, text in ((plt.xlabel, 'Date'), (plt.ylabel, 'CPU Usage (%)')):
        set_axis_label(text, fontsize=14, fontweight='bold')
    plt.legend(fontsize=12, loc='best')
    plt.grid(True, alpha=0.3, linestyle=':', linewidth=0.8)
    plt.xticks(rotation=45, fontsize=11)
    plt.yticks(fontsize=11)

    # Hide the top and right frame lines for a cleaner look
    for side in ('top', 'right'):
        plt.gca().spines[side].set_visible(False)
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"  Plot saved: {save_path}")

    plt.show()

def create_directory(path):
    """Make sure ``path`` exists as a directory and hand the same path back.

    Missing parent directories are created too; a directory that is already
    present is left untouched.
    """
    os.makedirs(path, exist_ok=True)
    return path

# ==========================================
# DATA PREPROCESSING
# ==========================================

def _ratio_to_group_max(df, value_col, group_col='resource_type'):
    """Return ``value_col`` divided by its per-group maximum (0 where that maximum is 0)."""
    group_max = df.groupby(group_col)[value_col].transform('max')
    # Divide by NaN instead of 0 so no inf/NaN leaks out of a 0-max group,
    # then set those rows explicitly to 0. Rows whose own value is missing
    # stay NaN, exactly as a plain division would leave them.
    ratio = df[value_col] / group_max.where(group_max != 0)
    return ratio.mask(group_max == 0, 0.0)


def load_and_prepare_data(file_path):
    """Load the usage CSV and add derived, calendar and one-hot features.

    The file must provide the columns ``date``, ``region``,
    ``resource_type``, ``usage_cpu`` and ``usage_storage``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        Tuple ``(df_encoded, df)``: ``df`` is the date-sorted frame with the
        engineered columns, ``df_encoded`` is the same frame with ``region``
        and ``resource_type`` replaced by one-hot indicator columns.
    """
    print("1. Loading and preparing data...")

    # Load the data
    df = pd.read_csv(file_path)
    print(f"   Dataset shape: {df.shape}")

    # Convert date and sort. A stable sort keeps rows that share a date in
    # their file order, so the later positional train/test splits are
    # reproducible from run to run.
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date', kind='stable').reset_index(drop=True)

    # Add derived features for CPU and Storage efficiency: each value
    # relative to the largest value seen for the same resource type.
    print("   Adding derived features...")
    df['cpu_utilization'] = _ratio_to_group_max(df, 'usage_cpu')
    df['storage_efficiency'] = _ratio_to_group_max(df, 'usage_storage')

    # Add time-based features
    date_parts = df['date'].dt
    df['year'] = date_parts.year
    df['month'] = date_parts.month
    df['day'] = date_parts.day
    df['day_of_week'] = date_parts.dayofweek
    df['quarter'] = date_parts.quarter
    df['week_of_year'] = date_parts.isocalendar().week
    df['is_month_start'] = date_parts.is_month_start.astype(int)
    df['is_month_end'] = date_parts.is_month_end.astype(int)
    # dayofweek is 0 (Monday) .. 6 (Sunday), so 5 and 6 are the weekend
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    # One-hot encode categorical features
    df_encoded = pd.get_dummies(df, columns=['region', 'resource_type'], prefix=['region', 'resource_type'])

    print(f"   Final dataset shape: {df_encoded.shape}")
    return df_encoded, df

# ==========================================
# ARIMA MODEL
# ==========================================

def prepare_arima_data(df):
    """Collapse the dataset into a single daily series for ARIMA.

    CPU usage is averaged over all rows sharing a date, giving one value per
    day in a one-column frame (``usage_cpu``) indexed by ``date``.
    """
    print("\n2. Preparing ARIMA dataset...")

    # Grouping by date leaves 'date' as the index of the averaged frame
    daily_mean = df.groupby('date')[['usage_cpu']].mean()

    print(f"   ARIMA dataset shape: {daily_mean.shape}")
    return daily_mean

def train_arima_model(arima_df, test_size=0.2):
    """Fit an ARIMA(2,1,2) model on the leading rows and score it on the rest.

    Args:
        arima_df: Frame with a ``usage_cpu`` column, ordered in time.
        test_size: Fraction of trailing rows held out for evaluation.

    Returns:
        ``(fitted_model, forecast, test_frame, (rmse, mae, mape))``. If
        fitting, forecasting, scoring or plotting raises, the error is
        printed and ``(None, None, None, (None, None, None))`` is returned.
    """
    print("\n3. Training ARIMA Model...")

    # Chronological split: leading rows train, trailing rows test
    split_at = int(len(arima_df) * (1 - test_size))
    train_part, test_part = arima_df[:split_at], arima_df[split_at:]

    print(f"   Train size: {len(train_part)}, Test size: {len(test_part)}")

    try:
        fitted = sm.tsa.ARIMA(train_part['usage_cpu'], order=(2,1,2)).fit()

        # Forecast one step per held-out row and relabel it with the test index
        predicted = fitted.forecast(steps=len(test_part))
        predicted.index = test_part.index

        rmse, mae, mape = evaluate_metrics(test_part['usage_cpu'], predicted, "ARIMA")

        plot_predictions_with_dates(
            test_part.index,
            test_part['usage_cpu'],
            predicted,
            'ARIMA Model: Actual vs Predicted CPU Usage'
        )
    except Exception as e:
        print(f"   ARIMA model failed: {e!s}")
        return None, None, None, (None, None, None)

    return fitted, predicted, test_part, (rmse, mae, mape)

# ==========================================
# XGBOOST MODEL
# ==========================================

def train_xgboost_model(df_encoded, test_size=0.2):
    """Train an XGBoost regressor on a chronological split and evaluate it.

    Args:
        df_encoded: Fully numeric feature frame that also contains the
            ``date`` and ``usage_cpu`` columns, ordered in time.
        test_size: Fraction of trailing rows held out for evaluation.

    Returns:
        ``(model, predictions, y_test, test_dates, (rmse, mae, mape),
        feature_importance)`` where ``feature_importance`` is a DataFrame
        with ``feature`` and ``importance`` columns, most important first.
    """
    print("\n4. Training XGBoost Model...")

    # Prepare features and target.
    # 'cpu_utilization' is usage_cpu divided by a per-resource-type constant,
    # i.e. a rescaled copy of the target taken from the same row. Keeping it
    # as a feature would leak the answer into the model, so it is excluded
    # together with the target itself and the raw date.
    excluded_cols = {'date', 'usage_cpu', 'cpu_utilization'}
    feature_cols = [col for col in df_encoded.columns if col not in excluded_cols]

    X = df_encoded[feature_cols]
    y = df_encoded['usage_cpu']
    dates = df_encoded['date']

    # Split data (temporal split): leading rows train, trailing rows test
    train_size = int(len(df_encoded) * (1 - test_size))
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]
    test_dates = dates[train_size:]

    print(f"   Train size: {len(X_train)}, Test size: {len(X_test)}")
    print(f"   Number of features: {len(feature_cols)}")

    # Train XGBoost model
    model_xgb = XGBRegressor(
        objective='reg:squarederror',
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )

    model_xgb.fit(X_train, y_train)
    preds_xgb = model_xgb.predict(X_test)

    # Evaluate
    rmse, mae, mape = evaluate_metrics(y_test, preds_xgb, "XGBoost")

    # Feature importance, most influential first
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': model_xgb.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\n   Top 10 Most Important Features:")
    for row in feature_importance.head(10).itertuples(index=False):
        print(f"     {row.feature}: {row.importance:.4f}")

    # Plot (indexes are reset so dates, actuals and predictions line up by position)
    plot_predictions_with_dates(
        test_dates.reset_index(drop=True),
        y_test.reset_index(drop=True),
        preds_xgb,
        'XGBoost Model: Actual vs Predicted CPU Usage'
    )

    return model_xgb, preds_xgb, y_test, test_dates, (rmse, mae, mape), feature_importance

# ==========================================
# LSTM MODEL
# ==========================================

def create_lstm_sequences(data, feature_cols, target_col, dates, seq_length=7):
    """Build sliding-window samples for a sequence model.

    Sample ``i`` consists of the feature rows ``i .. i + seq_length - 1``;
    its target and its date are taken from row ``i + seq_length``.

    Args:
        data: DataFrame holding the feature and target columns.
        feature_cols: Names of the feature columns.
        target_col: Name of the target column.
        dates: pandas Series with one entry per row of ``data``.
        seq_length: Number of consecutive rows in each input window.

    Returns:
        ``(X, y, sequence_dates)`` where ``X`` has shape
        ``(n_samples, seq_length, n_features)``, ``y`` has shape
        ``(n_samples,)`` and ``sequence_dates`` is a list of length
        ``n_samples``. When ``data`` has too few rows for a single window,
        ``X`` is an empty array that still carries the 3-D shape.

    Raises:
        IndexError: If ``dates`` has fewer entries than ``data`` has rows.
    """
    n_rows = len(data)
    n_samples = n_rows - seq_length

    if n_samples <= 0:
        return np.empty((0, seq_length, len(feature_cols))), np.empty((0,)), []
    if len(dates) < n_rows:
        raise IndexError("dates must provide one entry per row of data")

    # Convert to NumPy once; selecting DataFrame columns inside the loop
    # would repeat the same work for every window.
    feature_matrix = data[feature_cols].to_numpy()
    target_values = data[target_col].to_numpy()

    X = np.stack([feature_matrix[start:start + seq_length] for start in range(n_samples)])
    y = target_values[seq_length:n_rows].copy()
    sequence_dates = list(dates.iloc[seq_length:n_rows])

    return X, y, sequence_dates

def train_lstm_model(df_encoded, test_size=0.2, seq_length=7):
    """Train a two-layer LSTM on sliding windows and evaluate it.

    Features and target are min-max scaled with scalers fitted on the
    training rows only, so no statistics of the held-out period influence
    training. Metrics and plots are reported in the original target units.

    Args:
        df_encoded: Fully numeric feature frame that also contains the
            ``date`` and ``usage_cpu`` columns, ordered in time.
        test_size: Fraction of trailing sequences held out for evaluation.
        seq_length: Number of consecutive rows fed to the model per sample.

    Returns:
        ``(model, predictions, y_test, test_dates, (rmse, mae, mape),
        history)`` with predictions and ``y_test`` in original units.

    Raises:
        ValueError: If the data is too short to yield at least one training
            and one test sequence.
    """
    print("\n5. Training LSTM Model...")

    # Prepare features
    feature_cols = [col for col in df_encoded.columns if col not in ['date', 'usage_cpu']]

    # Work out the chronological split first so the scalers can be fitted on
    # the training period alone.
    n_sequences = max(len(df_encoded) - seq_length, 0)
    train_size = int(n_sequences * (1 - test_size))
    if train_size < 1 or train_size >= n_sequences:
        raise ValueError(
            f"Not enough data for an LSTM train/test split: {len(df_encoded)} rows, "
            f"seq_length={seq_length}, test_size={test_size}"
        )

    # The last training sequence starts at row train_size - 1 and takes its
    # target from row train_size - 1 + seq_length, so exactly the first
    # train_size + seq_length rows are visible during training.
    n_fit_rows = train_size + seq_length

    # Scale the features and the target with separate scalers; the target
    # scaler can then invert predictions directly.
    feature_scaler = MinMaxScaler()
    target_scaler = MinMaxScaler()
    feature_scaler.fit(df_encoded[feature_cols].iloc[:n_fit_rows])
    target_scaler.fit(df_encoded[['usage_cpu']].iloc[:n_fit_rows])

    df_scaled = df_encoded.copy()
    df_scaled[feature_cols] = feature_scaler.transform(df_encoded[feature_cols])
    df_scaled['usage_cpu'] = target_scaler.transform(df_encoded[['usage_cpu']]).ravel()

    # Create sequences
    X_lstm, y_lstm, sequence_dates = create_lstm_sequences(
        df_scaled, feature_cols, 'usage_cpu', df_encoded['date'], seq_length
    )

    print(f"   Sequence shape: {X_lstm.shape}")
    print(f"   Number of sequences: {len(X_lstm)}")

    # Split data
    X_train_lstm = X_lstm[:train_size]
    X_test_lstm = X_lstm[train_size:]
    y_train_lstm = y_lstm[:train_size]
    y_test_lstm = y_lstm[train_size:]
    test_dates_lstm = sequence_dates[train_size:]

    print(f"   Train sequences: {len(X_train_lstm)}, Test sequences: {len(X_test_lstm)}")

    # Build LSTM model
    model_lstm = Sequential([
        LSTM(64, activation='relu', return_sequences=True, input_shape=(seq_length, len(feature_cols))),
        Dropout(0.2),
        LSTM(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])

    model_lstm.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

    # Train model; stop once validation loss has not improved for 10 epochs
    # and roll back to the best weights seen.
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    history = model_lstm.fit(
        X_train_lstm, y_train_lstm,
        validation_split=0.2,
        epochs=100,
        batch_size=32,
        callbacks=[early_stop],
        verbose=0
    )

    # Predict
    preds_lstm_scaled = model_lstm.predict(X_test_lstm, verbose=0).flatten()

    # Inverse scale predictions and actual values back to original units
    y_test_original = target_scaler.inverse_transform(
        np.asarray(y_test_lstm, dtype=float).reshape(-1, 1)
    ).ravel()
    preds_lstm_original = target_scaler.inverse_transform(
        preds_lstm_scaled.reshape(-1, 1)
    ).ravel()

    # Evaluate
    rmse, mae, mape = evaluate_metrics(y_test_original, preds_lstm_original, "LSTM")

    # Plot
    plot_predictions_with_dates(
        test_dates_lstm,
        y_test_original,
        preds_lstm_original,
        'LSTM Model: Actual vs Predicted CPU Usage'
    )

    return model_lstm, preds_lstm_original, y_test_original, test_dates_lstm, (rmse, mae, mape), history

# ==========================================
# MODEL COMPARISON
# ==========================================

def compare_models(arima_metrics, xgb_metrics, lstm_metrics):
    """Print a side-by-side metric table and the best model per metric.

    Args:
        arima_metrics: ``(rmse, mae, mape)`` for ARIMA; entries may be None
            when the model could not be trained.
        xgb_metrics: ``(rmse, mae, mape)`` for XGBoost.
        lstm_metrics: ``(rmse, mae, mape)`` for the LSTM.

    Returns:
        DataFrame with columns ``Model``, ``RMSE``, ``MAE`` and ``MAPE``;
        unavailable metrics are stored as NaN.
    """
    print("\n" + "="*60)
    print("MODEL COMPARISON SUMMARY")
    print("="*60)

    models = ['ARIMA', 'XGBoost', 'LSTM']
    metrics = [arima_metrics, xgb_metrics, lstm_metrics]
    metric_names = ['RMSE', 'MAE', 'MAPE']

    table = {'Model': models}
    for position, metric_name in enumerate(metric_names):
        table[metric_name] = [
            m[position] if m[position] is not None else np.nan for m in metrics
        ]
    comparison_df = pd.DataFrame(table)

    print(comparison_df.to_string(index=False, float_format='{:.4f}'.format))

    # Find best model for each metric. Every column is checked on its own:
    # idxmin() has nothing to return for an all-NaN column, so such a
    # metric is reported as unavailable instead of being looked up.
    has_values = {name: comparison_df[name].notna().any() for name in metric_names}
    if any(has_values.values()):
        print(f"\nBest Models:")
        for metric_name in metric_names:
            if has_values[metric_name]:
                best = comparison_df.loc[comparison_df[metric_name].idxmin(), 'Model']
            else:
                best = 'n/a'
            label = metric_name + ':'
            print(f"  {label:<5} {best}")

    return comparison_df

# ==========================================
# MAIN EXECUTION
# ==========================================

def main(data_file=None):
    """Run the complete forecasting pipeline.

    Loads the data, trains the ARIMA, XGBoost and LSTM models in turn,
    prints a comparison of their metrics and returns the fitted objects.

    Args:
        data_file: Path of the cleaned, merged CSV file. When omitted, the
            project's default location below is used.

    Returns:
        dict with the keys ``'arima'``, ``'xgboost'``, ``'lstm'`` and
        ``'comparison'``, or ``None`` when the data file is missing or any
        pipeline step raises.
    """

    # File paths - Update this default according to your directory structure,
    # or pass the location explicitly via the data_file argument.
    if data_file is None:
        data_file = 'D:/infosysspringboard projects/project1-1stmilestine/AZURE_BACKEND_TEAM-B/data/processed/cleaned_merged.csv'  # Update this path

    # Check if file exists
    if not os.path.exists(data_file):
        print(f"Error: Data file '{data_file}' not found!")
        print("Please update the 'data_file' variable with the correct path.")
        return

    try:
        # Load and prepare data
        df_encoded, df_original = load_and_prepare_data(data_file)

        # Train ARIMA model (uses the un-encoded frame, aggregated per date)
        arima_df = prepare_arima_data(df_original)
        arima_model, arima_forecast, arima_test, arima_metrics = train_arima_model(arima_df)

        # Train XGBoost model
        xgb_model, xgb_preds, xgb_test, xgb_dates, xgb_metrics, xgb_importance = train_xgboost_model(df_encoded)

        # Train LSTM model
        lstm_model, lstm_preds, lstm_test, lstm_dates, lstm_metrics, lstm_history = train_lstm_model(df_encoded)

        # Compare models
        comparison_df = compare_models(arima_metrics, xgb_metrics, lstm_metrics)

        print("\n" + "="*60)
        print("ANALYSIS COMPLETED SUCCESSFULLY!")
        print(f"Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print("="*60)

        # Return results for further use
        return {
            'arima': {'model': arima_model, 'metrics': arima_metrics},
            'xgboost': {'model': xgb_model, 'metrics': xgb_metrics, 'feature_importance': xgb_importance},
            'lstm': {'model': lstm_model, 'metrics': lstm_metrics, 'history': lstm_history},
            'comparison': comparison_df
        }

    except Exception as e:
        # Top-level boundary: report the failure and signal it with None
        # rather than letting the exception escape the script.
        print(f"\nError during execution: {str(e)}")
        print("Please check your data file path and ensure all required libraries are installed.")
        return None

# ==========================================
# USAGE INSTRUCTIONS
# ==========================================

"""
USAGE INSTRUCTIONS:
1. Update the 'data_file' path in the main() function
2. Ensure you have the following libraries installed:
   - pandas, numpy, matplotlib, seaborn
   - scikit-learn, xgboost
   - statsmodels
   - tensorflow
3. Run the script: results = main()

The script will:
- Load and preprocess your data
- Train ARIMA, XGBoost, and LSTM models
- Evaluate each model with RMSE, MAE, MAPE
- Create visualizations comparing actual vs predicted values
- Provide a summary comparison of all models

For Jupyter Notebook usage:
- Copy individual functions as needed
- Run main() to execute the complete analysis
"""

# Run the full pipeline only when this file is executed directly (this is
# also the case for a notebook cell, where __name__ is "__main__"); the
# returned summary is kept in `results` for further inspection.
if __name__ == "__main__":
    results = main()

=== Azure Cloud Resource Forecasting Analysis ===
Analysis started at: 2025-09-22 14:32:57
Error: Data file 'cleaned_merged.csv' not found!
Please update the 'data_file' variable with the correct path.
