In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib
import matplotlib
import matplotlib.pyplot as plt
# matplotlib.use('Agg')
%matplotlib inline
import os
import sys
from tqdm import tqdm

In [8]:
import os
# Locate the repository root three directory levels above the current working
# directory so that the `backend` package can be imported from this notebook.
# NOTE(review): this assumes the kernel's working directory is the notebook's
# own folder -- confirm when launching Jupyter from somewhere else.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '../../..'))

# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
from backend.ml.data.data_preprocessing import WeatherDataPreprocessor


In [9]:
# Input dataset and output model artifact both live under <project_root>/backend/ml.
_ml_root = os.path.join(project_root, 'backend', 'ml')
dataset_path = os.path.join(_ml_root, 'data', 'weather_dataset.csv')
models_path = os.path.join(_ml_root, 'models', 'weather_models.joblib')

In [10]:
class WeatherPredictor:
    """Train, evaluate, persist and report one random-forest model per weather target.

    Workflow: ``prepare_data`` -> ``train`` -> ``save_models`` (and optionally
    ``generate_summary_report``). ``load_models`` restores a saved artifact.

    Attributes populated by ``train``:
        models: fitted ``RandomForestRegressor`` keyed by target name.
        scalers: fitted ``StandardScaler`` keyed by target name.
        metrics: dict of rmse / r2 / mse / mae / mape keyed by target name.
        feature_importances: ``{feature: importance}`` dict keyed by target name.
        feature_list_for_scale: the feature columns the scalers were fit on.
    """

    # Columns that are predicted; they are removed from the feature matrix.
    TARGETS = [
        'temperature', 'humidity', 'wind_speed', 'pressure',
        'precipitation', 'cloud', 'uv_index', 'visibility',
        'rain_probability', 'dewpoint', 'gust_speed', 'snow_probability',
        'condition_code', 'wind_direction',
    ]

    # Column header shared by the console table and the text report.
    _METRICS_HEADER = f"{'Target':<15} | {'RMSE':^12} | {'MAE':^12} | {'R²':^12} | {'MAPE (%)':^12}"

    def __init__(self):
        # NOTE(review): train() builds a fresh estimator per target (with
        # n_estimators=500) and never reads self.model; it is kept only so the
        # attribute remains available to any external code that uses it.
        self.model = RandomForestRegressor(
            n_estimators=100,
            max_depth=20,
            random_state=42,
            n_jobs=-1
        )
        self.preprocessor = WeatherDataPreprocessor()
        self.feature_list_for_scale = None

        # Initialise the state that train()/load_models() fill in, so the
        # reporting helpers work (on empty data) before training.
        self.models = {}
        self.metrics = {}
        self.scalers = {}
        self.feature_importances = {}

    @staticmethod
    def _default_models_path():
        """Return the default location of the saved model artifact."""
        return os.path.join(project_root, 'backend', 'ml', 'models', 'weather_models.joblib')

    def prepare_data(self, data_path):
        """Load and preprocess the dataset, then split it into features and targets.

        Args:
            data_path: path handed to ``self.preprocessor.preprocess``.

        Returns:
            A tuple ``(X, y_dict, feature_list_for_scale)`` where ``X`` is the
            preprocessed frame without the target columns, ``y_dict`` maps each
            target name to its column, and ``feature_list_for_scale`` lists the
            feature column names of ``X``.
        """
        df = self.preprocessor.preprocess(data_path)

        targets = list(self.TARGETS)

        X = df.drop(targets, axis=1)
        y_dict = {target: df[target] for target in targets}

        # Build the list of feature columns after the targets have been split off.
        feature_list_for_scale = [col for col in X.columns if col not in targets]

        return X, y_dict, feature_list_for_scale

    def train(self, X, y_dict, feature_list_for_scale):
        """Fit one scaler and one random forest per target and collect metrics.

        Args:
            X: feature frame containing at least ``feature_list_for_scale``.
            y_dict: mapping of target name to the target column.
            feature_list_for_scale: feature columns to scale and train on.

        Returns:
            ``self.metrics``: per-target dict with ``rmse``, ``r2``, ``mse``,
            ``mae`` and ``mape`` (a string placeholder when the test targets
            contain zeros).
        """
        self.models = {}
        self.metrics = {}
        self.scalers = {}
        self.feature_importances = {}
        # Remember the columns actually used so that save_models() persists the
        # real feature list instead of its hard-coded fallback.
        self.feature_list_for_scale = list(feature_list_for_scale)

        print("Training models...")
        for target_name, y in tqdm(y_dict.items()):
            print(f"Training model for {target_name}...")
            # NOTE(review): this is a random split; if the features include
            # lag/rolling statistics of a time series, a chronological split
            # may give a more honest estimate -- confirm against the preprocessor.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )

            # Fit the scaler on the training rows only, then apply it to both splits.
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train[feature_list_for_scale])
            X_test_scaled = scaler.transform(X_test[feature_list_for_scale])

            self.scalers[target_name] = scaler

            model = RandomForestRegressor(
                n_estimators=500,
                max_depth=20,
                random_state=42,
                n_jobs=-1
            )
            model.fit(X_train_scaled, y_train)

            y_pred = model.predict(X_test_scaled)

            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            r2 = r2_score(y_test, y_pred)
            mae = np.mean(np.abs(y_test - y_pred))
            # MAPE divides by the true value, so it is undefined when any true value is 0.
            mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100 if not np.any(y_test == 0) else np.nan

            self.feature_importances[target_name] = dict(zip(feature_list_for_scale, model.feature_importances_))

            self.models[target_name] = model
            self.metrics[target_name] = {
                'rmse': rmse,
                'r2': r2,
                'mse': mse,
                'mae': mae,
                'mape': mape if not np.isnan(mape) else "N/A (contains zero values)"
            }

            self.plot_predictions(y_test, y_pred, target_name)

        print("Training complete.")

        self.print_evaluation_metrics()

        return self.metrics

    @staticmethod
    def _values_for_target(values, target, single_target):
        """Pick the values belonging to ``target``.

        For a single target given as a plain array/Series the values are used
        as-is; dicts and DataFrames (and every multi-target call) are indexed by
        the target name.
        """
        if single_target and not isinstance(values, (dict, pd.DataFrame)):
            return values
        return values[target]

    def plot_predictions(self, y_true, y_pred, target_name):
        """Scatter-plot actual vs predicted values, save the figure and show it.

        Args:
            y_true: actual values. Either one array-like (single target) or a
                dict/DataFrame keyed by target name.
            y_pred: predicted values, in the same layout as ``y_true``.
            target_name: a single target name or an iterable of target names.

        The figure is written below
        ``<project_root>/backend/ml/models/rating_chart1`` as
        ``predictions_<target>.png`` for a single target, or
        ``predictions_all_targets.png`` when several targets are plotted.
        """
        single_target = isinstance(target_name, str)
        target_names = [target_name] if single_target else list(target_name)

        num_plots = len(target_names)
        if num_plots == 0:
            # Nothing to draw; plt.subplots cannot build a zero-row grid.
            return

        # At most three plots per row.
        num_cols = min(3, num_plots)
        num_rows = (num_plots + num_cols - 1) // num_cols

        # squeeze=False always yields a 2-D axes array, even for a 1x1 grid.
        fig, axes = plt.subplots(
            num_rows, num_cols, figsize=(5 * num_cols, 5 * num_rows), squeeze=False
        )
        axes = axes.flatten()

        for ax, target in zip(axes, target_names):
            actual = np.asarray(self._values_for_target(y_true, target, single_target))
            predicted = np.asarray(self._values_for_target(y_pred, target, single_target))

            ax.scatter(actual, predicted, alpha=0.5, c='blue', s=10, label='Predicted vs Actual')
            # Diagonal reference line: points on it are perfect predictions.
            bounds = [actual.min(), actual.max()]
            ax.plot(bounds, bounds, 'r--', lw=2, label='Ideal Fit')
            ax.set_xlabel('Actual Values')
            ax.set_ylabel('Predicted Values')
            ax.set_title(f'Actual vs Predicted {target}')
            ax.legend()
            ax.grid(True)

        # Remove the unused axes in the last row.
        for ax in axes[num_plots:]:
            fig.delaxes(ax)

        save_dir = os.path.join(project_root, 'backend/ml/models/rating_chart1')
        os.makedirs(save_dir, exist_ok=True)
        # One file per target, so successive calls from train() do not
        # overwrite each other's chart.
        file_name = f'predictions_{target_name}.png' if single_target else 'predictions_all_targets.png'
        fig.tight_layout()
        fig.savefig(os.path.join(save_dir, file_name))
        plt.show()
        # Close this specific figure to release its memory.
        plt.close(fig)

    def save_models(self, path=None):
        """Persist the trained models, scalers and feature list with joblib.

        Args:
            path: destination file; defaults to
                ``<project_root>/backend/ml/models/weather_models.joblib``.

        Raises:
            RuntimeError: if there are no trained models to save.
        """
        if not getattr(self, 'models', None):
            # Refuse to overwrite an existing artifact with an empty one.
            raise RuntimeError("No trained models to save; call train() or load_models() first.")

        if path is None:
            path = self._default_models_path()

        # Ensure the directory exists (dirname is empty for a bare file name).
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        if self.feature_list_for_scale is None:
            # Fallback used only when no feature list was recorded on the instance.
            self.feature_list_for_scale = [
                'hour', 'day', 'month', 'day_of_week', 'day_of_year',
                'temperature_lag_1', 'temperature_lag_2', 'temperature_lag_3',
                'temperature_rolling_mean_3', 'temperature_rolling_mean_6', 'temperature_rolling_mean_12',
                'humidity_lag_1', 'humidity_lag_2', 'humidity_lag_3',
                'humidity_rolling_mean_3', 'humidity_rolling_mean_6', 'humidity_rolling_mean_12',
                'wind_speed_lag_1', 'wind_speed_lag_2', 'wind_speed_lag_3',
                'wind_speed_rolling_mean_3', 'wind_speed_rolling_mean_6', 'wind_speed_rolling_mean_12',
                'pressure_lag_1', 'pressure_lag_2', 'pressure_lag_3',
                'pressure_rolling_mean_3', 'pressure_rolling_mean_6', 'pressure_rolling_mean_12',
                'precipitation_lag_1', 'precipitation_lag_2', 'precipitation_lag_3',
                'precipitation_rolling_mean_3', 'precipitation_rolling_mean_6', 'precipitation_rolling_mean_12',
                'cloud_lag_1', 'cloud_lag_2', 'cloud_lag_3',
                'cloud_rolling_mean_3', 'cloud_rolling_mean_6', 'cloud_rolling_mean_12',
                'uv_index_lag_1', 'uv_index_lag_2', 'uv_index_lag_3',
                'uv_index_rolling_mean_3', 'uv_index_rolling_mean_6', 'uv_index_rolling_mean_12',
                'visibility_lag_1', 'visibility_lag_2', 'visibility_lag_3',
                'visibility_rolling_mean_3', 'visibility_rolling_mean_6', 'visibility_rolling_mean_12',
                'rain_probability_lag_1', 'rain_probability_lag_2', 'rain_probability_lag_3',
                'rain_probability_rolling_mean_3', 'rain_probability_rolling_mean_6', 'rain_probability_rolling_mean_12',
                'dewpoint_lag_1', 'dewpoint_lag_2', 'dewpoint_lag_3',
                'dewpoint_rolling_mean_3', 'dewpoint_rolling_mean_6', 'dewpoint_rolling_mean_12',
                'airport_code_encoded'
            ]
            print("Updated 'feature_list_for_scale' before saving the model.")

        # Save the models together with everything needed to reuse them.
        joblib.dump({
            'models': self.models,
            'scalers': self.scalers,
            'feature_list_for_scale': self.feature_list_for_scale
        }, path, compress=4)

        print(f"Models and feature list saved successfully to {path}.")

    def load_models(self, path=None):
        """Load models, scalers and the feature list written by ``save_models``.

        Args:
            path: artifact to read; defaults to the same location
                ``save_models`` writes to.
        """
        if path is None:
            path = self._default_models_path()
        # SECURITY: joblib.load unpickles the file, which can execute arbitrary
        # code -- only load artifacts from a trusted source.
        data = joblib.load(path)
        self.models = data['models']
        self.scalers = data['scalers']
        self.feature_list_for_scale = data['feature_list_for_scale']

    @staticmethod
    def _format_metric(value):
        """Format a numeric metric with 4 decimals; pass placeholders through as text."""
        if isinstance(value, (int, float, np.integer, np.floating)):
            return f"{value:.4f}"
        return str(value)

    def _metric_rows(self):
        """Return one formatted table row per target in ``self.metrics``."""
        rows = []
        for target, metrics in self.metrics.items():
            rmse = self._format_metric(metrics['rmse'])
            mae = self._format_metric(metrics['mae'])
            r2 = self._format_metric(metrics['r2'])
            mape = self._format_metric(metrics['mape'])
            rows.append(f"{target:<15} | {rmse:^12} | {mae:^12} | {r2:^12} | {mape:^12}")
        return rows

    @staticmethod
    def _ranked_feature_lines(importances, limit=None):
        """Return numbered ``feature: importance`` lines, most important first."""
        ranked = sorted(importances.items(), key=lambda item: item[1], reverse=True)[:limit]
        return [
            f"{rank}. {feature:<20}: {importance:.4f}"
            for rank, (feature, importance) in enumerate(ranked, 1)
        ]

    def print_evaluation_metrics(self):
        """Print the metrics table and the top 5 features of every model."""
        rule = "=" * 80

        print("\n" + rule)
        print(f"{'MODEL EVALUATION METRICS':^80}")
        print(rule)
        print(self._METRICS_HEADER)
        print("-" * 80)

        for row in self._metric_rows():
            print(row)

        print(rule)

        print("\n" + rule)
        print(f"{'TOP 5 MOST IMPORTANT FEATURES':^80}")
        print(rule)

        for target, importances in self.feature_importances.items():
            print(f"\nModel: {target}")
            for line in self._ranked_feature_lines(importances, limit=5):
                print(line)

    def generate_summary_report(self, output_path=None):
        """Write the metrics table and full feature rankings to a text file.

        Args:
            output_path: report file; defaults to
                ``<project_root>/backend/ml/models/model_evaluation_report.txt``.
        """
        if output_path is None:
            output_path = os.path.join(project_root, 'backend', 'ml', 'models', 'model_evaluation_report.txt')

        # Create the target directory if needed (dirname is empty for a bare file name).
        directory = os.path.dirname(output_path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        rule = "=" * 80
        dashes = "-" * 80

        # Explicit UTF-8: the header contains 'R²', which is not ASCII.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(rule + "\n")
            f.write(f"{'WEATHER PREDICTION MODEL EVALUATION REPORT':^80}\n")
            f.write(f"{'Generated on: ' + pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'):^80}\n")
            f.write(rule + "\n\n")

            f.write("MODEL PERFORMANCE METRICS\n")
            f.write(dashes + "\n")
            f.write(self._METRICS_HEADER + "\n")
            f.write(dashes + "\n")

            for row in self._metric_rows():
                f.write(row + "\n")

            f.write("\n\n")

            f.write("FEATURE IMPORTANCE ANALYSIS\n")
            f.write(dashes + "\n")

            for target, importances in self.feature_importances.items():
                f.write(f"\nModel: {target}\n")
                for line in self._ranked_feature_lines(importances):
                    f.write(line + "\n")
                f.write("\n")

        print(f"Summary report generated at: {output_path}")
        

In [11]:
def main():
    """Run the full pipeline: prepare the data, train, print metrics, save the models."""
    predictor = WeatherPredictor()

    # Split the preprocessed dataset into features and per-target columns.
    X, y_dict, feature_list_for_scale = predictor.prepare_data(dataset_path)

    # Train one model per target; returns the per-target metrics dict.
    metrics = predictor.train(X, y_dict, feature_list_for_scale)

    for target, metric in metrics.items():
        print(f"{target} - RMSE: {metric['rmse']}, R²: {metric['r2']}")

    # Record the columns actually used for training so that the saved artifact
    # carries the real feature list rather than save_models()'s hard-coded fallback.
    predictor.feature_list_for_scale = list(feature_list_for_scale)

    predictor.save_models()

In [None]:
# Entry point: run the whole training pipeline when this cell/script is
# executed directly (the guard skips it when the code is imported as a module).
if __name__ == "__main__":
    main()

Training models...


  0%|                                                                                           | 0/14 [00:00<?, ?it/s]

Training model for temperature...
