Evaluate model epoch 4 no embedding

# LSTM model

In [1]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
import pickle

scaler_file = "../dataset/processed/scaler.pkl"

with open(scaler_file, 'rb') as f:
    scaler_X, scaler_y = pickle.load(f)

feature_cols = ["Latitude", "Longitude", "hour", "day", "month", "season", 
                "WS10M", "QV2M", "PS", "PRECTOTCORR", "T2M", "CLRSKY_SFC_SW_DWN"]

target_cols = ["CLRSKY_SFC_SW_DWN", "PS", "T2M", "QV2M", "WS10M", "PRECTOTCORR"] 

timesteps = 24
batch_size = 256

test_file = "../dataset/processed/test_data.csv"

checkpoint_path = "../model/best_model2.h5"
model = load_model(checkpoint_path)
model.summary()

def data_generator(file_path, feature_cols, target_cols, batch_size=256, timesteps=24):
    dtype_dict = {col: np.float32 for col in feature_cols + target_cols}
    dtype_dict.update({"hour": np.int8, "day": np.int8, "month": np.int8, "season": np.int8})

    for chunk in pd.read_csv(file_path, chunksize=batch_size * 5, dtype=dtype_dict, parse_dates=["Datetime"], low_memory=True):
        chunk = chunk.sort_values(by=["Datetime"])
        
        X_scaled = scaler_X.transform(chunk[feature_cols])
        y_scaled = scaler_y.transform(chunk[target_cols])

        X_batch, y_batch = [], []
        for i in range(len(chunk) - timesteps - 24):
            X_batch.append(X_scaled[i:i+timesteps])
            y_batch.append(y_scaled[i+timesteps:i+timesteps+24])

        yield np.array(X_batch), np.array(y_batch)

test_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(test_file, feature_cols, target_cols, batch_size, timesteps),
    output_signature=(
        tf.TensorSpec(shape=(None, timesteps, len(feature_cols)), dtype=tf.float32),
        tf.TensorSpec(shape=(None, 24, len(target_cols)), dtype=tf.float32),
    )
).prefetch(tf.data.AUTOTUNE)

loss, mae = model.evaluate(test_dataset)
print(f"✅ Kết quả đánh giá trên dữ liệu test: Loss={loss:.4f}, MAE={mae:.4f}")


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 24, 128)           72192     
                                                                 
 dropout (Dropout)           (None, 24, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 144)               9360      
                                                                 
 reshape (Reshape)           (None, 24, 6)             0         
                                                                 
Total params: 130,960
Trainable params: 130,960
Non-trai

# Fine-tune & CNN-LSTM

In [None]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
import pickle

scaler_file = "../dataset/processed/scaler.pkl"
with open(scaler_file, 'rb') as f:
    scaler_X, scaler_y = pickle.load(f)

feature_cols = ["Latitude", "Longitude", "hour", "day", "month", "season", 
                "WS10M", "QV2M", "PS", "PRECTOTCORR", "T2M", "CLRSKY_SFC_SW_DWN"]
target_cols = ["CLRSKY_SFC_SW_DWN", "PS", "T2M", "QV2M", "WS10M", "PRECTOTCORR"] 

timesteps = 24
batch_size = 256

test_file = "../dataset/processed/test_data.csv"
checkpoint_path = "best_model_spatial.h5"
model = load_model(checkpoint_path)

print("Model: \"sequential\"")
model.summary()

def data_generator(file_path, feature_cols, target_cols, batch_size=256, timesteps=24):
    dtype_dict = {col: np.float32 for col in feature_cols + target_cols}
    dtype_dict.update({"hour": np.int8, "day": np.int8, "month": np.int8, "season": np.int8})

    for chunk in pd.read_csv(file_path, chunksize=batch_size * 5, dtype=dtype_dict, parse_dates=["Datetime"], low_memory=True):
        chunk = chunk.sort_values(by=["Datetime", "Latitude", "Longitude"])
        
        X_scaled = scaler_X.transform(chunk[feature_cols])
        y_scaled = scaler_y.transform(chunk[target_cols])

        X_batch, y_batch = [], []
        for i in range(len(chunk) - timesteps - 24):
            X_batch.append(X_scaled[i:i+timesteps])
            y_batch.append(y_scaled[i+timesteps:i+timesteps+24])

        yield np.array(X_batch), np.array(y_batch)

test_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(test_file, feature_cols, target_cols, batch_size, timesteps),
    output_signature=(
        tf.TensorSpec(shape=(None, timesteps, len(feature_cols)), dtype=tf.float32),
        tf.TensorSpec(shape=(None, 24, len(target_cols)), dtype=tf.float32),
    )
).prefetch(tf.data.AUTOTUNE)

print("\nĐang đánh giá mô hình trên tập test...")
loss, mae = model.evaluate(test_dataset, verbose=1)
print(f"✅ Kết quả đánh giá trên dữ liệu test: Loss={loss:.4f}, MAE={mae:.4f}")

Model: "sequential"
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 24, 128)           72192     
                                                                 
 dropout (Dropout)           (None, 24, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 144)               9360      
                                                                 
 reshape (Reshape)           (None, 24, 6)             0         
                                                                 
Total params: 130,960
Trainable para

# Fine-tune LSTM

In [1]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
import pickle

scaler_file = "../dataset/processed/scaler.pkl"
with open(scaler_file, 'rb') as f:
    scaler_X, scaler_y = pickle.load(f)

feature_cols = ["Latitude", "Longitude", "hour", "day", "month", "season", 
                "WS10M", "QV2M", "PS", "PRECTOTCORR", "T2M", "CLRSKY_SFC_SW_DWN"]
target_cols = ["CLRSKY_SFC_SW_DWN", "PS", "T2M", "QV2M", "WS10M", "PRECTOTCORR"] 

timesteps = 24
batch_size = 256

test_file = "../dataset/processed/test_data.csv"
checkpoint_path = "best_model_spatial.h5"
model = load_model(checkpoint_path)

print("Model: \"sequential\"")
model.summary()

def data_generator(file_path, feature_cols, target_cols, batch_size=256, timesteps=24):
    dtype_dict = {col: np.float32 for col in feature_cols + target_cols}
    dtype_dict.update({"hour": np.int8, "day": np.int8, "month": np.int8, "season": np.int8})

    for chunk in pd.read_csv(file_path, chunksize=batch_size * 5, dtype=dtype_dict, parse_dates=["Datetime"], low_memory=True):
        chunk = chunk.sort_values(by=["Datetime", "Latitude", "Longitude"])
        
        X_scaled = scaler_X.transform(chunk[feature_cols])
        y_scaled = scaler_y.transform(chunk[target_cols])

        X_batch, y_batch = [], []
        for i in range(len(chunk) - timesteps - 24):
            X_batch.append(X_scaled[i:i+timesteps])
            y_batch.append(y_scaled[i+timesteps:i+timesteps+24])

        yield np.array(X_batch), np.array(y_batch)

test_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(test_file, feature_cols, target_cols, batch_size, timesteps),
    output_signature=(
        tf.TensorSpec(shape=(None, timesteps, len(feature_cols)), dtype=tf.float32), 
        tf.TensorSpec(shape=(None, 24, len(target_cols)), dtype=tf.float32),
    )
).prefetch(tf.data.AUTOTUNE)

print("\nĐang đánh giá mô hình trên tập test...")
loss, mae = model.evaluate(test_dataset, verbose=1)
print(f"✅ Kết quả đánh giá trên dữ liệu test: Loss={loss:.4f}, MAE={mae:.4f}")

Model: "sequential"
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 24, 128)           72192     
                                                                 
 dropout (Dropout)           (None, 24, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 144)               9360      
                                                                 
 reshape (Reshape)           (None, 24, 6)             0         
                                                                 
Total params: 130,960
Trainable para

In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras.models import load_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# ========================== LOAD & CLEAN DATA ==========================

def load_processed_data():
    df_day_20 = pd.read_csv('../dataset/real_weather_data.csv')
    df_day_21 = pd.read_csv('../dataset/real_weather_data.csv')
    return df_day_20, df_day_21

def add_time_features(df):
    df['Datetime'] = pd.to_datetime(df['Datetime'], errors='coerce')
    df['hour'] = df['Datetime'].dt.hour
    df['day'] = df['Datetime'].dt.day
    df['month'] = df['Datetime'].dt.month
    df['season'] = (df['month'] % 12 + 3) // 3
    return df

# ========================== LSTM INPUT ==========================

def create_lstm_input(df, time_steps=24):
    X, y = [], []
    features = ["Latitude", "Longitude", "hour", "day", "month", "season", 
                "WS10M", "QV2M", "PS", "PRECTOTCORR", "T2M", "CLRSKY_SFC_SW_DWN"]

    for i in range(len(df) - time_steps - 24 + 1):
        X_window = df.iloc[i:i+time_steps][features].values
        y_window = df.iloc[i+time_steps:i+time_steps+24][features[:6]].values
        if y_window.shape[0] == 24:
            X.append(X_window)
            y.append(y_window)
    return np.array(X), np.array(y)

# ========================== FORECAST ==========================

def predict_with_model_for_lat_lon(df_day_20, lat, lon, scaler_X, scaler_y, model):
    df_lat_lon = df_day_20[(df_day_20['Latitude'] == lat) & (df_day_20['Longitude'] == lon)].copy()
    if len(df_lat_lon) == 0:
        print(f"No data for lat {lat}, lon {lon}")
        return None, None

    # Thêm cột thời gian
    df_lat_lon = add_time_features(df_lat_lon)
    
    # Đảm bảo cột PRECTOTCORR có mặt
    df_lat_lon = reorder_and_clean_columns(df_lat_lon)
    
    X_input, _ = create_lstm_input(df_lat_lon, time_steps=24)
    if X_input.shape[0] == 0:
        print(f"Not enough data for prediction at lat {lat}, lon {lon}")
        return None, None

    features = ["Latitude", "Longitude", "hour", "day", "month", "season", 
                "WS10M", "QV2M", "PS", "PRECTOTCORR", "T2M", "CLRSKY_SFC_SW_DWN"]

    X_input_df = pd.DataFrame(X_input.reshape(-1, len(features)), columns=features)
    X_input_df = X_input_df[scaler_X.feature_names_in_]
    X_input_scaled = scaler_X.transform(X_input_df).reshape(X_input.shape)

    y_pred_scaled = model.predict(X_input_scaled)
    y_pred = scaler_y.inverse_transform(y_pred_scaled[-1])  # 24 dòng cuối

    forecast_start = df_lat_lon['Datetime'].iloc[-1] + pd.Timedelta(hours=1)
    forecast_times = pd.date_range(start=forecast_start, periods=24, freq='h')

    forecast_df = pd.DataFrame(y_pred, columns=[
        'Predicted_T2M', 'Predicted_QV2M', 'Predicted_PS', 'Predicted_WS10M',
        'Predicted_PRECTOTCORR', 'Predicted_CLRSKY_SFC_SW_DWN'
    ])
    forecast_df['Datetime'] = forecast_times
    forecast_df['Latitude'] = lat
    forecast_df['Longitude'] = lon

    return forecast_df, y_pred

def forecast_for_all_lat_lon(df_day_20, scaler_X, scaler_y):
    model = load_model('best_model_fine_tune.h5')
    available_coords = df_day_20[['Latitude', 'Longitude']].drop_duplicates().values
    forecast_results_list = []

    for lat, lon in available_coords:
        forecast_df, _ = predict_with_model_for_lat_lon(df_day_20, lat, lon, scaler_X, scaler_y, model)
        if forecast_df is not None:
            forecast_results_list.append(forecast_df)

    forecast_results = pd.concat(forecast_results_list, ignore_index=True)
    return forecast_results

# ========================== EVALUATE ==========================

def evaluate_model(df_day_21, forecast_results):
    df_day_21['Datetime'] = pd.to_datetime(df_day_21['Datetime'], errors='coerce')
    forecast_results['Datetime'] = pd.to_datetime(forecast_results['Datetime'], errors='coerce')

    df_day_21['Latitude'] = df_day_21['Latitude'].astype(np.float32)
    df_day_21['Longitude'] = df_day_21['Longitude'].astype(np.float32)
    forecast_results['Latitude'] = forecast_results['Latitude'].astype(np.float32)
    forecast_results['Longitude'] = forecast_results['Longitude'].astype(np.float32)

    merged = pd.merge(df_day_21, forecast_results,
                      on=['Datetime', 'Latitude', 'Longitude'],
                      how='inner', suffixes=('', '_pred'))

    if len(merged) == 0:
        print("No matching data for evaluation")
        return

    print(f"\n✅ Số lượng dòng được so sánh: {len(merged)}")
    print(merged[['Datetime', 'Latitude', 'Longitude'] + [f'Predicted_{c}' for c in ['T2M', 'QV2M']]].head())

    metrics = {}
    for col in ["CLRSKY_SFC_SW_DWN", "PS", "T2M", "QV2M", "WS10M", "PRECTOTCORR"]:  # Sửa tên cột ở đây
        actual = merged[col]
        pred = merged[f'Predicted_{col}']
        metrics[col] = {
            'MAE': mean_absolute_error(actual, pred),
            'MSE': mean_squared_error(actual, pred),
            'R2': r2_score(actual, pred)
        }

    print("\n🔍 Chi tiết các chỉ số đánh giá:\n")
    df_metrics = pd.DataFrame(metrics).T
    df_metrics.columns = ['MAE', 'MSE', 'R2']
    print(df_metrics.round(4))

# ========================== MAIN ==========================

if __name__ == "__main__":
    df_day_20, df_day_21 = load_processed_data()
    df_day_20_unscaled = df_day_20.copy()

    with open('../dataset/scaler.pkl', 'rb') as f:
        scaler_X, scaler_y = pickle.load(f)

    forecast_results = forecast_for_all_lat_lon(df_day_20_unscaled, scaler_X, scaler_y)

    if len(forecast_results) > 0:
        forecast_results['Latitude'] = forecast_results['Latitude'].round(4)
        forecast_results['Longitude'] = forecast_results['Longitude'].round(4)

        evaluate_model(df_day_21, forecast_results)
        
        forecast_results.to_csv("forecast_day26.csv", index=False)
        print("💾 Đã lưu kết quả dự báo vào forecast_day26.csv")

        print("\n📊 Kết quả dự báo:")
        print(forecast_results.head())
        print("📈 Khoảng thời gian dự báo:",
              forecast_results['Datetime'].min(), "→", forecast_results['Datetime'].max())

        print("\n📊 Dữ liệu thực tế:")
        print(df_day_21.head())
        print("📈 Khoảng thời gian thực tế:",
              df_day_21['Datetime'].min(), "→", df_day_21['Datetime'].max())
    else:
        print("❌ Không có kết quả dự báo để đánh giá.")
        
            # Hiển thị kết quả dự báo và thực tế chi tiết hơn
        print("\n📋 Một vài dòng dữ liệu dự báo:")
        print(forecast_results[['Datetime', 'Latitude', 'Longitude'] + [col for col in forecast_results.columns if col.startswith('Predicted_')]].head(10))

        print("\n📋 Một vài dòng dữ liệu thực tế:")
        print(df_day_21[['Datetime', 'Latitude', 'Longitude', 'T2M', 'QV2M', 'PS', 'WS10M', 'PRECTOTCORR', 'CLRSKY_SFC_SW_DWN']].head(10))

    # Vẽ biểu đồ so sánh
    def plot_results_for_location(df_actual, df_pred, lat, lon):
        actual_data = df_actual[(df_actual['Latitude'] == lat) & (df_actual['Longitude'] == lon)]
        pred_data = df_pred[(df_pred['Latitude'] == lat) & (df_pred['Longitude'] == lon)]

        if actual_data.empty or pred_data.empty:
            print(f"Không có dữ liệu cho toạ độ ({lat}, {lon})")
            return

        cols = ["CLRSKY_SFC_SW_DWN", "PS", "T2M", "QV2M", "WS10M", "PRECTOTCORR"]
        plt.figure(figsize=(18, 12))
        for i, col in enumerate(cols):
            plt.subplot(3, 2, i+1)
            plt.plot(actual_data['Datetime'], actual_data[col], label='Thực tế', color='blue')
            plt.plot(pred_data['Datetime'], pred_data[f'Predicted_{col}'], label='Dự báo', linestyle='--', color='red')
            plt.title(f'{col} tại ({lat}, {lon})')
            plt.xlabel('Thời gian')
            plt.ylabel(col)
            plt.xticks(rotation=45)
            plt.grid(True)
            plt.legend()
        plt.tight_layout()
        plt.show()

    # Chọn một toạ độ phổ biến nhất để vẽ
    most_common_coord = forecast_results.groupby(['Latitude', 'Longitude']).size().idxmax()
    plot_results_for_location(df_day_21, forecast_results, *most_common_coord)


NameError: name 'reorder_and_clean_columns' is not defined

In [8]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import os
import gc
from datetime import datetime

# Configuration
scaler_file = "../dataset/processed/scaler.pkl"
train_file = "../dataset/real_weather_data.csv"
test_file = "../dataset/real_weather_data.csv"
checkpoint_path = "best_model_fine_tune.h5"

feature_cols = ["Latitude", "Longitude", "hour", "day", "month", "season", 
                "WS10M", "QV2M", "PS", "PRECTOTCORR", "T2M", "CLRSKY_SFC_SW_DWN"]
target_cols = ["CLRSKY_SFC_SW_DWN", "PS", "T2M", "QV2M", "WS10M", "PRECTOTCORR"]

timesteps = 24  # Input sequence length
forecast_steps = 24  # Must match model's output length
batch_size = 256

def main():
    """Main execution function"""
    try:
        # Load scalers and model
        try:
            with open(scaler_file, 'rb') as f:
                scaler_X, scaler_y = pickle.load(f)
        except FileNotFoundError:
            raise FileNotFoundError(f"Scaler file not found at {scaler_file}. Please ensure the file exists.")

        try:
            model = load_model(checkpoint_path)
            model.summary()  # Verify input/output shapes
        except FileNotFoundError:
            raise FileNotFoundError(f"Model file not found at {checkpoint_path}. Please ensure the file exists.")
        except Exception as e:
            raise RuntimeError(f"Error loading model: {str(e)}")

        def debug_scalers(scaler_X, scaler_y):
            """Print scaler information for debugging"""
            print("\n=== DEBUG SCALERS ===")
            print("Scaler_X features:", scaler_X.feature_names_in_)
            print("Scaler_X min:", scaler_X.data_min_)
            print("Scaler_X max:", scaler_X.data_max_)
            print("\nScaler_y features:", scaler_y.feature_names_in_)
            print("Scaler_y min:", scaler_y.data_min_)
            print("Scaler_y max:", scaler_y.data_max_)

        def preprocess_data(df):
            """Clean and prepare raw data"""
            if 'PRECTOT' in df.columns and 'PRECTOTCORR' not in df.columns:
                df = df.rename(columns={'PRECTOT': 'PRECTOTCORR'})
            
            df['Datetime'] = pd.to_datetime(df['Datetime'])
            df['hour'] = df['Datetime'].dt.hour.astype(np.int8)
            df['day'] = df['Datetime'].dt.day.astype(np.int8)
            df['month'] = df['Datetime'].dt.month.astype(np.int8)
            df['season'] = ((df['month'] % 12 + 3) // 3).astype(np.int8)
            
            # Handle missing values and invalid numbers
            for col in feature_cols + target_cols:
                if col in df.columns:
                    if df[col].dtype == 'object':
                        df[col] = pd.to_numeric(df[col].astype(str).str.extract('([-+]?\d*\.?\d+)')[0], errors='coerce')
                    df[col] = df[col].replace(-999, np.nan)
                    df[col] = df[col].fillna(df[col].mean())
            
            return df

        # Debugging information
        debug_scalers(scaler_X, scaler_y)
        
        # Load and check data
        print("\nLoading training data...")
        try:
            train_df = pd.read_csv(train_file)
            train_df = preprocess_data(train_df)
        except FileNotFoundError:
            raise FileNotFoundError(f"Training file not found at {train_file}. Please ensure the file exists.")
        
        print("\nLoading test data...")
        try:
            test_df = pd.read_csv(test_file)
            test_df = preprocess_data(test_df)
        except FileNotFoundError:
            raise FileNotFoundError(f"Test file not found at {test_file}. Please ensure the file exists.")
        except Exception as e:
            raise RuntimeError(f"Error loading test data: {str(e)}")

        def debug_data_sample(df, name="Data"):
            """Print sample data for inspection"""
            print(f"\n=== DEBUG {name.upper()} SAMPLE ===")
            print(f"Shape: {df.shape}")
            print(df.head(3))
            print("\nData types:")
            print(df.dtypes)
            print("\nMissing values:")
            print(df.isna().sum())

        debug_data_sample(train_df, "Training Data")
        debug_data_sample(test_df, "Test Data")

        def create_sequences(data, seq_length, target_length):
            """Create input-output sequences from time series data"""
            X, y = [], []
            for i in range(len(data) - seq_length - target_length + 1):
                X.append(data[i:i+seq_length])
                y.append(data[i+seq_length:i+seq_length+target_length])
            return np.array(X), np.array(y)

        def evaluation_data_generator(file_path):
            """Generate batches of evaluation data"""
            dtype_dict = {col: np.float32 for col in feature_cols + target_cols}
            dtype_dict.update({"hour": np.int8, "day": np.int8, "month": np.int8, "season": np.int8})
            
            for chunk in pd.read_csv(file_path, chunksize=batch_size*5, dtype=dtype_dict, parse_dates=["Datetime"]):
                chunk = preprocess_data(chunk)
                if len(chunk) < timesteps + forecast_steps:
                    continue
                
                X_scaled = scaler_X.transform(chunk[feature_cols])
                y_scaled = scaler_y.transform(chunk[target_cols])
                
                # Create sequences with correct lengths
                X_batch, y_batch = create_sequences(X_scaled, timesteps, forecast_steps)
                y_target = create_sequences(y_scaled, timesteps, forecast_steps)[1]
                
                if len(X_batch) > 0:
                    yield X_batch, y_target

        def evaluate_model():
            """Evaluate model performance"""
            print("\n=== MODEL EVALUATION ===")
            
            # Create dataset with proper shapes
            test_dataset = tf.data.Dataset.from_generator(
                lambda: evaluation_data_generator(test_file),
                output_signature=(
                    tf.TensorSpec(shape=(None, timesteps, len(feature_cols)), dtype=tf.float32),
                    tf.TensorSpec(shape=(None, forecast_steps, len(target_cols)), dtype=tf.float32)
                )
            ).prefetch(tf.data.AUTOTUNE)
            
            # Run evaluation
            print("\nEvaluating on test data...")
            evaluation_metrics = model.evaluate(test_dataset, verbose=1)
            
            if isinstance(evaluation_metrics, list):
                print("\nEvaluation metrics:")
                for metric_name, metric_value in zip(model.metrics_names, evaluation_metrics):
                    print(f"{metric_name}: {metric_value:.4f}")
            else:
                print(f"\nEvaluation loss: {evaluation_metrics:.4f}")
            
            # Sample predictions
            print("\nRunning sample predictions...")
            test_generator = evaluation_data_generator(test_file)
            X_sample, y_sample_true = next(test_generator)
            
            if X_sample.size > 0:
                y_sample_pred = model.predict(X_sample[:1], verbose=1)  # Predict first sample
                
                # Inverse transform scaling
                y_true = scaler_y.inverse_transform(y_sample_true[0].reshape(-1, len(target_cols)))
                y_pred = scaler_y.inverse_transform(y_sample_pred.reshape(-1, len(target_cols)))
                
                def debug_model_predictions(y_true, y_pred, target_names):
                    """Compare predictions with ground truth"""
                    print("\n=== DEBUG PREDICTIONS ===")
                    for i, col in enumerate(target_names):
                        print(f"\n{col}:")
                        print(f"- True values (sample): {y_true[:3,i]}")
                        print(f"- Pred values (sample): {y_pred[:3,i]}")
                        print(f"- MAE: {np.mean(np.abs(y_true[:,i] - y_pred[:,i])):.4f}")
                        print(f"- RMSE: {np.sqrt(np.mean((y_true[:,i] - y_pred[:,i])**2)):.4f}")
                
                debug_model_predictions(y_true, y_pred, target_cols)
            else:
                print("Warning: No valid samples found for prediction demonstration")

        evaluate_model()

        def make_forecast(model, input_data):
            """Generate future predictions"""
            print("\n=== FORECASTING ===")
            input_data = preprocess_data(input_data)
            
            if len(input_data) < timesteps:
                raise ValueError(f"Need at least {timesteps} timesteps, got {len(input_data)}")
            
            # Prepare input sequence
            X_df = input_data[feature_cols].iloc[-timesteps:]
            X_scaled = scaler_X.transform(X_df)
            X_scaled = X_scaled.reshape(1, timesteps, len(feature_cols))
            
            # Make prediction
            print("\nMaking prediction...")
            y_pred_scaled = model.predict(X_scaled, verbose=1)
            y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, len(target_cols)))
            
            # Generate forecast dates
            last_date = input_data['Datetime'].iloc[-1]
            forecast_dates = pd.date_range(
                start=last_date + pd.Timedelta(hours=1),
                periods=forecast_steps,
                freq='h'
            )
            
            # Create result DataFrame
            result = pd.DataFrame({
                'Datetime': forecast_dates,
                **{f'Predicted_{col}': y_pred[:, i] for i, col in enumerate(target_cols)}
            })
            
            print("\nForecast summary:")
            print(result.describe())
            
            return result

        # Make forecast
        print("\nGenerating forecast...")
        forecast_df = make_forecast(model, train_df)
        
        # Save results
        output_dir = "results"
        os.makedirs(output_dir, exist_ok=True)
        forecast_file = os.path.join(output_dir, f"forecast_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
        forecast_df.to_csv(forecast_file, index=False)
        print(f"\nForecast saved to {forecast_file}")
        
    except Exception as e:
        print(f"\n❌ Error: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    print("Starting weather forecasting pipeline...")
    main()
    print("\nPipeline completed!")

Starting weather forecasting pipeline...
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 24, 128)           72192     
                                                                 
 dropout (Dropout)           (None, 24, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 144)               9360      
                                                                 
 reshape (Reshape)           (None, 24, 6)             0         
                                                                 
Total params: 1

  numdigits = int(np.log10(self.target)) + 1
Traceback (most recent call last):
  File "C:\Users\tranh\AppData\Local\Temp\ipykernel_24376\2277396659.py", line 186, in main
    evaluate_model()
  File "C:\Users\tranh\AppData\Local\Temp\ipykernel_24376\2277396659.py", line 151, in evaluate_model
    evaluation_metrics = model.evaluate(test_dataset, verbose=1)
  File "d:\anaconda3\envs\weather_lstm\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "d:\anaconda3\envs\weather_lstm\lib\site-packages\keras\utils\generic_utils.py", line 993, in update
    numdigits = int(np.log10(self.target)) + 1
OverflowError: cannot convert float infinity to integer


In [9]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
import pickle

# Đường dẫn tới scaler và mô hình đã lưu
scaler_file = "../dataset/processed/scaler.pkl"
with open(scaler_file, 'rb') as f:
    scaler_X, scaler_y = pickle.load(f)

# Các cột đặc trưng và mục tiêu
feature_cols = ["Latitude", "Longitude", "hour", "day", "month", "season", 
                "WS10M", "QV2M", "PS", "PRECTOTCORR", "T2M", "CLRSKY_SFC_SW_DWN"]
target_cols = ["CLRSKY_SFC_SW_DWN", "PS", "T2M", "QV2M", "WS10M", "PRECTOTCORR"] 

# Thời gian chuỗi
timesteps = 24
batch_size = 256

# Đường dẫn tới file test và mô hình đã huấn luyện
test_file = "../dataset/real_weather_data.csv"
checkpoint_path = "best_model_fine_tune.h5"
model = load_model(checkpoint_path)

# Hiển thị mô hình
print("Model: \"sequential\"")
model.summary()

def data_generator(file_path, feature_cols, target_cols, batch_size=256, timesteps=24):
    dtype_dict = {col: np.float32 for col in feature_cols + target_cols}
    dtype_dict.update({"hour": np.int8, "day": np.int8, "month": np.int8, "season": np.int8})

    for chunk in pd.read_csv(file_path, chunksize=batch_size * 5, dtype=dtype_dict, parse_dates=["Datetime"], low_memory=True):
        # Thêm các cột thời gian
        chunk = add_time_features(chunk)  # Gọi hàm để thêm các cột hour, day, month, season
        
        # Đảm bảo cột PRECTOTCORR có mặt
        chunk = reorder_and_clean_columns(chunk)  # Đảm bảo PRECTOTCORR được thêm vào
        
        # Tiến hành chuẩn hóa dữ liệu
        X_scaled = scaler_X.transform(chunk[feature_cols])
        y_scaled = scaler_y.transform(chunk[target_cols])

        X_batch, y_batch = [], []
        for i in range(len(chunk) - timesteps - 24):
            X_batch.append(X_scaled[i:i+timesteps])
            y_batch.append(y_scaled[i+timesteps:i+timesteps+24])

        yield np.array(X_batch), np.array(y_batch)

# Tạo dataset từ generator
test_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(test_file, feature_cols, target_cols, batch_size, timesteps),
    output_signature=(
        tf.TensorSpec(shape=(None, timesteps, len(feature_cols)), dtype=tf.float32),  # Sửa: Sử dụng None thay cho batch_size
        tf.TensorSpec(shape=(None, 24, len(target_cols)), dtype=tf.float32),  # Sửa: Sử dụng None thay cho batch_size
    )
).prefetch(tf.data.AUTOTUNE)

# Kiểm tra nếu có dữ liệu hợp lệ
try:
    for X_batch, y_batch in data_generator(test_file, feature_cols, target_cols, batch_size, timesteps):
        print(f"X_batch shape: {X_batch.shape}, y_batch shape: {y_batch.shape}")
        break  # Chỉ kiểm tra một batch
except Exception as e:
    print(f"Error during data generation: {e}")

# Đánh giá mô hình trên tập test
try:
    print("\nĐang đánh giá mô hình trên tập test...")
    loss, mae = model.evaluate(test_dataset, verbose=1)
    print(f"✅ Kết quả đánh giá trên dữ liệu test: Loss={loss:.4f}, MAE={mae:.4f}")
except Exception as e:
    print(f"Error during model evaluation: {e}")


Model: "sequential"
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 24, 128)           72192     
                                                                 
 dropout (Dropout)           (None, 24, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 144)               9360      
                                                                 
 reshape (Reshape)           (None, 24, 6)             0         
                                                                 
Total params: 130,960
Trainable para