Evaluate model epoch 4 no embedding

# LSTM model

In [1]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
import pickle

scaler_file = "../dataset/processed/scaler.pkl"

with open(scaler_file, 'rb') as f:
    scaler_X, scaler_y = pickle.load(f)

feature_cols = ["Latitude", "Longitude", "hour", "day", "month", "season", 
                "WS10M", "QV2M", "PS", "PRECTOTCORR", "T2M", "CLRSKY_SFC_SW_DWN"]

target_cols = ["CLRSKY_SFC_SW_DWN", "PS", "T2M", "QV2M", "WS10M", "PRECTOTCORR"] 

timesteps = 24
batch_size = 256

test_file = "../dataset/processed/test_data.csv"

checkpoint_path = "../model/best_model2.h5"
model = load_model(checkpoint_path)
model.summary()

def data_generator(file_path, feature_cols, target_cols, batch_size=256, timesteps=24):
    dtype_dict = {col: np.float32 for col in feature_cols + target_cols}
    dtype_dict.update({"hour": np.int8, "day": np.int8, "month": np.int8, "season": np.int8})

    for chunk in pd.read_csv(file_path, chunksize=batch_size * 5, dtype=dtype_dict, parse_dates=["Datetime"], low_memory=True):
        chunk = chunk.sort_values(by=["Datetime"])
        
        X_scaled = scaler_X.transform(chunk[feature_cols])
        y_scaled = scaler_y.transform(chunk[target_cols])

        X_batch, y_batch = [], []
        for i in range(len(chunk) - timesteps - 24):
            X_batch.append(X_scaled[i:i+timesteps])
            y_batch.append(y_scaled[i+timesteps:i+timesteps+24])

        yield np.array(X_batch), np.array(y_batch)

test_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(test_file, feature_cols, target_cols, batch_size, timesteps),
    output_signature=(
        tf.TensorSpec(shape=(None, timesteps, len(feature_cols)), dtype=tf.float32),
        tf.TensorSpec(shape=(None, 24, len(target_cols)), dtype=tf.float32),
    )
).prefetch(tf.data.AUTOTUNE)

loss, mae = model.evaluate(test_dataset)
print(f"‚úÖ K·∫øt qu·∫£ ƒë√°nh gi√° tr√™n d·ªØ li·ªáu test: Loss={loss:.4f}, MAE={mae:.4f}")


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 24, 128)           72192     
                                                                 
 dropout (Dropout)           (None, 24, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 144)               9360      
                                                                 
 reshape (Reshape)           (None, 24, 6)             0         
                                                                 
Total params: 130,960
Trainable params: 130,960
Non-trai

# Fine-tune & CNN-LSTM

In [None]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
import pickle

scaler_file = "../dataset/processed/scaler.pkl"
with open(scaler_file, 'rb') as f:
    scaler_X, scaler_y = pickle.load(f)

feature_cols = ["Latitude", "Longitude", "hour", "day", "month", "season", 
                "WS10M", "QV2M", "PS", "PRECTOTCORR", "T2M", "CLRSKY_SFC_SW_DWN"]
target_cols = ["CLRSKY_SFC_SW_DWN", "PS", "T2M", "QV2M", "WS10M", "PRECTOTCORR"] 

timesteps = 24
batch_size = 256

test_file = "../dataset/processed/test_data.csv"
checkpoint_path = "best_model_spatial.h5"
model = load_model(checkpoint_path)

print("Model: \"sequential\"")
model.summary()

def data_generator(file_path, feature_cols, target_cols, batch_size=256, timesteps=24):
    dtype_dict = {col: np.float32 for col in feature_cols + target_cols}
    dtype_dict.update({"hour": np.int8, "day": np.int8, "month": np.int8, "season": np.int8})

    for chunk in pd.read_csv(file_path, chunksize=batch_size * 5, dtype=dtype_dict, parse_dates=["Datetime"], low_memory=True):
        chunk = chunk.sort_values(by=["Datetime", "Latitude", "Longitude"])
        
        X_scaled = scaler_X.transform(chunk[feature_cols])
        y_scaled = scaler_y.transform(chunk[target_cols])

        X_batch, y_batch = [], []
        for i in range(len(chunk) - timesteps - 24):
            X_batch.append(X_scaled[i:i+timesteps])
            y_batch.append(y_scaled[i+timesteps:i+timesteps+24])

        yield np.array(X_batch), np.array(y_batch)

test_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(test_file, feature_cols, target_cols, batch_size, timesteps),
    output_signature=(
        tf.TensorSpec(shape=(None, timesteps, len(feature_cols)), dtype=tf.float32),
        tf.TensorSpec(shape=(None, 24, len(target_cols)), dtype=tf.float32),
    )
).prefetch(tf.data.AUTOTUNE)

print("\nƒêang ƒë√°nh gi√° m√¥ h√¨nh tr√™n t·∫≠p test...")
loss, mae = model.evaluate(test_dataset, verbose=1)
print(f"‚úÖ K·∫øt qu·∫£ ƒë√°nh gi√° tr√™n d·ªØ li·ªáu test: Loss={loss:.4f}, MAE={mae:.4f}")

Model: "sequential"
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 24, 128)           72192     
                                                                 
 dropout (Dropout)           (None, 24, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 144)               9360      
                                                                 
 reshape (Reshape)           (None, 24, 6)             0         
                                                                 
Total params: 130,960
Trainable para

# Fine-tune LSTM

In [1]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
import pickle

scaler_file = "../dataset/processed/scaler.pkl"
with open(scaler_file, 'rb') as f:
    scaler_X, scaler_y = pickle.load(f)

feature_cols = ["Latitude", "Longitude", "hour", "day", "month", "season", 
                "WS10M", "QV2M", "PS", "PRECTOTCORR", "T2M", "CLRSKY_SFC_SW_DWN"]
target_cols = ["CLRSKY_SFC_SW_DWN", "PS", "T2M", "QV2M", "WS10M", "PRECTOTCORR"] 

timesteps = 24
batch_size = 256

test_file = "../dataset/processed/test_data.csv"
checkpoint_path = "best_model_spatial.h5"
model = load_model(checkpoint_path)

print("Model: \"sequential\"")
model.summary()

def data_generator(file_path, feature_cols, target_cols, batch_size=256, timesteps=24):
    dtype_dict = {col: np.float32 for col in feature_cols + target_cols}
    dtype_dict.update({"hour": np.int8, "day": np.int8, "month": np.int8, "season": np.int8})

    for chunk in pd.read_csv(file_path, chunksize=batch_size * 5, dtype=dtype_dict, parse_dates=["Datetime"], low_memory=True):
        chunk = chunk.sort_values(by=["Datetime", "Latitude", "Longitude"])
        
        X_scaled = scaler_X.transform(chunk[feature_cols])
        y_scaled = scaler_y.transform(chunk[target_cols])

        X_batch, y_batch = [], []
        for i in range(len(chunk) - timesteps - 24):
            X_batch.append(X_scaled[i:i+timesteps])
            y_batch.append(y_scaled[i+timesteps:i+timesteps+24])

        yield np.array(X_batch), np.array(y_batch)

test_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(test_file, feature_cols, target_cols, batch_size, timesteps),
    output_signature=(
        tf.TensorSpec(shape=(None, timesteps, len(feature_cols)), dtype=tf.float32), 
        tf.TensorSpec(shape=(None, 24, len(target_cols)), dtype=tf.float32),
    )
).prefetch(tf.data.AUTOTUNE)

print("\nƒêang ƒë√°nh gi√° m√¥ h√¨nh tr√™n t·∫≠p test...")
loss, mae = model.evaluate(test_dataset, verbose=1)
print(f"‚úÖ K·∫øt qu·∫£ ƒë√°nh gi√° tr√™n d·ªØ li·ªáu test: Loss={loss:.4f}, MAE={mae:.4f}")

Model: "sequential"
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 24, 128)           72192     
                                                                 
 dropout (Dropout)           (None, 24, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 144)               9360      
                                                                 
 reshape (Reshape)           (None, 24, 6)             0         
                                                                 
Total params: 130,960
Trainable para

In [26]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras.models import load_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# ========================== LOAD & CLEAN DATA ==========================

def load_processed_data():
    df_day_20 = pd.read_csv('../dataset/weather_data_25.csv')
    df_day_21 = pd.read_csv('../dataset/weather_data_26.csv')
    return df_day_20, df_day_21

def add_time_features(df):
    df['Datetime'] = pd.to_datetime(df['Datetime'], errors='coerce')
    df['hour'] = df['Datetime'].dt.hour
    df['day'] = df['Datetime'].dt.day
    df['month'] = df['Datetime'].dt.month
    df['season'] = (df['month'] % 12 + 3) // 3
    return df

# ========================== LSTM INPUT ==========================

def create_lstm_input(df, time_steps=24):
    X, y = [], []
    features = ["Latitude", "Longitude", "hour", "day", "month", "season", 
                "WS10M", "QV2M", "PS", "PRECTOTCORR", "T2M", "CLRSKY_SFC_SW_DWN"]

    for i in range(len(df) - time_steps - 24 + 1):
        X_window = df.iloc[i:i+time_steps][features].values
        y_window = df.iloc[i+time_steps:i+time_steps+24][features[:6]].values
        if y_window.shape[0] == 24:
            X.append(X_window)
            y.append(y_window)
    return np.array(X), np.array(y)

# ========================== FORECAST ==========================

def predict_with_model_for_lat_lon(df_day_20, lat, lon, scaler_X, scaler_y, model):
    df_lat_lon = df_day_20[(df_day_20['Latitude'] == lat) & (df_day_20['Longitude'] == lon)].copy()
    if len(df_lat_lon) == 0:
        print(f"No data for lat {lat}, lon {lon}")
        return None, None

    # Th√™m c·ªôt th·ªùi gian
    df_lat_lon = add_time_features(df_lat_lon)
    
    # ƒê·∫£m b·∫£o c·ªôt PRECTOTCORR c√≥ m·∫∑t
    df_lat_lon = reorder_and_clean_columns(df_lat_lon)
    
    X_input, _ = create_lstm_input(df_lat_lon, time_steps=24)
    if X_input.shape[0] == 0:
        print(f"Not enough data for prediction at lat {lat}, lon {lon}")
        return None, None

    features = ["Latitude", "Longitude", "hour", "day", "month", "season", 
                "WS10M", "QV2M", "PS", "PRECTOTCORR", "T2M", "CLRSKY_SFC_SW_DWN"]

    X_input_df = pd.DataFrame(X_input.reshape(-1, len(features)), columns=features)
    X_input_df = X_input_df[scaler_X.feature_names_in_]
    X_input_scaled = scaler_X.transform(X_input_df).reshape(X_input.shape)

    y_pred_scaled = model.predict(X_input_scaled)
    y_pred = scaler_y.inverse_transform(y_pred_scaled[-1])  # 24 d√≤ng cu·ªëi

    forecast_start = df_lat_lon['Datetime'].iloc[-1] + pd.Timedelta(hours=1)
    forecast_times = pd.date_range(start=forecast_start, periods=24, freq='h')

    forecast_df = pd.DataFrame(y_pred, columns=[
        'Predicted_T2M', 'Predicted_QV2M', 'Predicted_PS', 'Predicted_WS10M',
        'Predicted_PRECTOTCORR', 'Predicted_CLRSKY_SFC_SW_DWN'
    ])
    forecast_df['Datetime'] = forecast_times
    forecast_df['Latitude'] = lat
    forecast_df['Longitude'] = lon

    return forecast_df, y_pred

def forecast_for_all_lat_lon(df_day_20, scaler_X, scaler_y):
    model = load_model('best_model_fine_tune.h5')
    available_coords = df_day_20[['Latitude', 'Longitude']].drop_duplicates().values
    forecast_results_list = []

    for lat, lon in available_coords:
        forecast_df, _ = predict_with_model_for_lat_lon(df_day_20, lat, lon, scaler_X, scaler_y, model)
        if forecast_df is not None:
            forecast_results_list.append(forecast_df)

    forecast_results = pd.concat(forecast_results_list, ignore_index=True)
    return forecast_results

# ========================== EVALUATE ==========================

def evaluate_model(df_day_21, forecast_results):
    df_day_21['Datetime'] = pd.to_datetime(df_day_21['Datetime'], errors='coerce')
    forecast_results['Datetime'] = pd.to_datetime(forecast_results['Datetime'], errors='coerce')

    df_day_21['Latitude'] = df_day_21['Latitude'].astype(np.float32)
    df_day_21['Longitude'] = df_day_21['Longitude'].astype(np.float32)
    forecast_results['Latitude'] = forecast_results['Latitude'].astype(np.float32)
    forecast_results['Longitude'] = forecast_results['Longitude'].astype(np.float32)

    merged = pd.merge(df_day_21, forecast_results,
                      on=['Datetime', 'Latitude', 'Longitude'],
                      how='inner', suffixes=('', '_pred'))

    if len(merged) == 0:
        print("No matching data for evaluation")
        return

    print(f"\n‚úÖ S·ªë l∆∞·ª£ng d√≤ng ƒë∆∞·ª£c so s√°nh: {len(merged)}")
    print(merged[['Datetime', 'Latitude', 'Longitude'] + [f'Predicted_{c}' for c in ['T2M', 'QV2M']]].head())

    metrics = {}
    for col in ["CLRSKY_SFC_SW_DWN", "PS", "T2M", "QV2M", "WS10M", "PRECTOTCORR"]:  # S·ª≠a t√™n c·ªôt ·ªü ƒë√¢y
        actual = merged[col]
        pred = merged[f'Predicted_{col}']
        metrics[col] = {
            'MAE': mean_absolute_error(actual, pred),
            'MSE': mean_squared_error(actual, pred),
            'R2': r2_score(actual, pred)
        }

    print("\nüîç Chi ti·∫øt c√°c ch·ªâ s·ªë ƒë√°nh gi√°:\n")
    df_metrics = pd.DataFrame(metrics).T
    df_metrics.columns = ['MAE', 'MSE', 'R2']
    print(df_metrics.round(4))

# ========================== MAIN ==========================

if __name__ == "__main__":
    df_day_20, df_day_21 = load_processed_data()
    df_day_20_unscaled = df_day_20.copy()

    with open('../dataset/scaler.pkl', 'rb') as f:
        scaler_X, scaler_y = pickle.load(f)

    forecast_results = forecast_for_all_lat_lon(df_day_20_unscaled, scaler_X, scaler_y)

    if len(forecast_results) > 0:
        forecast_results['Latitude'] = forecast_results['Latitude'].round(4)
        forecast_results['Longitude'] = forecast_results['Longitude'].round(4)

        evaluate_model(df_day_21, forecast_results)
        
        forecast_results.to_csv("forecast_day26.csv", index=False)
        print("üíæ ƒê√£ l∆∞u k·∫øt qu·∫£ d·ª± b√°o v√†o forecast_day26.csv")

        print("\nüìä K·∫øt qu·∫£ d·ª± b√°o:")
        print(forecast_results.head())
        print("üìà Kho·∫£ng th·ªùi gian d·ª± b√°o:",
              forecast_results['Datetime'].min(), "‚Üí", forecast_results['Datetime'].max())

        print("\nüìä D·ªØ li·ªáu th·ª±c t·∫ø:")
        print(df_day_21.head())
        print("üìà Kho·∫£ng th·ªùi gian th·ª±c t·∫ø:",
              df_day_21['Datetime'].min(), "‚Üí", df_day_21['Datetime'].max())
    else:
        print("‚ùå Kh√¥ng c√≥ k·∫øt qu·∫£ d·ª± b√°o ƒë·ªÉ ƒë√°nh gi√°.")
        
            # Hi·ªÉn th·ªã k·∫øt qu·∫£ d·ª± b√°o v√† th·ª±c t·∫ø chi ti·∫øt h∆°n
        print("\nüìã M·ªôt v√†i d√≤ng d·ªØ li·ªáu d·ª± b√°o:")
        print(forecast_results[['Datetime', 'Latitude', 'Longitude'] + [col for col in forecast_results.columns if col.startswith('Predicted_')]].head(10))

        print("\nüìã M·ªôt v√†i d√≤ng d·ªØ li·ªáu th·ª±c t·∫ø:")
        print(df_day_21[['Datetime', 'Latitude', 'Longitude', 'T2M', 'QV2M', 'PS', 'WS10M', 'PRECTOTCORR', 'CLRSKY_SFC_SW_DWN']].head(10))

    # V·∫Ω bi·ªÉu ƒë·ªì so s√°nh
    def plot_results_for_location(df_actual, df_pred, lat, lon):
        actual_data = df_actual[(df_actual['Latitude'] == lat) & (df_actual['Longitude'] == lon)]
        pred_data = df_pred[(df_pred['Latitude'] == lat) & (df_pred['Longitude'] == lon)]

        if actual_data.empty or pred_data.empty:
            print(f"Kh√¥ng c√≥ d·ªØ li·ªáu cho to·∫° ƒë·ªô ({lat}, {lon})")
            return

        cols = ["CLRSKY_SFC_SW_DWN", "PS", "T2M", "QV2M", "WS10M", "PRECTOTCORR"]
        plt.figure(figsize=(18, 12))
        for i, col in enumerate(cols):
            plt.subplot(3, 2, i+1)
            plt.plot(actual_data['Datetime'], actual_data[col], label='Th·ª±c t·∫ø', color='blue')
            plt.plot(pred_data['Datetime'], pred_data[f'Predicted_{col}'], label='D·ª± b√°o', linestyle='--', color='red')
            plt.title(f'{col} t·∫°i ({lat}, {lon})')
            plt.xlabel('Th·ªùi gian')
            plt.ylabel(col)
            plt.xticks(rotation=45)
            plt.grid(True)
            plt.legend()
        plt.tight_layout()
        plt.show()

    # Ch·ªçn m·ªôt to·∫° ƒë·ªô ph·ªï bi·∫øn nh·∫•t ƒë·ªÉ v·∫Ω
    most_common_coord = forecast_results.groupby(['Latitude', 'Longitude']).size().idxmax()
    plot_results_for_location(df_day_21, forecast_results, *most_common_coord)



‚úÖ S·ªë l∆∞·ª£ng d√≤ng ƒë∆∞·ª£c so s√°nh: 166464
    Datetime  Latitude  Longitude  Predicted_T2M  Predicted_QV2M
0 2024-07-26       8.0      102.0     480.932922       68.425316
1 2024-07-26       8.0      102.0     480.932922       68.425316
2 2024-07-26       8.0      102.0     480.932922       68.425316
3 2024-07-26       8.0      102.0     480.932922       68.425316
4 2024-07-26       8.0      102.0     480.932922       68.425316


KeyError: 'PRECTOTCORR'

In [1]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# 1. C·∫•u h√¨nh ƒë∆∞·ªùng d·∫´n
scaler_file = "../dataset/processed/scaler.pkl"
train_file = "../dataset/weather_data_25.csv"  # D√πng ƒë·ªÉ d·ª± b√°o
test_file = "../dataset/weather_data_26.csv"   # D√πng ƒë·ªÉ ƒë√°nh gi√°
checkpoint_path = "best_model_fine_tune.h5"

# 2. C√°c c·ªôt ƒë·∫∑c tr∆∞ng v√† m·ª•c ti√™u
feature_cols = ["Latitude", "Longitude", "hour", "day", "month", "season", 
               "WS10M", "QV2M", "PS", "PRECTOTCORR", "T2M", "CLRSKY_SFC_SW_DWN"]
target_cols = ["CLRSKY_SFC_SW_DWN", "PS", "T2M", "QV2M", "WS10M", "PRECTOTCORR"]

timesteps = 24
batch_size = 256

# 3. T·∫£i model v√† scaler
with open(scaler_file, 'rb') as f:
    scaler_X, scaler_y = pickle.load(f)

model = load_model(checkpoint_path)

# 4. H√†m ti·ªÅn x·ª≠ l√Ω d·ªØ li·ªáu (c·∫£i ti·∫øn)
def preprocess_data(df):
    # Chu·∫©n h√≥a t√™n c·ªôt
    if 'PRECTOT' in df.columns and 'PRECTOTCORR' not in df.columns:
        df = df.rename(columns={'PRECTOT': 'PRECTOTCORR'})
    
    # Th√™m c√°c ƒë·∫∑c tr∆∞ng th·ªùi gian
    df['Datetime'] = pd.to_datetime(df['Datetime'])
    df['hour'] = df['Datetime'].dt.hour.astype(np.int8)
    df['day'] = df['Datetime'].dt.day.astype(np.int8)
    df['month'] = df['Datetime'].dt.month.astype(np.int8)
    df['season'] = ((df['month'] % 12 + 3) // 3).astype(np.int8)
    
    # X·ª≠ l√Ω d·ªØ li·ªáu s·ªë
    for col in feature_cols + target_cols:
        if col in df.columns:
            if df[col].dtype == 'object':
                df[col] = pd.to_numeric(df[col].astype(str).str.extract('([-+]?\d*\.?\d+)')[0], errors='coerce')
            df[col] = df[col].fillna(df[col].mean())
    return df

def evaluation_data_generator(file_path):
    dtype_dict = {col: np.float32 for col in feature_cols + target_cols}
    dtype_dict.update({"hour": np.int8, "day": np.int8, "month": np.int8, "season": np.int8})

    for chunk in pd.read_csv(file_path, chunksize=batch_size*5, dtype=dtype_dict, parse_dates=["Datetime"]):
        chunk = preprocess_data(chunk)
        
        # Ki·ªÉm tra n·∫øu chunk ƒë·ªß d√†i
        if len(chunk) < timesteps + 24:
            print(f"Warning: Chunk size {len(chunk)} too small for timesteps={timesteps} and forecast=24. Skipping...")
            continue
        
        X_scaled = scaler_X.transform(chunk[feature_cols])
        y_scaled = scaler_y.transform(chunk[target_cols])

        X_batch, y_batch = [], []
        for i in range(len(chunk) - timesteps - 24):
            X_batch.append(X_scaled[i:i+timesteps])
            y_batch.append(y_scaled[i+timesteps:i+timesteps+24])
        
        if X_batch:  # Ch·ªâ yield n·∫øu c√≥ d·ªØ li·ªáu
            yield np.array(X_batch), np.array(y_batch)
        else:
            print("Warning: No valid sequences generated from this chunk.")

def debug_scalers(scaler_X, scaler_y):
    print("\n=== DEBUG SCALERS ===")
    print("Scaler_X details:")
    print(f"- Feature names: {scaler_X.feature_names_in_}")
    print(f"- Data min: {scaler_X.data_min_}")
    print(f"- Data max: {scaler_X.data_max_}")
    print(f"- Scale: {scaler_X.scale_}")
    print(f"- Min: {scaler_X.min_}")
    
    print("\nScaler_y details:")
    print(f"- Feature names: {scaler_y.feature_names_in_}")
    print(f"- Data min: {scaler_y.data_min_}")
    print(f"- Data max: {scaler_y.data_max_}")
    print(f"- Scale: {scaler_y.scale_}")
    print(f"- Min: {scaler_y.min_}")

def debug_data_sample(df, name="Data"):
    print(f"\n=== DEBUG {name.upper()} SAMPLE ===")
    print(f"Shape: {df.shape}")
    print("\nFirst 3 rows:")
    print(df.head(3))
    print("\nLast 3 rows:")
    print(df.tail(3))
    print("\nDescriptive stats:")
    print(df.describe())
    print("\nMissing values:")
    print(df.isnull().sum())

def debug_model_predictions(y_true, y_pred, target_names):
    print("\n=== DEBUG PREDICTIONS ===")
    for i, col in enumerate(target_names):
        print(f"\n{col}:")
        print(f"- True mean: {y_true[:,i].mean():.4f}")
        print(f"- Pred mean: {y_pred[:,i].mean():.4f}")
        print(f"- MAE: {np.mean(np.abs(y_true[:,i] - y_pred[:,i])):.4f}")
        print(f"- Max Error: {np.max(np.abs(y_true[:,i] - y_pred[:,i])):.4f}")
        
# 6. H√†m d·ª± b√°o (t·ª´ c√°ch th·ª© 1 nh∆∞ng ƒë√£ c·∫£i ti·∫øn)
def make_forecast(model, input_data):
    print("\n=== FORECAST DEBUG START ===")
    
    # Ti·ªÅn x·ª≠ l√Ω
    input_data = preprocess_data(input_data)
    debug_data_sample(input_data[feature_cols + target_cols], "Input Data")
    
    # L·∫•y d·ªØ li·ªáu cu·ªëi c√πng
    X_df = input_data[feature_cols].iloc[-timesteps:]
    print(f"\nUsing last {timesteps} timesteps for prediction:")
    print(X_df)
    
    # Chu·∫©n h√≥a
    print("\nScaling details:")
    print("First row before scaling:", X_df.iloc[0].values)
    X_scaled = scaler_X.transform(X_df)
    print("First row after scaling:", X_scaled[0])
    
    X_scaled = X_scaled.reshape(1, timesteps, len(feature_cols))
    
    # D·ª± b√°o
    print("\nMaking prediction...")
    y_pred_scaled = model.predict(X_scaled, verbose=1)
    print("\nRaw scaled predictions:")
    print(y_pred_scaled)
    
    # Inverse transform
    print("\nInverse transforming predictions...")
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, len(target_cols)))
    print("First 5 predicted values after inverse transform:")
    print(y_pred[:5])
    
    # T·∫°o DataFrame k·∫øt qu·∫£
    forecast_dates = pd.date_range(
        start=input_data['Datetime'].iloc[-1] + pd.Timedelta(hours=1),
        periods=24,
        freq='h'
    )
    
    print("\n=== FORECAST DEBUG END ===")
    return pd.DataFrame({
        'Datetime': forecast_dates,
        **{f'Predicted_{col}': y_pred[:, i] for i, col in enumerate(target_cols)}
    })

# 7. ƒê√°nh gi√° m√¥ h√¨nh
def evaluate_model():
    # T·∫°o dataset
    test_dataset = tf.data.Dataset.from_generator(
        lambda: evaluation_data_generator(test_file),
        output_signature=(
            tf.TensorSpec(shape=(None, timesteps, len(feature_cols)), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 24, len(target_cols)), dtype=tf.float32)
        )
    ).prefetch(tf.data.AUTOTUNE)

    # ƒê√°nh gi√°
    print("\nEvaluating model...")
    loss, mae = model.evaluate(test_dataset, verbose=1)
    print(f"Test Loss: {loss:.4f}, Test MAE: {mae:.4f}")

    # D·ª± b√°o m·∫´u ƒë·ªÉ debug
    print("\nRunning sample predictions for debugging...")
    for X_batch, y_batch in evaluation_data_generator(test_file):
        # L·∫•y batch ƒë·∫ßu ti√™n
        X_sample = X_batch[0:1]  # L·∫•y m·∫´u ƒë·∫ßu ti√™n
        y_true = y_batch[0:1]   # Gi√° tr·ªã th·ª±c t·∫ø t∆∞∆°ng ·ª©ng
        
        # D·ª± b√°o
        y_pred_scaled = model.predict(X_sample, verbose=0)
        
        # Inverse transform
        y_true = scaler_y.inverse_transform(y_true.reshape(-1, len(target_cols)))
        y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, len(target_cols)))
        
        # Debug
        debug_model_predictions(y_true, y_pred, target_cols)
        break  # Ch·ªâ ki·ªÉm tra batch ƒë·∫ßu ti√™n

# 8. So s√°nh d·ª± b√°o v·ªõi th·ª±c t·∫ø
def compare_results(forecast_df):
    # ƒê·ªçc v√† ti·ªÅn x·ª≠ l√Ω d·ªØ li·ªáu th·ª±c t·∫ø
    actual_df = preprocess_data(pd.read_csv(test_file))
    actual_24h = actual_df.iloc[:24].copy()
    
    # Gh√©p d·ªØ li·ªáu
    comparison = pd.merge(
        forecast_df.rename(columns={f'Predicted_{col}': col for col in target_cols}),
        actual_24h[['Datetime'] + target_cols],
        on='Datetime',
        suffixes=('_pred', '_actual')
    )
    
    # T√≠nh to√°n sai s·ªë
    for col in target_cols:
        comparison[f'{col}_error'] = comparison[f'{col}_actual'] - comparison[f'{col}_pred']
    
    # Visualize
    plt.figure(figsize=(15, 10))
    for i, col in enumerate(target_cols, 1):
        plt.subplot(3, 2, i)
        plt.plot(comparison['Datetime'], comparison[f'{col}_actual'], label='Th·ª±c t·∫ø')
        plt.plot(comparison['Datetime'], comparison[f'{col}_pred'], label='D·ª± b√°o')
        plt.title(col)
        plt.legend()
        plt.grid()
    plt.tight_layout()
    plt.show()
    
    return comparison

# H√†m check_data_distribution s·ª≠a l·∫°i
def check_data_distribution():
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)
    
    # Ti·ªÅn x·ª≠ l√Ω d·ªØ li·ªáu tr∆∞·ªõc khi ki·ªÉm tra
    train_df = preprocess_data(train_df)
    test_df = preprocess_data(test_df)
    
    print("\nPh√¢n ph·ªëi d·ªØ li·ªáu train vs test:")
    for col in feature_cols + target_cols:
        if col in train_df.columns and col in test_df.columns:
            plt.figure(figsize=(10, 4))
            plt.hist(train_df[col], bins=30, alpha=0.5, label='Train')
            plt.hist(test_df[col], bins=30, alpha=0.5, label='Test')
            plt.title(col)
            plt.legend()
            plt.show()
        else:
            print(f"Warning: Column {col} not found in data")

# H√†m main s·ª≠a l·∫°i
def main():
    try:
        # Ki·ªÉm tra scaler chi ti·∫øt
        debug_scalers(scaler_X, scaler_y)
        
        # Ki·ªÉm tra d·ªØ li·ªáu train/test
        print("\nChecking training data...")
        train_df = pd.read_csv(train_file)
        train_df = preprocess_data(train_df)
        debug_data_sample(train_df, "Training Data")
        
        print("\nChecking test data...")
        test_df = pd.read_csv(test_file)
        test_df = preprocess_data(test_df)
        debug_data_sample(test_df, "Test Data")
        
        # ƒê√°nh gi√° model v·ªõi debug
        evaluate_model()
        
        # D·ª± b√°o v·ªõi debug chi ti·∫øt
        print("\nMaking forecast with debug...")
        forecast_df = make_forecast(model, train_df)
        
        if forecast_df is not None:
            print("\nForecast results:")
            print(forecast_df)
            
            # So s√°nh v·ªõi th·ª±c t·∫ø
            comparison_df = compare_results(forecast_df)
            
            # T√≠nh to√°n sai s·ªë chi ti·∫øt
            print("\nError Analysis:")
            for col in target_cols:
                if f'{col}_error' in comparison_df.columns:
                    errors = comparison_df[f'{col}_error']
                    print(f"\n{col}:")
                    print(f"- Mean: {errors.mean():.4f}")
                    print(f"- MAE: {errors.abs().mean():.4f}")
                    print(f"- Max Error: {errors.abs().max():.4f}")
                    print(f"- Std Dev: {errors.std():.4f}")
                    print(f"- Median: {errors.median():.4f}")
                    
                    # Ph√¢n ph·ªëi sai s·ªë
                    plt.figure()
                    plt.hist(errors, bins=30)
                    plt.title(f'Error Distribution - {col}')
                    plt.show()
    
    except Exception as e:
        print(f"\n‚ùå Main Error: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()


=== DEBUG SCALERS ===
Scaler_X details:
- Feature names: ['Latitude' 'Longitude' 'hour' 'day' 'month' 'season' 'WS10M' 'QV2M' 'PS'
 'PRECTOTCORR' 'T2M' 'CLRSKY_SFC_SW_DWN']
- Data min: [   8.  102.    0.    1.    1.    1. -999. -999. -999.    0. -999. -999.]
- Data max: [  24.    118.     23.     31.     12.      4.   1072.65 1063.78 1031.07
 2178.57 1087.62 1073.12]
- Scale: [0.0625     0.0625     0.04347826 0.03333334 0.09090909 0.33333334
 0.00048271 0.00048478 0.00049259 0.00045902 0.00047924 0.0004826 ]
- Min: [-0.5        -6.375       0.         -0.03333334 -0.09090909 -0.33333334
  0.48222435  0.4842979   0.49210125  0.          0.47876468  0.4821149 ]

Scaler_y details:
- Feature names: ['CLRSKY_SFC_SW_DWN' 'PS' 'T2M' 'QV2M' 'WS10M' 'PRECTOTCORR']
- Data min: [-999. -999. -999. -999. -999.    0.]
- Data max: [1073.12 1031.07 1087.62 1063.78 1072.65 2178.57]
- Scale: [0.0004826  0.00049259 0.00047924 0.00048478 0.00048271 0.00045902]
- Min: [0.4821149  0.49210125 0.47876468 0.4

  numdigits = int(np.log10(self.target)) + 1
Traceback (most recent call last):
  File "C:\Users\tranh\AppData\Local\Temp\ipykernel_24376\458602434.py", line 263, in main
    evaluate_model()
  File "C:\Users\tranh\AppData\Local\Temp\ipykernel_24376\458602434.py", line 171, in evaluate_model
    loss, mae = model.evaluate(test_dataset, verbose=1)
  File "d:\anaconda3\envs\weather_lstm\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "d:\anaconda3\envs\weather_lstm\lib\site-packages\keras\utils\generic_utils.py", line 993, in update
    numdigits = int(np.log10(self.target)) + 1
OverflowError: cannot convert float infinity to integer


In [None]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
import pickle

# ƒê∆∞·ªùng d·∫´n t·ªõi scaler v√† m√¥ h√¨nh ƒë√£ l∆∞u
scaler_file = "../dataset/processed/scaler.pkl"
with open(scaler_file, 'rb') as f:
    scaler_X, scaler_y = pickle.load(f)

# C√°c c·ªôt ƒë·∫∑c tr∆∞ng v√† m·ª•c ti√™u
feature_cols = ["Latitude", "Longitude", "hour", "day", "month", "season", 
                "WS10M", "QV2M", "PS", "PRECTOTCORR", "T2M", "CLRSKY_SFC_SW_DWN"]
target_cols = ["CLRSKY_SFC_SW_DWN", "PS", "T2M", "QV2M", "WS10M", "PRECTOTCORR"] 

# Th·ªùi gian chu·ªói
timesteps = 24
batch_size = 256

# ƒê∆∞·ªùng d·∫´n t·ªõi file test v√† m√¥ h√¨nh ƒë√£ hu·∫•n luy·ªán
test_file = "../dataset/weather_data_26.csv"
checkpoint_path = "best_model_fine_tune.h5"
model = load_model(checkpoint_path)

# Hi·ªÉn th·ªã m√¥ h√¨nh
print("Model: \"sequential\"")
model.summary()

def data_generator(file_path, feature_cols, target_cols, batch_size=256, timesteps=24):
    dtype_dict = {col: np.float32 for col in feature_cols + target_cols}
    dtype_dict.update({"hour": np.int8, "day": np.int8, "month": np.int8, "season": np.int8})

    for chunk in pd.read_csv(file_path, chunksize=batch_size * 5, dtype=dtype_dict, parse_dates=["Datetime"], low_memory=True):
        # Th√™m c√°c c·ªôt th·ªùi gian
        chunk = add_time_features(chunk)  # G·ªçi h√†m ƒë·ªÉ th√™m c√°c c·ªôt hour, day, month, season
        
        # ƒê·∫£m b·∫£o c·ªôt PRECTOTCORR c√≥ m·∫∑t
        chunk = reorder_and_clean_columns(chunk)  # ƒê·∫£m b·∫£o PRECTOTCORR ƒë∆∞·ª£c th√™m v√†o
        
        # Ti·∫øn h√†nh chu·∫©n h√≥a d·ªØ li·ªáu
        X_scaled = scaler_X.transform(chunk[feature_cols])
        y_scaled = scaler_y.transform(chunk[target_cols])

        X_batch, y_batch = [], []
        for i in range(len(chunk) - timesteps - 24):
            X_batch.append(X_scaled[i:i+timesteps])
            y_batch.append(y_scaled[i+timesteps:i+timesteps+24])

        yield np.array(X_batch), np.array(y_batch)

# T·∫°o dataset t·ª´ generator
test_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(test_file, feature_cols, target_cols, batch_size, timesteps),
    output_signature=(
        tf.TensorSpec(shape=(None, timesteps, len(feature_cols)), dtype=tf.float32),  # S·ª≠a: S·ª≠ d·ª•ng None thay cho batch_size
        tf.TensorSpec(shape=(None, 24, len(target_cols)), dtype=tf.float32),  # S·ª≠a: S·ª≠ d·ª•ng None thay cho batch_size
    )
).prefetch(tf.data.AUTOTUNE)

# Ki·ªÉm tra n·∫øu c√≥ d·ªØ li·ªáu h·ª£p l·ªá
try:
    for X_batch, y_batch in data_generator(test_file, feature_cols, target_cols, batch_size, timesteps):
        print(f"X_batch shape: {X_batch.shape}, y_batch shape: {y_batch.shape}")
        break  # Ch·ªâ ki·ªÉm tra m·ªôt batch
except Exception as e:
    print(f"Error during data generation: {e}")

# ƒê√°nh gi√° m√¥ h√¨nh tr√™n t·∫≠p test
try:
    print("\nƒêang ƒë√°nh gi√° m√¥ h√¨nh tr√™n t·∫≠p test...")
    loss, mae = model.evaluate(test_dataset, verbose=1)
    print(f"‚úÖ K·∫øt qu·∫£ ƒë√°nh gi√° tr√™n d·ªØ li·ªáu test: Loss={loss:.4f}, MAE={mae:.4f}")
except Exception as e:
    print(f"Error during model evaluation: {e}")


Model: "sequential"
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 24, 128)           72192     
                                                                 
 dropout (Dropout)           (None, 24, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 144)               9360      
                                                                 
 reshape (Reshape)           (None, 24, 6)             0         
                                                                 
Total params: 130,960
Trainable para

Traceback (most recent call last):
  File "C:\Users\tranh\AppData\Local\Temp\ipykernel_24060\2475724206.py", line 95, in <module>
    print("Sample X values:", X_batch[0, 0, :5])  # In 5 gi√° tr·ªã ƒë·∫ßu ti√™n c·ªßa m·∫´u ƒë·∫ßu ti√™n
IndexError: too many indices for array: array is 1-dimensional, but 3 were indexed
