In [242]:
import os
import re
from pathlib import Path
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

In [243]:
mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False

In [244]:
def is_gzip_file(filepath):
    with open(filepath, 'rb') as f:
        return f.read(2) == b'\x1f\x8b'
def display_all(df):
    plot_cols = df.columns
    plot_features = df[plot_cols]
    plot_features.index = df.index
    _ = plot_features.plot(subplots=True)

    plot_features = df[plot_cols][:960]
    plot_features.index = df.index[:960]
    _ = plot_features.plot(subplots=True)

In [245]:
csv_path = tf.keras.utils.get_file(
    origin='https://itsci.mju.ac.th/downloads/watcharin/datasets/pv/cmmju_15min.csv.tar.gz')
    # origin='https://itsci.mju.ac.th/downloads/watcharin/datasets/pv/era5-hourly_timeseries.csv.tar.gz')
    # origin='https://itsci.mju.ac.th/downloads/watcharin/datasets/pv/era5-land_timeseries.csv.tar.gz')
csv_path

'C:\\Users\\ASUS\\.keras\\datasets\\cmmju_15min.csv.tar.gz'

In [246]:
if is_gzip_file(csv_path):
  # Read the data directly into a pandas DataFrame
  df = pd.read_csv(csv_path, compression='gzip')
else:
  # Read the data without compression
  df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,cmmju_15min.csv,ambient_temp,current_power,value_of_consumption,external_energy_supply,grid_feed_in,internal_power_supply,self_consumption,module_temp,total_irradiation,...,sp,sshf,ssr,ssrd,str,strd,t2m,tp,u10,v10
0,2021-11-06 11:15:00,30.288889,46519.355556,37280.422222,0.0,9238.933333,37280.422222,37280.422222,58.022222,798.444444,...,,,,,,,,,,
1,2021-11-06 11:30:00,30.076923,47600.714286,39472.626374,0.0,9397.756098,39132.406593,39132.406593,58.263736,812.571429,...,,,,,,,,,,
2,2021-11-06 11:45:00,30.629213,47056.988764,32669.393258,0.0,14387.595506,32669.393258,32669.393258,57.235955,807.224719,...,,,,,,,,,,
3,2021-11-06 12:00:00,31.021978,48415.835165,36126.516484,0.0,12289.318681,36126.516484,36126.516484,56.846154,817.527473,...,,,,,,,,,,
4,2021-11-06 12:15:00,31.831461,49345.404494,36879.966292,0.0,12465.438202,36879.966292,36879.966292,55.842697,816.651685,...,,,,,,,,,,


In [248]:
date_col = 'datetime' 
df = df.rename(columns={df.columns[0]: date_col})

# Ensure timestamp column is datetime type
if df[date_col].dtype != 'datetime64[ns]':
    df[date_col] = pd.to_datetime(df[date_col])
    
row_count = df.shape[0]
print(f"Total rows: {row_count}")
df.head(5)

Total rows: 138238


Unnamed: 0,datetime,ambient_temp,current_power,value_of_consumption,external_energy_supply,grid_feed_in,internal_power_supply,self_consumption,module_temp,total_irradiation,...,sp,sshf,ssr,ssrd,str,strd,t2m,tp,u10,v10
0,2021-11-06 11:15:00,30.288889,46519.355556,37280.422222,0.0,9238.933333,37280.422222,37280.422222,58.022222,798.444444,...,,,,,,,,,,
1,2021-11-06 11:30:00,30.076923,47600.714286,39472.626374,0.0,9397.756098,39132.406593,39132.406593,58.263736,812.571429,...,,,,,,,,,,
2,2021-11-06 11:45:00,30.629213,47056.988764,32669.393258,0.0,14387.595506,32669.393258,32669.393258,57.235955,807.224719,...,,,,,,,,,,
3,2021-11-06 12:00:00,31.021978,48415.835165,36126.516484,0.0,12289.318681,36126.516484,36126.516484,56.846154,817.527473,...,,,,,,,,,,
4,2021-11-06 12:15:00,31.831461,49345.404494,36879.966292,0.0,12465.438202,36879.966292,36879.966292,55.842697,816.651685,...,,,,,,,,,,


In [249]:
print(df[date_col].dt.year.unique())
print(df[date_col].dt.year.value_counts())

[2021 2022 2023 2024 2025]
datetime
2024    35136
2023    35040
2022    35040
2025    27691
2021     5331
Name: count, dtype: int64


In [250]:
# ให้แน่ใจว่า index เป็น DatetimeIndex
df.index = pd.to_datetime(df[date_col])
df = df.sort_index()
print(df.shape)
df.head(5)

(138238, 28)


Unnamed: 0_level_0,datetime,ambient_temp,current_power,value_of_consumption,external_energy_supply,grid_feed_in,internal_power_supply,self_consumption,module_temp,total_irradiation,...,sp,sshf,ssr,ssrd,str,strd,t2m,tp,u10,v10
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-11-06 11:15:00,2021-11-06 11:15:00,30.288889,46519.355556,37280.422222,0.0,9238.933333,37280.422222,37280.422222,58.022222,798.444444,...,,,,,,,,,,
2021-11-06 11:30:00,2021-11-06 11:30:00,30.076923,47600.714286,39472.626374,0.0,9397.756098,39132.406593,39132.406593,58.263736,812.571429,...,,,,,,,,,,
2021-11-06 11:45:00,2021-11-06 11:45:00,30.629213,47056.988764,32669.393258,0.0,14387.595506,32669.393258,32669.393258,57.235955,807.224719,...,,,,,,,,,,
2021-11-06 12:00:00,2021-11-06 12:00:00,31.021978,48415.835165,36126.516484,0.0,12289.318681,36126.516484,36126.516484,56.846154,817.527473,...,,,,,,,,,,
2021-11-06 12:15:00,2021-11-06 12:15:00,31.831461,49345.404494,36879.966292,0.0,12465.438202,36879.966292,36879.966292,55.842697,816.651685,...,,,,,,,,,,


In [251]:
print(df.index.year.unique())
print(df.index.year.value_counts())

Index([2021, 2022, 2023, 2024, 2025], dtype='int32', name='datetime')
datetime
2024    35136
2023    35040
2022    35040
2025    27691
2021     5331
Name: count, dtype: int64


In [252]:
print(df.index.min())
print(df.index.max())

2021-11-06 11:15:00
2025-10-16 10:30:00


In [253]:
# Try inference first
interval_time = pd.infer_freq(df.index)
print(interval_time)

if interval_time is None:
    delta = df.index.to_series().diff().dropna()
    d = delta.mode().iloc[0] if not delta.mode().empty else delta.median()
    if d % pd.Timedelta(days=1) == pd.Timedelta(0):
        interval_time = 'D'
    elif d % pd.Timedelta(hours=1) == pd.Timedelta(0):
        interval_time = f"{int(d / pd.Timedelta(hours=1))}h"
    elif d % pd.Timedelta(minutes=1) == pd.Timedelta(0):
        interval_time = f"{int(d / pd.Timedelta(minutes=1))}min"
    else:
        interval_time = f"{int(d.total_seconds())}s"
print(interval_time)

start_date = df.index.min().floor(interval_time)
end_date = df.index.max().floor(interval_time)
complete_index = pd.date_range(start=start_date, end=end_date, freq=interval_time)
print(complete_index.shape)
print(complete_index.min())
print(complete_index.max())

15min
15min
(138238,)
2021-11-06 11:15:00
2025-10-16 10:30:00


In [254]:
# Create a DataFrame with the complete index
df_complete = pd.DataFrame(index=complete_index)
# Join with original data
df_with_all_times = df_complete.join(df)
df_with_all_times = df_with_all_times.sort_index()

print(df.shape)
print(df_with_all_times.shape)

(138238, 28)
(138238, 28)


In [255]:
# ตรวจค่าซ้ำ
dup_mask = df_with_all_times.index.duplicated(keep='first')
print("มี index ซ้ำ:", dup_mask.any())

# ถ้ามี ให้ลบทิ้งหรือ aggregate ตามต้องการ
df_with_all_times = df_with_all_times[~dup_mask]
print(df_with_all_times.shape)

มี index ซ้ำ: False
(138238, 28)


In [256]:
# Print missing data statistics
missing_count = df_with_all_times.isna().any(axis=1).sum()
print(f"Found {missing_count} missing timestamps out of {len(complete_index)} expected")

Found 5769 missing timestamps out of 138238 expected


In [257]:
# นับจำนวน missing values ก่อนทำการ fill
print("\nจำนวน Missing Values ก่อนทำการ fill:")
for col in df_with_all_times.columns:
    missing_count = df_with_all_times[col].isnull().sum()
    if missing_count > 0:
        print(f"  {col}: {missing_count} missing values")


จำนวน Missing Values ก่อนทำการ fill:
  cc: 530 missing values
  q: 530 missing values
  r: 530 missing values
  t: 530 missing values
  u: 530 missing values
  v: 530 missing values
  fal: 5239 missing values
  slhf: 5239 missing values
  sp: 5239 missing values
  sshf: 5239 missing values
  ssr: 5239 missing values
  ssrd: 5239 missing values
  str: 5239 missing values
  strd: 5239 missing values
  t2m: 5239 missing values
  tp: 5239 missing values
  u10: 5239 missing values
  v10: 5239 missing values


In [258]:
# Identify numeric columns
numeric_cols = df_with_all_times.select_dtypes(include=['number']).columns

# ทำการ fill ข้อมูลตาม strategy ที่กำหนด
for col in numeric_cols:
    df_with_all_times[col] = df_with_all_times[col].interpolate(method='cubic')

    # หมายเหตุ: สำหรับข้อมูลขอบสุดที่อาจหาค่าไม่ได้ (NaN) 
    # อาจต้องใช้ bfill() หรือ ffill() ปิดท้ายเล็กน้อย
    df_with_all_times = df_with_all_times.bfill().ffill()
    print(f"  ✅ {col}: cubic")

  ✅ ambient_temp: cubic
  ✅ current_power: cubic
  ✅ value_of_consumption: cubic
  ✅ external_energy_supply: cubic
  ✅ grid_feed_in: cubic
  ✅ internal_power_supply: cubic
  ✅ self_consumption: cubic
  ✅ module_temp: cubic
  ✅ total_irradiation: cubic
  ✅ cc: cubic
  ✅ q: cubic
  ✅ r: cubic
  ✅ t: cubic
  ✅ u: cubic
  ✅ v: cubic
  ✅ fal: cubic
  ✅ slhf: cubic
  ✅ sp: cubic
  ✅ sshf: cubic
  ✅ ssr: cubic
  ✅ ssrd: cubic
  ✅ str: cubic
  ✅ strd: cubic
  ✅ t2m: cubic
  ✅ tp: cubic
  ✅ u10: cubic
  ✅ v10: cubic


In [259]:
# ตรวจสอบผลลัพธ์หลัง fill
print("จำนวน Missing Values หลังทำการ fill:")
total_missing_after = 0
all_columns = []
for col in df_with_all_times.columns:
    missing_count = df_with_all_times[col].isnull().sum()
    total_missing_after += missing_count
    if missing_count > 0:
        print(f"  ⚠️  {col}: {missing_count} missing values (ยังเหลือ)")
    else:
        print(f"  ✅ {col}: ไม่มี missing values")

จำนวน Missing Values หลังทำการ fill:
  ✅ datetime: ไม่มี missing values
  ✅ ambient_temp: ไม่มี missing values
  ✅ current_power: ไม่มี missing values
  ✅ value_of_consumption: ไม่มี missing values
  ✅ external_energy_supply: ไม่มี missing values
  ✅ grid_feed_in: ไม่มี missing values
  ✅ internal_power_supply: ไม่มี missing values
  ✅ self_consumption: ไม่มี missing values
  ✅ module_temp: ไม่มี missing values
  ✅ total_irradiation: ไม่มี missing values
  ✅ cc: ไม่มี missing values
  ✅ q: ไม่มี missing values
  ✅ r: ไม่มี missing values
  ✅ t: ไม่มี missing values
  ✅ u: ไม่มี missing values
  ✅ v: ไม่มี missing values
  ✅ fal: ไม่มี missing values
  ✅ slhf: ไม่มี missing values
  ✅ sp: ไม่มี missing values
  ✅ sshf: ไม่มี missing values
  ✅ ssr: ไม่มี missing values
  ✅ ssrd: ไม่มี missing values
  ✅ str: ไม่มี missing values
  ✅ strd: ไม่มี missing values
  ✅ t2m: ไม่มี missing values
  ✅ tp: ไม่มี missing values
  ✅ u10: ไม่มี missing values
  ✅ v10: ไม่มี missing values


In [260]:
df_with_all_times.head(3)

Unnamed: 0,datetime,ambient_temp,current_power,value_of_consumption,external_energy_supply,grid_feed_in,internal_power_supply,self_consumption,module_temp,total_irradiation,...,sp,sshf,ssr,ssrd,str,strd,t2m,tp,u10,v10
2021-11-06 11:15:00,2021-11-06 11:15:00,30.288889,46519.355556,37280.422222,0.0,9238.933333,37280.422222,37280.422222,58.022222,798.444444,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-11-06 11:30:00,2021-11-06 11:30:00,30.076923,47600.714286,39472.626374,0.0,9397.756098,39132.406593,39132.406593,58.263736,812.571429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-11-06 11:45:00,2021-11-06 11:45:00,30.629213,47056.988764,32669.393258,0.0,14387.595506,32669.393258,32669.393258,57.235955,807.224719,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [261]:
print("จัดการ missing values ที่เหลือด้วยวิธีเสริม...")
# สำหรับข้อมูลตัวเลข: ใช้ค่าเฉลี่ยของข้อมูลที่อยู่รอบๆ
numeric_cols = df_with_all_times.select_dtypes(include=[np.number]).columns

for col in numeric_cols:
    if df_with_all_times[col].isnull().sum() > 0:
        # ใช้ interpolation สำหรับข้อมูลตัวเลข
        df_with_all_times[col] = df_with_all_times[col].interpolate(method='linear')
        remaining = df_with_all_times[col].isnull().sum()
        if remaining > 0:
            # หากยังมี missing อยู่ให้ใช้ค่าเฉลี่ย
            df_with_all_times[col] = df_with_all_times[col].fillna(df_with_all_times[col].mean())
        print(f"  ✅ {col}: interpolate + fillna(mean)")

จัดการ missing values ที่เหลือด้วยวิธีเสริม...


In [262]:
# Print missing data statistics
missing_count = df_with_all_times.isna().any(axis=1).sum()
print(f"Found {missing_count} missing timestamps out of {len(complete_index)} expected")

Found 0 missing timestamps out of 138238 expected


In [263]:
nan_rows = df_with_all_times[df_with_all_times.isna().any(axis=1)]
nan_rows

Unnamed: 0,datetime,ambient_temp,current_power,value_of_consumption,external_energy_supply,grid_feed_in,internal_power_supply,self_consumption,module_temp,total_irradiation,...,sp,sshf,ssr,ssrd,str,strd,t2m,tp,u10,v10


In [264]:
# Replace all NaN values in the DataFrame with 0
df_with_all_times = df_with_all_times.fillna(0)

In [265]:
# Count NaNs in each column
print(df_with_all_times.isna().sum())

datetime                  0
ambient_temp              0
current_power             0
value_of_consumption      0
external_energy_supply    0
grid_feed_in              0
internal_power_supply     0
self_consumption          0
module_temp               0
total_irradiation         0
cc                        0
q                         0
r                         0
t                         0
u                         0
v                         0
fal                       0
slhf                      0
sp                        0
sshf                      0
ssr                       0
ssrd                      0
str                       0
strd                      0
t2m                       0
tp                        0
u10                       0
v10                       0
dtype: int64


In [266]:
df_with_all_times.min()

datetime                  2021-11-06 11:15:00
ambient_temp                              0.0
current_power                             0.0
value_of_consumption                      0.0
external_energy_supply                    0.0
grid_feed_in                              0.0
internal_power_supply                     0.0
self_consumption                          0.0
module_temp                               0.0
total_irradiation                         0.0
cc                                  -0.110163
q                                    0.000937
r                                    5.090677
t                                  280.777981
u                                  -12.051889
v                                   -11.03489
fal                                 -0.028204
slhf                         -15280650.573819
sp                              -18516.034518
sshf                          -8831089.820976
ssr                           -3413031.227638
ssrd                          -399

In [267]:
df_with_all_times.max()

datetime                  2025-10-16 10:30:00
ambient_temp                        40.010989
current_power                     78259.89011
value_of_consumption            145244.764045
external_energy_supply          145244.764045
grid_feed_in                    151143.101124
internal_power_supply            74895.123596
self_consumption                 74895.123596
module_temp                         61.373626
total_irradiation                 1183.054945
cc                                   1.119586
q                                    0.017467
r                                  102.171829
t                                  303.340049
u                                   16.371889
v                                   13.513545
fal                                  0.194048
slhf                            1704576.39175
sp                              114096.239025
sshf                            916058.336064
ssr                           24950505.042308
ssrd                          2993

In [268]:
len(df_with_all_times)

138238

In [269]:
filename = Path(csv_path).name
base = re.split(r'\.', filename, maxsplit=1)[0]
print(base)

cmmju_15min


In [270]:
# Compressed CSV
df_with_all_times.to_csv('C:\\Users\\ASUS\\.keras\\datasets\\' + base + '_filled.csv', index=False)