In [1]:
import pandas as pd
import numpy as np
import chardet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

In [26]:
raw = pd.read_csv('./data/climate/korea_climate_cloud.csv',index_col = False)
df_irradiance = raw
print(df_irradiance)

         지점  지점명          일시  평균기온(°C)  평균 상대습도(%)  합계 일조시간(hr)  \
0        90   속초  2014-12-01       1.9        39.3          8.6   
1        90   속초  2014-12-02      -3.7        26.5          8.9   
2        90   속초  2014-12-03       0.3        37.1          8.2   
3        90   속초  2014-12-04      -1.3        29.5          9.0   
4        90   속초  2014-12-05      -3.0        31.3          9.0   
...     ...  ...         ...       ...         ...          ...   
315021  296  북부산  2023-12-27       4.4        62.4          7.8   
315022  296  북부산  2023-12-28       3.8        66.6          8.4   
315023  296  북부산  2023-12-29       2.5        66.1          8.4   
315024  296  북부산  2023-12-30       4.2        73.8          NaN   
315025  296  북부산  2023-12-31       6.3        73.9          4.0   

        합계 일사량(MJ/m2)  평균 전운량(1/10)  
0                 NaN           3.4  
1                 NaN           0.3  
2                 NaN           1.6  
3                 NaN           0.8  
4    

In [27]:
df_irradiance = df_irradiance.dropna()
df_irradiance = df_irradiance.drop(columns=['지점명'])
label_encoder = LabelEncoder()
df_irradiance['일시'] = label_encoder.fit_transform(df_irradiance['일시'])
print(df_irradiance)

         지점    일시  평균기온(°C)  평균 상대습도(%)  합계 일조시간(hr)  합계 일사량(MJ/m2)  \
2818     90  2818      24.8        77.8          0.7           0.00   
3318     93   670      18.6        79.3          3.8          11.07   
3319     93   671      17.0        89.0          0.0           3.19   
3320     93   672      20.5        81.1          7.5          16.29   
3321     93   673      18.3        72.4         10.6          19.79   
...     ...   ...       ...         ...          ...            ...   
315017  296  3309      -2.6        45.8          7.8          10.85   
315021  296  3313       4.4        62.4          7.8          10.59   
315022  296  3314       3.8        66.6          8.4          10.60   
315023  296  3315       2.5        66.1          8.4          10.96   
315025  296  3317       6.3        73.9          4.0           7.93   

        평균 전운량(1/10)  
2818             8.4  
3318             7.5  
3319             9.4  
3320             4.8  
3321             1.8  
...      

In [22]:
X = df_irradiance.drop(columns=['합계 일사량(MJ/m2)'])
y = df_irradiance['합계 일사량(MJ/m2)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state =42 )


In [23]:
#하이퍼파라미터 그리드 설정
param_grid = {
    #already done 100,400
    'n_estimators':[800,1000,1200],
    #already done 0.05,0.2
    'learning_rate':[0.1,],
    #already done 3,6,11
    'max_depth':[9],
    # 'subsample' :[0.6,0.8,1.0],
    # 'min_child_weight': [1, 5, 10],
    # 'colsample_bytree': [0.6, 0.8, 1.0],

}
model = XGBRegressor()
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_root_mean_squared_error', cv = 5, verbose=3)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

print(f"Best parameters: {best_params}")

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END learning_rate=0.1, max_depth=9, n_estimators=800;, score=-1.393 total time=   3.0s
[CV 2/5] END learning_rate=0.1, max_depth=9, n_estimators=800;, score=-1.395 total time=   3.0s
[CV 3/5] END learning_rate=0.1, max_depth=9, n_estimators=800;, score=-1.393 total time=   3.1s
[CV 4/5] END learning_rate=0.1, max_depth=9, n_estimators=800;, score=-1.401 total time=   2.9s
[CV 5/5] END learning_rate=0.1, max_depth=9, n_estimators=800;, score=-1.417 total time=   3.0s
[CV 1/5] END learning_rate=0.1, max_depth=9, n_estimators=1000;, score=-1.392 total time=   3.7s
[CV 2/5] END learning_rate=0.1, max_depth=9, n_estimators=1000;, score=-1.395 total time=   3.7s
[CV 3/5] END learning_rate=0.1, max_depth=9, n_estimators=1000;, score=-1.393 total time=   3.7s
[CV 4/5] END learning_rate=0.1, max_depth=9, n_estimators=1000;, score=-1.400 total time=   3.5s
[CV 5/5] END learning_rate=0.1, max_depth=9, n_estimators=1000;, score=-

In [36]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(predictions)

Mean Squared Error (MSE): 1.911142744109557
Mean Absolute Error (MAE): 0.983427595806852
Root Mean Squared Error (RMSE): 1.3824408645976713
R-squared (R2): 0.9634483784062944
        Actual  Predicted
140785   13.80  12.144712
50563     8.78   8.481064
43507     6.42   6.460678
121524    9.96   6.976803
74698    24.70  25.311760
...        ...        ...
102528    3.94   2.316966
260444    3.39   4.024034
113122   12.52  14.253346
257802    7.97   8.940817
140028    8.29   6.768033

[28963 rows x 2 columns]


In [32]:
df_irradiance_predict = raw.drop(columns=['지점명'])

# '합계 일사량(MJ/m2)' 열에 NaN이 있는 행들을 필터링
nan_rows = df_irradiance_predict[df_irradiance_predict['합계 일사량(MJ/m2)'].isna()]

# 다른 열들에 NaN이 없는 행들만 선택
non_nan_columns = nan_rows.columns.difference(['합계 일사량(MJ/m2)'])
df_irradiance_predict = nan_rows.dropna(subset=non_nan_columns)

df_irradiance_predict['일시'] = label_encoder.fit_transform(df_irradiance_predict['일시'])
df_irradiance_predict = df_irradiance_predict.drop(columns = ['합계 일사량(MJ/m2)'])
print(df_irradiance_predict)

         지점    일시  평균기온(°C)  평균 상대습도(%)  합계 일조시간(hr)  평균 전운량(1/10)
0        90     0       1.9        39.3          8.6           3.4
1        90     1      -3.7        26.5          8.9           0.3
2        90     2       0.3        37.1          8.2           1.6
3        90     3      -1.3        29.5          9.0           0.8
4        90     4      -3.0        31.3          9.0           0.0
...     ...   ...       ...         ...          ...           ...
314668  295  3315       3.8        66.4          8.0           2.5
314669  295  3316       4.9        81.5          5.9           5.6
314670  295  3317       6.9        71.3          2.6           4.8
314993  296  3285       5.3        58.5          0.0           7.8
314998  296  3290       4.4        61.5          0.0           2.5

[168992 rows x 6 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_irradiance_predict['일시'] = label_encoder.fit_transform(df_irradiance_predict['일시'])


In [39]:

#원본과 같은 인덱스인지 검사 
# for index in df_irradiance_predict.index:
#     if(df_irradiance_predict.loc[index, '평균기온(°C)']==raw.loc[index, '평균기온(°C)']):
#        print("일치")

#실제 일사량 결측치 대입
for index in df_irradiance_predict.index:
    row = df_irradiance_predict.loc[index].values.reshape(1,-1)
    y = best_model.predict(row)
    raw.loc[index, '합계 일사량(MJ/m2)'] = y
print(raw)
raw.to_csv('./data/climate/korea_climate_irradiance.csv',index = False)

         지점  지점명          일시  평균기온(°C)  평균 상대습도(%)  합계 일조시간(hr)  \
0        90   속초  2014-12-01       1.9        39.3          8.6   
1        90   속초  2014-12-02      -3.7        26.5          8.9   
2        90   속초  2014-12-03       0.3        37.1          8.2   
3        90   속초  2014-12-04      -1.3        29.5          9.0   
4        90   속초  2014-12-05      -3.0        31.3          9.0   
...     ...  ...         ...       ...         ...          ...   
315021  296  북부산  2023-12-27       4.4        62.4          7.8   
315022  296  북부산  2023-12-28       3.8        66.6          8.4   
315023  296  북부산  2023-12-29       2.5        66.1          8.4   
315024  296  북부산  2023-12-30       4.2        73.8          NaN   
315025  296  북부산  2023-12-31       6.3        73.9          4.0   

        합계 일사량(MJ/m2)  평균 전운량(1/10)  
0            9.593059           3.4  
1           11.134921           0.3  
2           10.280556           1.6  
3           10.913384           0.8  
4    