In [12]:
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
from scipy import sparse
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from ngboost import NGBRegressor
from sklearn.preprocessing import MinMaxScaler
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

plt.rcParams['font.family'] = 'Malgun Gothic'

In [8]:
all_data = pd.read_csv('asos_train.csv', encoding='cp949')


relevant_loc = ['영암에프원태양광b']

all_data = all_data[all_data['name'].isin(relevant_loc)]

all_data.fillna(-1, inplace=True)

all_data['date'] = pd.to_datetime(all_data['date'])
all_data['year'] = all_data['date'].dt.year
all_data['month'] = all_data['date'].dt.month
all_data['hour'] = all_data['date'].dt.hour
all_data['day'] = all_data['date'].dt.day


drop_columns = [ "asos_num", "location", "capacities", "land", "power/land",
               'Wind_Direction(16 compass points)', "Date/Time",
               'Sea-level_Pressure(hPa)',
               'Mid-Low_Cloud_Cover', 'Cloud_Form', 'Lowest_Cloud_Height(100m)', 'Ground_Condition', 'Phenomenon_Number',
                 '5cm_Underground_Temp', '10cm_Underground_Temp',
               '20cm_Underground_Temp', '30cm_Underground_Temp']

all_data = all_data.drop(columns=drop_columns)

all_data_train = all_data[all_data['year'].isin([2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])]
all_data_test = all_data[all_data['year'].isin([2021,2022])]

all_data_train.sort_values(by='date', inplace=True)
all_data_test.sort_values(by='date', inplace=True)

X_train = all_data_train.drop(columns=['power', 'name', 'date'])
y_train = all_data_train['power']
X_test = all_data_test.drop(columns=['power', 'name', 'date'])
y_test = all_data_test['power']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data_train.sort_values(by='date', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data_test.sort_values(by='date', inplace=True)


In [4]:
all_data.columns

Index(['power', 'date', 'name', 'power/land', 'Temp(C)', 'Prec(mm)',
       'Wind_speed(m/s)', 'Humidity', 'Vapor_pressure(hPa)', 'Dew_Point(C)',
       'Local Atmospheric Pressure(hPa)', 'sunshine(hr)',
       'Solar_Radiation(MJ/m2)', 'Snowfall(cm)', '3-hour_Fresh_Snowfall',
       'Cloud_Cover(1/10)', 'Visibility(10m)', 'Ground Temp', 'year', 'month',
       'hour', 'day'],
      dtype='object')

In [9]:
from itertools import combinations
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt

def evaluate(true, preds):
    mae = mean_absolute_error(true, preds)
    mse = mean_squared_error(true, preds)
    rmse = sqrt(mse)
    r2 = r2_score(true, preds)
    return mae, mse, rmse, r2


In [10]:
mandatory_features = ['Temp(C)', 'Solar_Radiation(MJ/m2)', 'Wind_speed(m/s)']
combinatorial_features = ['Prec(mm)', 'Humidity', 'Vapor_pressure(hPa)', 'Dew_Point(C)',
                          'Local Atmospheric Pressure(hPa)', 'sunshine(hr)',
                          'Snowfall(cm)', 
                          'Cloud_Cover(1/10)', 'Visibility(10m)', 'year', 'month',
                          'hour']

In [13]:
results = []

for r in range(1, len(combinatorial_features) + 1):
    for subset in combinations(combinatorial_features, r):
        used_features = list(subset) + mandatory_features
        print(f"Using features: {used_features}")

        X_train_subset = X_train[used_features]
        X_test_subset = X_test[used_features]

        ngb = NGBRegressor()
        ngb.fit(X_train_subset, y_train)
        y_preds = ngb.predict(X_test_subset)

        mae, mse, rmse, r2 = evaluate(y_test, y_preds)
        results.append({
            'features': used_features,
            'mae': mae,
            'mse': mse,
            'rmse': rmse,
            'r2': r2
        })

# 결과를 DataFrame으로 변환하여 보기 좋게 출력
results_df = pd.DataFrame(results)
print(results_df.sort_values(by="mae"))  # MAE 기준으로 정렬하여 출력


Using features: ['Prec(mm)', 'Temp(C)', 'Solar_Radiation(MJ/m2)', 'Wind_speed(m/s)']
[iter 0] loss=16.4519 val_loss=0.0000 scale=1.0000 norm=2963398.9849
[iter 100] loss=15.5582 val_loss=0.0000 scale=2.0000 norm=1759705.4016
[iter 200] loss=15.0864 val_loss=0.0000 scale=2.0000 norm=1294378.0113
[iter 300] loss=14.8624 val_loss=0.0000 scale=2.0000 norm=1241497.3725
[iter 400] loss=14.7702 val_loss=0.0000 scale=1.0000 norm=614939.8877
Using features: ['Humidity', 'Temp(C)', 'Solar_Radiation(MJ/m2)', 'Wind_speed(m/s)']
[iter 0] loss=16.4519 val_loss=0.0000 scale=1.0000 norm=2963398.9849
[iter 100] loss=15.5574 val_loss=0.0000 scale=2.0000 norm=1757664.1043
[iter 200] loss=15.0806 val_loss=0.0000 scale=2.0000 norm=1289983.7558
[iter 300] loss=14.8534 val_loss=0.0000 scale=2.0000 norm=1234779.6233
[iter 400] loss=14.7608 val_loss=0.0000 scale=1.0000 norm=611173.5171
Using features: ['Vapor_pressure(hPa)', 'Temp(C)', 'Solar_Radiation(MJ/m2)', 'Wind_speed(m/s)']
[iter 0] loss=16.4519 val_loss

[iter 100] loss=15.5586 val_loss=0.0000 scale=2.0000 norm=1754897.3148
[iter 200] loss=15.0754 val_loss=0.0000 scale=2.0000 norm=1271914.0039
[iter 300] loss=14.8425 val_loss=0.0000 scale=2.0000 norm=1213486.6494
[iter 400] loss=14.7510 val_loss=0.0000 scale=2.0000 norm=1199936.4651
Using features: ['Prec(mm)', 'Visibility(10m)', 'Temp(C)', 'Solar_Radiation(MJ/m2)', 'Wind_speed(m/s)']
[iter 0] loss=16.4519 val_loss=0.0000 scale=1.0000 norm=2963398.9849
[iter 100] loss=15.5581 val_loss=0.0000 scale=2.0000 norm=1759705.4016


KeyboardInterrupt: 