In [1]:
import pandas as pd
import numpy as np
import os
inpath = "../data/ASOS108"
fnlist = os.listdir(inpath)
dfs = []
for fn in fnlist:
    df = pd.read_csv(os.path.join(inpath, fn), encoding='cp949')
    dfs.append(df)
df_raw = pd.concat(dfs)

In [2]:
use_col = ['일시', '평균기온(°C)', '최저기온(°C)', '최저기온 시각(hhmi)', '최고기온(°C)', '최고기온 시각(hhmi)', '일강수량(mm)', '평균 풍속(m/s)', 
           '최다풍향(16방위)', '평균 이슬점온도(°C)', '최소 상대습도(%)', '최소 상대습도 시각(hhmi)', '평균 상대습도(%)', 
           '평균 증기압(hPa)', '평균 현지기압(hPa)', '최고 해면기압(hPa)', '최고 해면기압 시각(hhmi)', '최저 해면기압(hPa)', '최저 해면기압 시각(hhmi)',
           '평균 해면기압(hPa)', '평균 전운량(1/10)', '합계 일사량(MJ/m2)']
df_raw['일강수량(mm)'] = df_raw['일강수량(mm)'].fillna(0)
df_raw1 = df_raw[use_col].dropna()
df_raw1['datetime'] = pd.to_datetime(df_raw1['일시'])
df_raw1['year'] = df_raw1['datetime'].dt.year
df_raw1['month'] = df_raw1['datetime'].dt.month
df_raw1['day'] = df_raw1['datetime'].dt.day
df_raw1 = df_raw1.sort_values(by='datetime')

target = '평균기온(°C)'
df_raw1['Y'] = df_raw1[target].shift(-1)
df_raw1 = df_raw1.iloc[:-1]


In [3]:
# https://www.kaggle.com/code/enisezengin/autogluon-privatescore-2-97008
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.font_manager as fm
fn_font = 'NanumGothic.ttf'
fontprop = fm.FontProperties(fname=fn_font, size=10)

def plot_distribution(df, features, figsize_org=(3, 4), **kwargs):
    n_feats = len(features)
    
    num_cols = n_feats if n_feats < 4 else 2 if n_feats == 4 else 3
    num_rows = int(np.ceil(n_feats / num_cols))
    figsize = (figsize_org[0] * num_cols * 2, figsize_org[1] * num_rows)
    plt.figure(figsize=figsize)
    for i, feature in enumerate(features):
        plt.subplot(num_rows, num_cols, i + 1)
        sns.histplot(df[feature], kde=True, **kwargs)
        plt.ylabel('Frequency', fontproperties= fontprop)
        plt.xlabel(feature, fontproperties=fontprop)

num_features = df_raw1.select_dtypes(exclude=['object', 'datetime64[ns]', 'int32']).columns
#plot_distribution(df_raw1, num_features[1:])

In [4]:
# font_path = "NanumGothic.ttf"
# fontprop = fm.FontProperties(fname=font_path)
# plt.rc('font', family=fontprop.get_name())


In [5]:
target = '평균기온(°C)'
# target_corr =  corr_matrix[target].abs().sort_values()
# low_corr_features = target_corr[target_corr < 0.1].index.tolist()
# print(f'Low corr features: {low_corr_features}')

In [6]:
target = '평균기온(°C)'

def split_data(df, date):
    datetime = df['datetime'][df['datetime'] >= date]
    train_df = df[df['datetime'] < date].drop('datetime', axis=1)
    test_df = df[df['datetime'] >= date].drop('datetime', axis=1)
    train_X = train_df.drop('Y', axis=1)
    train_y = train_df['Y']
    test_X = test_df.drop('Y', axis=1)
    test_y = test_df['Y']
    return train_X, train_y, test_X, test_y, datetime
df = df_raw1.drop(['일시'], axis=1)
train_X, train_y, test_X, test_y, datetime = split_data(df, '2022-01-01')
print(len(train_X), len(test_X))

9828 1088


In [7]:
import plotly.express as px
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from scipy.stats import pearsonr

model = LinearRegression()
model.fit(train_X, train_y)
predictions = model.predict(test_X)


rmse = np.sqrt(mean_squared_error(test_y, predictions))
corr, _ = pearsonr(test_y, predictions)

fig = px.scatter(x=test_y, y=predictions, labels={'x': 'Actual', 'y': 'Predicted'}, 
                 title=f'Real vs Predicted (RMSE: {rmse:.2f}, Corr: {corr:.2f})')
fig.update_layout(
    xaxis=dict(scaleanchor="y", range=[-15, 35]),
    yaxis=dict(scaleanchor="x", range=[-15, 35]),
    width = 600,
    height = 600
)

fig.update_traces(marker=dict(size=2, opacity=0.6, line=dict(width=1, color='DarkSlateGrey')), selector=dict(mode='markers'))
fig.show()

# RMSE와 상관계수 출력
print(f"RMSE: {rmse:.2f}")
print(f"Correlation: {corr:.2f}")



RMSE: 2.10
Correlation: 0.98


In [8]:
import plotly.graph_objects as go
tickvals = datetime[::30]  
ticktext = datetime.dt.strftime('%Y-%m-%d')[::30]
# 상관계수 계산
corr, _ = pearsonr(test_y, predictions)
fig = go.Figure()

# 실제값과 예측값 시각화
fig.add_trace(go.Scatter(x=datetime, y=test_y, mode='lines', name='Actual', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=datetime, y=predictions, mode='lines', name='Predicted', line=dict(color='red')))

fig.update_layout(
    title=f'Real vs Predicted (RMSE: {rmse:.2f}, Corr: {corr:.2f})'
)

In [9]:
corr, _ = pearsonr(test_y[:-1], predictions[1:])
print(corr)

0.9946690507872825
