In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns

# Matplotlib setup: Malgun Gothic (a default Korean font on Windows) renders the
# Hangul labels; turning unicode_minus off keeps minus signs from breaking.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# Weekly potato data. week_start is parsed as a datetime so the .dt accessors
# below can derive the calendar columns.
df = pd.read_csv(
    "감자/감자(EDA용)_스케일링만.csv",
    encoding='cp949',
    parse_dates=["week_start"],
)
# NOTE(review): `year` is the calendar year while `week` is the ISO week number;
# a week_start in the last days of December can carry ISO week 1 — confirm this
# pairing is what the year/week plots expect.
df = df.assign(
    year=df['week_start'].dt.year,
    month=df['week_start'].dt.month,
    week=df['week_start'].dt.isocalendar().week,
)

def get_season(month):
    """Return the Korean season label for a calendar month number.

    3-5 -> '봄' (spring), 6-8 -> '여름' (summer), 9-11 -> '가을' (autumn);
    every other value, including 12, 1 and 2, falls through to '겨울' (winter).
    """
    season_months = (
        ('봄', (3, 4, 5)),
        ('여름', (6, 7, 8)),
        ('가을', (9, 10, 11)),
    )
    for label, months in season_months:
        if month in months:
            return label
    return '겨울'

# Label every row with its season, derived element-wise from the month column.
df['season'] = df['month'].apply(get_season)

In [None]:
# Restrict to 2020-2024 (both ends inclusive), then pivot the mean price so the
# index is the week number and there is one column per year.
df_filtered = df[(df['year'] >= 2020) & (df['year'] <= 2024)]

weekly_avg = (
    df_filtered
    .groupby(['week', 'year'])['평균단가(원)']
    .mean()
    .unstack()
)

# One line per year; the legend is anchored outside the axes on the right.
ax = weekly_avg.plot(figsize=(12, 5), title="연도별 주차 평균단가 (2020~2024년)")
ax.set_xlabel("주차 (1~52)")
ax.set_ylabel("평균단가")
ax.legend(title='연도', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
plt.show()

In [None]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib.patches import Patch
import seaborn as sns
import re

# Korean-capable font (Malgun Gothic ships with Windows) and intact minus signs.
plt.rcParams.update({'font.family': 'Malgun Gothic', 'axes.unicode_minus': False})

# Every CSV directly under EDA/ is processed by the loop later in this cell.
file_list = glob.glob('EDA/*.csv')

def get_season(month):
    """Return the Korean season label for a calendar month number.

    3-5 -> '봄', 6-8 -> '여름', 9-11 -> '가을'; anything else -> '겨울'.
    This re-defines the function of the same name from the first cell with the
    same mapping.
    """
    season_months = (
        ('봄', (3, 4, 5)),
        ('여름', (6, 7, 8)),
        ('가을', (9, 10, 11)),
    )
    for label, months in season_months:
        if month in months:
            return label
    return '겨울'
    
# Month range shown next to each season name in the x tick labels.
season_legend = {
    '봄': '3,4,5월',
    '여름': '6,7,8월',
    '가을': '9,10,11월',
    '겨울': '12,1,2월'
}

# Bar colour used for each season.
season_colors = {
    '봄': 'salmon',
    '여름': 'skyblue',
    '가을': 'gold',
    '겨울': 'lightgray'
}

# One bar chart per CSV: mean 2024 price for each season. A failure on one file
# is reported and the loop moves on to the next file.
for file_path in file_list:
    try:
        df = pd.read_csv(file_path, encoding='cp949', parse_dates=['week_start'])

        df = df.assign(
            year=df['week_start'].dt.year,
            month=df['week_start'].dt.month,
        )
        df['season'] = df['month'].apply(get_season)

        df_2024 = df.loc[df['year'] == 2024]

        # reindex fixes the bar order; a season with no 2024 rows becomes NaN.
        seasonal_avg = (
            df_2024
            .groupby('season')['평균단가(원)']
            .mean()
            .reindex(['봄', '여름', '가을', '겨울'])
        )

        # The item name is whatever precedes the first '(' in the file name.
        filename = os.path.basename(file_path)
        item_name = filename.split('(')[0]

        seasons = seasonal_avg.index.tolist()
        values = seasonal_avg.values.tolist()
        colors = [season_colors[name] for name in seasons]
        xtick_labels = [f"{name}({season_legend[name]})" for name in seasons]
        positions = range(len(seasons))

        plt.figure(figsize=(6, 4))
        plt.bar(positions, values, color=colors)
        plt.xticks(ticks=positions, labels=xtick_labels, rotation=0)

        plt.title(f"{item_name.strip()} - 2024년 계절별 평균단가")
        plt.xlabel("계절(월 범위)")
        plt.ylabel("평균단가(원)")
        plt.grid(axis='y')
        plt.tight_layout()
        plt.show()

    except Exception as e:
        print(f"⚠️ 파일 처리 중 오류 발생: {file_path}")
        print(f"에러 내용: {e}")

In [None]:
# Price distribution per season.
# NOTE(review): `df` here is whatever frame the preceding per-file loop loaded
# last — confirm that is the intended item.
plt.figure(figsize=(11, 3.5))
season_ax = sns.boxplot(
    data=df,
    x='season',
    y='평균단가(원)',
    order=['봄', '여름', '가을', '겨울'],
)
season_ax.set_title("계절별 평균단가 분포")
season_ax.set_xlabel("계절")
season_ax.set_ylabel("평균단가")
season_ax.grid(True)
plt.show()

In [None]:
file_list = glob.glob('EDA/*.csv')

# Per CSV: mean price (left axis) and mean traded volume (right axis) by week
# number, averaged over every year in the file. Errors are reported per file.
for file_path in file_list:
    try:
        df = pd.read_csv(file_path, encoding='cp949', parse_dates=['week_start'])
        df['week'] = df['week_start'].dt.isocalendar().week

        weekly_avg = (
            df.groupby('week')[['평균단가(원)', '총거래량(kg)']]
            .mean()
            .reset_index()
        )

        # Item name = file name up to the first '('.
        filename = os.path.basename(file_path)
        item_name = filename.split('(')[0].strip()

        fig, ax1 = plt.subplots(figsize=(10, 6))

        # Left axis: price.
        line1, = ax1.plot(
            weekly_avg['week'], weekly_avg['평균단가(원)'],
            color='tab:blue', marker='o', label='평균단가',
        )
        ax1.set_xlabel("주차")
        ax1.set_ylabel("평균단가(원)", color='tab:blue')
        ax1.tick_params(axis='y', labelcolor='tab:blue')

        # Right axis sharing the same x: volume.
        ax2 = ax1.twinx()
        line2, = ax2.plot(
            weekly_avg['week'], weekly_avg['총거래량(kg)'],
            color='tab:orange', marker='x', label='총거래량',
        )
        ax2.set_ylabel("총거래량", color='tab:orange')
        ax2.tick_params(axis='y', labelcolor='tab:orange')

        # A single legend that covers the lines of both axes.
        lines = [line1, line2]
        labels = [handle.get_label() for handle in lines]
        ax1.legend(lines, labels, loc='upper right')

        plt.title(f"{item_name} - 주차별 평균단가 & 총거래량 (전체 연도 평균)")
        fig.tight_layout()
        plt.grid(True)
        plt.show()

    except Exception as e:
        print(f"⚠️ 파일 처리 중 오류 발생: {file_path}")
        print(f"에러 내용: {e}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns

# Reload the potato data for the modelling cells that follow.
df = pd.read_csv(
    "감자/감자(EDA용)_스케일링만.csv",
    encoding='cp949',
    parse_dates=["week_start"],
)

# Korean-capable font (Malgun Gothic ships with Windows) and intact minus signs.
plt.rcParams.update({'font.family': 'Malgun Gothic', 'axes.unicode_minus': False})

# Calendar features taken from week_start.
df = df.assign(
    year=df['week_start'].dt.year,
    month=df['week_start'].dt.month,
    week=df['week_start'].dt.isocalendar().week,
    dayofweek=df['week_start'].dt.dayofweek,
)

In [None]:
# Target is the average unit price; every other column except week_start is
# used as a feature.
target_col = '평균단가(원)'
X = df.drop(columns=[target_col, 'week_start'])  # week_start is the time index, so it is excluded
y = df[target_col]

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Five splits from TimeSeriesSplit. The loop only rebinds the train/test
# variables, so after it finishes they hold the LAST fold's split.
tscv = TimeSeriesSplit(n_splits=5)
for train_idx, test_idx in tscv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [None]:
# Print the first and last week_start covered by the train and test part of
# each fold.
week_starts = df['week_start']
for i, (train_idx, test_idx) in enumerate(tscv.split(df), start=1):
    train_first, train_last = train_idx[0], train_idx[-1]
    test_first, test_last = test_idx[0], test_idx[-1]
    print(f"[Fold {i}]")
    print("Train 기간:", week_starts.iloc[train_first], "→", week_starts.iloc[train_last])
    print("Test  기간:", week_starts.iloc[test_first], "→", week_starts.iloc[test_last])

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

# Random forest scored with 5 TimeSeriesSplit folds; MAE, RMSE and R² are
# printed per fold and collected in the three lists.
model = RandomForestRegressor(n_estimators=100, random_state=42)
tscv = TimeSeriesSplit(n_splits=5)

mae_list, rmse_list, r2_list = [], [], []

for fold, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # The same estimator object is refitted on every fold.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    mae_list.append(mae)
    rmse_list.append(rmse)
    r2_list.append(r2)

    print(f"[Fold {fold}] MAE: {mae:.2f} | RMSE: {rmse:.2f} | R²: {r2:.4f}")

In [None]:
# Refit on the full data, then draw the importances as horizontal bars in
# ascending order (argsort), so the largest ends up at the top.
model.fit(X, y)

feature_names = X.columns
importances = model.feature_importances_
sorted_idx = np.argsort(importances)
bar_positions = range(len(sorted_idx))

# Figure height is half the feature count (integer division).
plt.figure(figsize=(8, len(feature_names) // 2))
plt.barh(bar_positions, importances[sorted_idx], align='center')
plt.yticks(bar_positions, feature_names[sorted_idx])
plt.title("Feature Importance (Random Forest)")
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from xgboost import XGBRegressor

# Load the radish data, dropping the grade code when present, and order the
# rows by time.
df = pd.read_csv("EDA/무(EDA용)_스케일링만.csv", encoding="cp949")

df = df.drop(columns=["등급코드"], errors="ignore")
df["week_start"] = pd.to_datetime(df["week_start"])
df = df.sort_values("week_start")

# Cyclical encoding of the week number with a 52-week period. This has to be
# computed on `df`: `df_model` is only created further down, so touching it
# here raises NameError on a fresh kernel (or writes to a stale frame).
df["week_sin"] = np.sin(2 * np.pi * df["week"] / 52)
df["week_cos"] = np.cos(2 * np.pi * df["week"] / 52)

target_col = "평균단가(원)"
lag_features = [
    "일평균기온", "최고기온", "최저기온", "평균상대습도", "강수량(mm)", "1시간최고강수량(mm)",
    "일평균기온_t-1", "최고기온_t-1", "최저기온_t-1", "평균상대습도_t-1", "강수량(mm)_t-1", "1시간최고강수량(mm)_t-1",
    "일평균기온_t-2", "최고기온_t-2", "최저기온_t-2", "평균상대습도_t-2", "강수량(mm)_t-2", "1시간최고강수량(mm)_t-2",
    "일평균기온_t-3", "최고기온_t-3", "최저기온_t-3", "평균상대습도_t-3", "강수량(mm)_t-3", "1시간최고강수량(mm)_t-3"
]
derived_features = ["holiday_flag", "holiday_score", "grow_score"]
categorical_cols = ["품종코드", "직팜산지코드"]
numeric_features = ["총거래량(kg)", "week_sin", "week_cos"]

# Rows missing the target or any listed column are discarded.
used_features = numeric_features + lag_features + derived_features + categorical_cols + ["year", "week", "week_start"]
df_model = df.dropna(subset=[target_col] + used_features)

# dtype=float keeps the one-hot columns numeric. With the bool default of
# pandas >= 2.0 a later select_dtypes(include=[np.number]) would drop them.
df_model = pd.get_dummies(df_model, columns=categorical_cols, dtype=float)

In [None]:
# Train on data up to and including 2024. The calendar columns are removed and
# only numeric columns are kept as float32 features.
df_model = df_model[df_model["year"] <= 2024].copy()
y = df_model["평균단가(원)"]
X = df_model.drop(columns=["평균단가(원)", "year", "week", "week_start"], errors="ignore")
X = X.select_dtypes(include=[np.number]).astype(np.float32)

# NOTE(review): train_test_split shuffles by default, so validation rows are
# interleaved in time with training rows — confirm a random split is intended
# for this weekly series.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
xgb_model = XGBRegressor(objective="reg:squarederror", random_state=42)

# 3 x 3 x 2 x 2 = 36 candidate settings.
param_grid = {
    "max_depth": [4, 6, 8],
    "learning_rate": [0.05, 0.1, 0.2],
    "n_estimators": [100, 200],
    "subsample": [0.8, 1.0]
}

# GridSearchCV maximises its score. A make_scorer(mean_squared_error, ...)
# scorer defaults to greater_is_better=True, which would make the search pick
# the setting with the HIGHEST RMSE, and the `squared` keyword is not accepted
# by recent scikit-learn. The built-in negated-RMSE scorer avoids both.
# The name `rmse_scorer` is kept so existing references keep working.
rmse_scorer = "neg_root_mean_squared_error"

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring=rmse_scorer,
    cv=3,  # number of cross-validation folds
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

In [None]:
from sklearn.metrics import r2_score

# Imported here so the cell does not depend on an unrelated earlier cell.
from sklearn.metrics import mean_absolute_error

# Score the refitted best estimator on the held-out validation rows.
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_val)

# RMSE is the square root of the MSE; the `squared=False` shortcut is not
# accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"[Validation] MAE: {mae:.2f} | RMSE: {rmse:.2f} | R²: {r2:.4f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from xgboost import XGBRegressor

# Radish data: drop the grade code if present, parse week_start, order by time,
# and add a 52-week cyclical encoding of the week number.
df = (
    pd.read_csv("EDA/무(EDA용)_스케일링만.csv", encoding="cp949")
    .drop(columns=["등급코드"], errors="ignore")
    .assign(week_start=lambda frame: pd.to_datetime(frame["week_start"]))
    .sort_values("week_start")
    .assign(
        week_sin=lambda frame: np.sin(2 * np.pi * frame["week"] / 52),
        week_cos=lambda frame: np.cos(2 * np.pi * frame["week"] / 52),
    )
)

target_col = "평균단가(원)"

# Six weather measurements, each at the current week and at lags t-1 .. t-3.
lag_features = [
    f"{measurement}{suffix}"
    for suffix in ("", "_t-1", "_t-2", "_t-3")
    for measurement in (
        "일평균기온", "최고기온", "최저기온", "평균상대습도", "강수량(mm)", "1시간최고강수량(mm)",
    )
]
derived_features = ["holiday_flag", "holiday_score", "grow_score"]
categorical_cols = ["품종코드", "직팜산지코드"]
numeric_features = ["총거래량(kg)", "week_sin", "week_cos"]

# Columns that must be non-null for a row to be used.
used_features = [
    *numeric_features,
    *lag_features,
    *derived_features,
    *categorical_cols,
    "year", "week", "week_start",
]

In [None]:
# Keep only rows with the target and every listed column present.
df_model = df.dropna(subset=[target_col] + used_features).copy()

# dtype=float keeps the one-hot columns numeric. With the bool default of
# pandas >= 2.0 the select_dtypes(np.number) filter below would drop them.
df_model = pd.get_dummies(df_model, columns=categorical_cols, dtype=float)

# Train on data up to and including 2024; calendar columns are not features.
df_model = df_model[df_model["year"] <= 2024]
y = df_model["평균단가(원)"]
X = df_model.drop(columns=["평균단가(원)", "year", "week", "week_start"], errors="ignore")
X = X.select_dtypes(include=[np.number]).astype(np.float32)

In [None]:
# Time-ordered 5-split cross-validation and the XGBoost grid to search
# (3 x 3 x 2 x 2 = 36 settings).
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {
    "max_depth": [4, 6, 8],
    "learning_rate": [0.05, 0.1, 0.2],
    "n_estimators": [100, 200],
    "subsample": [0.8, 1.0]
}

In [None]:
# GridSearchCV maximises its score. A make_scorer(mean_squared_error, ...)
# scorer defaults to greater_is_better=True, which would make the search pick
# the setting with the HIGHEST RMSE, and the `squared` keyword is not accepted
# by recent scikit-learn. The built-in negated-RMSE scorer avoids both.
rmse_scorer = "neg_root_mean_squared_error"

grid_search = GridSearchCV(
    estimator=XGBRegressor(objective="reg:squarederror", random_state=42),
    param_grid=param_grid,
    scoring=rmse_scorer,
    cv=tscv,
    verbose=1,
    n_jobs=-1
)

In [None]:
# Run the grid search over all of X / y using the configured CV splitter.
grid_search.fit(X, y)

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Radish data: drop the grade code if present, parse week_start, order by time,
# and add a 52-week cyclical encoding of the week number.
df = (
    pd.read_csv("EDA/무(EDA용)_스케일링만.csv", encoding="cp949")
    .drop(columns=["등급코드"], errors="ignore")
    .assign(week_start=lambda frame: pd.to_datetime(frame["week_start"]))
    .sort_values("week_start")
    .assign(
        week_sin=lambda frame: np.sin(2 * np.pi * frame["week"] / 52),
        week_cos=lambda frame: np.cos(2 * np.pi * frame["week"] / 52),
    )
)

target_col = "평균단가(원)"

# Six weather measurements, each at the current week and at lags t-1 .. t-3.
lag_features = [
    f"{measurement}{suffix}"
    for suffix in ("", "_t-1", "_t-2", "_t-3")
    for measurement in (
        "일평균기온", "최고기온", "최저기온", "평균상대습도", "강수량(mm)", "1시간최고강수량(mm)",
    )
]
derived_features = ["holiday_flag", "holiday_score", "grow_score"]
categorical_cols = ["품종코드", "직팜산지코드"]
numeric_features = ["총거래량(kg)", "week_sin", "week_cos"]

# Columns that must be non-null for a row to be used.
used_features = [
    *numeric_features,
    *lag_features,
    *derived_features,
    *categorical_cols,
    "year", "week", "week_start",
]

In [None]:
# Keep only rows with the target and every listed column present.
df_model = df.dropna(subset=[target_col] + used_features).copy()
# dtype=float keeps the one-hot columns numeric. With the bool default of
# pandas >= 2.0 the select_dtypes(np.number) filter below would drop them.
df_model = pd.get_dummies(df_model, columns=categorical_cols, dtype=float)

# Train on data up to and including 2024; calendar columns are not features.
df_model = df_model[df_model["year"] <= 2024]
y = df_model["평균단가(원)"]
X = df_model.drop(columns=["평균단가(원)", "year", "week", "week_start"], errors="ignore")
X = X.select_dtypes(include=[np.number]).astype(np.float32)

In [None]:
# Hard-coded hyperparameters.
# NOTE(review): presumably copied from an earlier grid-search result — confirm
# they are still the selected values.
tuned_params = {
    "learning_rate": 0.2,
    "max_depth": 8,
    "n_estimators": 200,
    "subsample": 0.8,
}
best_model = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    **tuned_params,
)
best_model.fit(X, y)

In [None]:
# In-sample fit quality: the model is scored on the same X / y it was fitted
# on, so these numbers are not an estimate of out-of-sample error.
y_pred = best_model.predict(X)
# RMSE is the square root of the MSE; the `squared=False` shortcut is not
# accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y, y_pred))
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)

print(f"[Train] MAE: {mae:.2f} | RMSE: {rmse:.2f} | R²: {r2:.4f}")

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Radish data: drop the grade code if present, parse week_start, order by time,
# and add a 52-week cyclical encoding of the week number.
df = (
    pd.read_csv("EDA/무(EDA용)_스케일링만.csv", encoding="cp949")
    .drop(columns=["등급코드"], errors="ignore")
    .assign(week_start=lambda frame: pd.to_datetime(frame["week_start"]))
    .sort_values("week_start")
    .assign(
        week_sin=lambda frame: np.sin(2 * np.pi * frame["week"] / 52),
        week_cos=lambda frame: np.cos(2 * np.pi * frame["week"] / 52),
    )
)

target_col = "평균단가(원)"

# Six weather measurements, each at the current week and at lags t-1 .. t-3.
lag_features = [
    f"{measurement}{suffix}"
    for suffix in ("", "_t-1", "_t-2", "_t-3")
    for measurement in (
        "일평균기온", "최고기온", "최저기온", "평균상대습도", "강수량(mm)", "1시간최고강수량(mm)",
    )
]
derived_features = ["holiday_flag", "holiday_score", "grow_score"]
categorical_cols = ["품종코드", "직팜산지코드"]
numeric_features = ["총거래량(kg)", "week_sin", "week_cos"]
# Columns that must be non-null for a row to be used.
used_features = [
    *numeric_features,
    *lag_features,
    *derived_features,
    *categorical_cols,
    "year", "week", "week_start",
]

# Keep only rows with the target and every listed column present.
df_model = df.dropna(subset=[target_col] + used_features).copy()
# dtype=float keeps the one-hot columns numeric. With the bool default of
# pandas >= 2.0 the select_dtypes(np.number) filters below would drop them.
df_model = pd.get_dummies(df_model, columns=categorical_cols, dtype=float)

# Hold out the most recent 52 weeks as the test set; everything on or before
# the cutoff is training data.
latest_date = df_model["week_start"].max()
cutoff_date = latest_date - pd.Timedelta(weeks=52)
test_df = df_model[df_model["week_start"] > cutoff_date].copy()

train_df = df_model[df_model["week_start"] <= cutoff_date].copy()

# Calendar columns are not features; the target is split off.
X_train = train_df.drop(columns=[target_col, "year", "week", "week_start"], errors="ignore")
y_train = train_df[target_col]
X_test = test_df.drop(columns=[target_col, "year", "week", "week_start"], errors="ignore")
y_test = test_df[target_col]

X_train = X_train.select_dtypes(include=[np.number]).astype(np.float32)
X_test = X_test.select_dtypes(include=[np.number]).astype(np.float32)

# Fit with fixed hyperparameters on the training period and score on the
# held-out most recent 52 weeks.
model = XGBRegressor(
    objective="reg:squarederror",
    learning_rate=0.2,
    max_depth=8,
    n_estimators=200,
    subsample=0.8,
    random_state=42
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# RMSE is the square root of the MSE; the `squared=False` shortcut is not
# accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"[Test] MAE: {mae:.2f} | RMSE: {rmse:.2f} | R²: {r2:.4f}")

In [None]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import pandas as pd

plt.rcParams.update({"font.family": "Malgun Gothic", "axes.unicode_minus": False})


def _year_week_label(frame):
    """Build 'YYYY-WW' labels from a frame's year and week columns."""
    return frame["year"].astype(str) + "-" + frame["week"].astype(str).str.zfill(2)


# Attach predictions and actuals to a copy of the test rows.
test_df = test_df.assign(예측값=y_pred, 실제값=y_test.values)

# The (up to) 52 most recent distinct weeks, in time order.
recent_weeks = (
    test_df[["year", "week", "week_start"]]
    .drop_duplicates()
    .sort_values("week_start")
    .tail(52)
    .reset_index(drop=True)
)
recent_weeks["연도_주차"] = _year_week_label(recent_weeks)

# Mean actual / predicted price per (year, week).
weekly_result = (
    test_df.groupby(["year", "week"])[["실제값", "예측값"]]
    .mean()
    .reset_index()
)
weekly_result["연도_주차"] = _year_week_label(weekly_result)

plot_df = recent_weeks.merge(weekly_result, on=["year", "week", "연도_주차"], how="left")

plt.figure(figsize=(16, 6))
for value_col, series_label, marker in (
    ("실제값", "실제 평균단가", "o"),
    ("예측값", "예측 평균단가", "x"),
):
    plt.plot(plot_df["연도_주차"], plot_df[value_col], label=series_label, marker=marker)
plt.title("예측 vs 실제 평균단가 (최근 1년, 주차별)")
plt.xlabel("연도-주차")
plt.ylabel("평균단가(원)")
plt.xticks(rotation=45, fontsize=9)
plt.grid(True, linestyle="--", alpha=0.5)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import pandas as pd

# Forecast weeks 23-30 of 2025 by feeding the model a synthetic row per week:
# the mean of every numeric 2025 column, with only the week-derived fields
# replaced.
future_weeks = [23, 24, 25, 26, 27, 28, 29, 30]
base_year = 2025
last_week_start = df_model[df_model["year"] == 2025]["week_start"].max()

# Column-wise mean over the numeric columns of all 2025 rows in df_model.
template = df_model[df_model["year"] == 2025].copy()
recent_avg = template.select_dtypes(include=[np.number]).mean()

future_rows = []
for week in future_weeks:
    row = recent_avg.copy()
    row["year"] = base_year
    row["week"] = week
    row["week_sin"] = np.sin(2 * np.pi * week / 52)
    row["week_cos"] = np.cos(2 * np.pi * week / 52)
    # NOTE(review): the offset assumes the last observed 2025 week is week 22 — confirm.
    row["week_start"] = last_week_start + pd.Timedelta(weeks=week - 22)  # continues after week 22
    future_rows.append(row)

future_input = pd.DataFrame(future_rows)

# Any training column that the averaged rows lack is added as all zeros.
for col in X_train.columns:
    if col not in future_input.columns:
        future_input[col] = 0  # covers one-hot columns that are missing

# Same columns, in the same order, as the training matrix.
X_future = future_input[X_train.columns].astype(np.float32)

future_preds = model.predict(X_future)

result_df = pd.DataFrame({
    "year": future_input["year"],
    "week": future_input["week"],
    "week_start": future_input["week_start"],
    "예측값": future_preds
})

# 'YYYY-WW' label with a zero-padded week.
result_df["연도_주차"] = (
    result_df["year"].astype(int).astype(str) + "-" + result_df["week"].astype(int).astype(str).str.zfill(2)
)

result_df = result_df[["연도_주차", "예측값", "week_start"]]
display(result_df)

In [None]:
import matplotlib.pyplot as plt

plt.rcParams.update({"font.family": "Malgun Gothic", "axes.unicode_minus": False})

# Line chart of the predicted prices stored in result_df.
plt.figure(figsize=(10, 5))
plt.plot(
    result_df["연도_주차"],
    result_df["예측값"],
    marker="o",
    color="tomato",
    label="예측 평균단가",
)
plt.title("2025년 23~30주차 예측 평균단가")
plt.xlabel("연도-주차")
plt.ylabel("평균단가(원)")
plt.xticks(rotation=45)
plt.grid(True, linestyle="--", alpha=0.6)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Radish data: drop the grade code if present, parse week_start, order by time,
# and add a 52-week cyclical encoding of the week number.
df = (
    pd.read_csv("EDA/무(EDA용)_스케일링만.csv", encoding="cp949")
    .drop(columns=["등급코드"], errors="ignore")
    .assign(week_start=lambda frame: pd.to_datetime(frame["week_start"]))
    .sort_values("week_start")
    .assign(
        week_sin=lambda frame: np.sin(2 * np.pi * frame["week"] / 52),
        week_cos=lambda frame: np.cos(2 * np.pi * frame["week"] / 52),
    )
)

# Price-history features, computed separately for each producing region.
group_cols = ["직팜산지코드"]
price_by_region = df.groupby(group_cols)["평균단가(원)"]

# Price of the previous one / two rows within the same region.
df["평균단가(원)_lag1"] = price_by_region.shift(1)
df["평균단가(원)_lag2"] = price_by_region.shift(2)

# Rolling means of the PREVIOUS prices (shift(1) excludes the current row).
# transform() runs shift + rolling inside each region and returns a Series on
# df's own index. Chaining .rolling() after groupby(...).shift() would roll
# over the whole frame across regions, and a following
# reset_index(level=0, drop=True) would discard the only index level and
# misalign the values with the time-sorted frame.
df["평균단가(원)_ma3"] = price_by_region.transform(
    lambda prices: prices.shift(1).rolling(window=3, min_periods=1).mean()
)
df["평균단가(원)_ma5"] = price_by_region.transform(
    lambda prices: prices.shift(1).rolling(window=5, min_periods=1).mean()
)

In [None]:
# Column groups for the model that adds price-history features.
target_col = "평균단가(원)"
# Every column whose name contains "_t-" (the lagged columns).
# NOTE(review): columns without that suffix are not matched by this filter —
# confirm the un-lagged weather columns are meant to be left out of the list.
lag_features = [col for col in df.columns if "_t-" in col]
derived_features = ["holiday_flag", "holiday_score", "grow_score"]
categorical_cols = ["품종코드"]
numeric_features = ["총거래량(kg)", "week_sin", "week_cos"]
price_features = ["평균단가(원)_lag1", "평균단가(원)_lag2", "평균단가(원)_ma3", "평균단가(원)_ma5"]

# Columns that must be non-null for a row to be kept.
used_features = (
    numeric_features + lag_features + derived_features +
    categorical_cols + price_features + ["직팜산지코드", "year", "week", "week_start"]
)

# Complete rows only, restricted to years up to and including 2024.
df_model = df.dropna(subset=[target_col] + used_features).copy()
df_model = df_model[df_model["year"] <= 2024].copy()

In [None]:
# Out-of-fold target encoding of the region code: each row receives the mean
# target of its region computed on the other folds only.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
df_model["직팜산지코드_te"] = np.nan

for train_idx, val_idx in kf.split(df_model):
    train_fold = df_model.iloc[train_idx]
    val_fold = df_model.iloc[val_idx]
    means = train_fold.groupby("직팜산지코드")[target_col].mean()
    df_model.iloc[val_idx, df_model.columns.get_loc("직팜산지코드_te")] = val_fold["직팜산지코드"].map(means)

# Regions absent from a fold's training part map to NaN; use the overall mean.
global_mean = df_model[target_col].mean()
df_model["직팜산지코드_te"] = df_model["직팜산지코드_te"].fillna(global_mean)

# dtype=float keeps the one-hot columns numeric. With the bool default of
# pandas >= 2.0 the select_dtypes(np.number) filter below would drop them.
df_model = pd.get_dummies(df_model, columns=["품종코드"], dtype=float)

y = df_model[target_col]
X = df_model.drop(columns=["평균단가(원)", "year", "week", "week_start", "직팜산지코드"], errors="ignore")
X = X.select_dtypes(include=[np.number]).astype(np.float32)

xgb = XGBRegressor(objective="reg:squarederror", random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

xgb = XGBRegressor(objective="reg:squarederror", random_state=42)

# 3 x 2 x 2 x 2 x 2 = 48 candidate settings.
param_grid = dict(
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[6, 8],
    n_estimators=[100, 200],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Scored with negated RMSE (higher is better), 3-fold CV.
grid_search = GridSearchCV(
    xgb,
    param_grid,
    scoring="neg_root_mean_squared_error",
    cv=3,
    verbose=1,
    n_jobs=-1,  # use every CPU core
)

grid_search.fit(X, y)

print("📌 최적 하이퍼파라미터:")
print(grid_search.best_params_)

In [None]:
# Hard-coded hyperparameters.
# NOTE(review): presumably copied from the grid-search output — confirm they
# match the current best_params_.
tuned_params = {
    "learning_rate": 0.05,
    "max_depth": 6,
    "n_estimators": 200,
    "subsample": 1.0,
    "colsample_bytree": 0.8,
}
best_model = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    **tuned_params,
)
best_model.fit(X, y)

In [None]:
# In-sample fit quality of best_model (scored on the data it was fitted on).
# The prediction must come from best_model: `model` is a different estimator
# fitted in an earlier cell on a different feature matrix.
y_pred = best_model.predict(X)
# RMSE is the square root of the MSE; the `squared=False` shortcut is not
# accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y, y_pred))
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)

print(f"[Train] MAE: {mae:.2f} | RMSE: {rmse:.2f} | R²: {r2:.4f}")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Evaluate best_model on the 2025 rows that have every listed column present.
test_df = df[df["year"] == 2025].dropna(subset=used_features).copy()
y_test = test_df["평균단가(원)"]

# Target-encode the region with the per-region mean price of the training
# frame (df_model); regions unseen in training get the overall training mean.
# Without this the model's 직팜산지코드_te feature would be absent from the test
# frame and silently zero-filled by the missing-column loop below.
region_means = df_model.groupby("직팜산지코드")[target_col].mean()
test_df["직팜산지코드_te"] = (
    test_df["직팜산지코드"].map(region_means).fillna(df_model[target_col].mean())
)

X_test = test_df.drop(columns=["평균단가(원)", "year", "week", "week_start", "직팜산지코드"], errors="ignore")
X_test = pd.get_dummies(X_test, columns=["품종코드"])

# Align with the columns the model was fitted on: add absent one-hot columns
# as zeros, then select in the model's column order.
model_cols = best_model.feature_names_in_
missing_cols = set(model_cols) - set(X_test.columns)
for col in missing_cols:
    X_test[col] = 0
X_test = X_test[model_cols]  # match the model's column order

y_pred = best_model.predict(X_test)

plt.rcParams["font.family"] = "Malgun Gothic"  # Korean-capable font
plt.rcParams["axes.unicode_minus"] = False

test_df = test_df.copy()
test_df["예측값"] = y_pred
test_df["실제값"] = y_test.values

# The (up to) 52 most recent distinct weeks, in time order.
recent_weeks = (
    test_df[["year", "week", "week_start"]]
    .drop_duplicates()
    .sort_values("week_start")
    .tail(52)
    .reset_index(drop=True)
)
recent_weeks["연도_주차"] = recent_weeks["year"].astype(str) + "-" + recent_weeks["week"].astype(str).str.zfill(2)

# Mean actual / predicted price per (year, week).
weekly_result = (
    test_df.groupby(["year", "week"])[["실제값", "예측값"]]
    .mean()
    .reset_index()
)
weekly_result["연도_주차"] = weekly_result["year"].astype(str) + "-" + weekly_result["week"].astype(str).str.zfill(2)

plot_df = recent_weeks.merge(weekly_result, on=["year", "week", "연도_주차"], how="left")

In [None]:
# Actual vs predicted weekly mean price, one line each, taken from plot_df.
plt.figure(figsize=(16, 6))
for value_col, series_label, marker in (
    ("실제값", "실제 평균단가", "o"),
    ("예측값", "예측 평균단가", "x"),
):
    plt.plot(plot_df["연도_주차"], plot_df[value_col], label=series_label, marker=marker)
plt.title("예측 vs 실제 평균단가 (최근 1년, 주차별)")
plt.xlabel("연도-주차")
plt.ylabel("평균단가(원)")
plt.xticks(rotation=45, fontsize=9)
plt.grid(True, linestyle="--", alpha=0.5)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
from xgboost.callback import EarlyStopping

# Load the merged radish file, drop the grade code when present, parse
# week_start and order the rows by time.
df = pd.read_csv("EDA/무(EDA용)_스케일링만_병합.csv", encoding="cp949")
df = df.drop(columns=["등급코드"], errors="ignore")
df["week_start"] = pd.to_datetime(df["week_start"])
df = df.sort_values("week_start")

# Cyclical encoding of the week number with a 52-week period.
df["week_sin"] = np.sin(2 * np.pi * df["week"] / 52)
df["week_cos"] = np.cos(2 * np.pi * df["week"] / 52)

# Per-region change features.
group_cols = ["직팜산지코드"]
# Volume of the previous row within the same region.
df["총거래량_lag1"] = df.groupby(group_cols)["총거래량(kg)"].shift(1)
# Relative change in volume versus the previous row; forced to 0 where the
# previous volume is exactly 0 (avoids dividing by zero in the result).
df["총거래량_변화율"] = np.where(
    df["총거래량_lag1"] == 0,
    0,
    (df["총거래량(kg)"] - df["총거래량_lag1"]) / df["총거래량_lag1"]
)
# Price 52 rows earlier within the same region.
# NOTE(review): this equals "same week last year" only if each region has
# exactly one row per week — confirm.
df["평균단가_전년동주"] = df.groupby(group_cols)["평균단가(원)"].shift(52)
# NOTE(review): this ratio is computed from the current row's 평균단가(원), which
# is also target_col below; used as a feature together with 평균단가_전년동주 it
# lets the target be reconstructed — confirm this leakage is not intended.
df["평균단가_전년비"] = (df["평균단가(원)"] - df["평균단가_전년동주"]) / df["평균단가_전년동주"]

target_col = "평균단가(원)"
# Every column whose name contains "_t-" (the lagged columns).
lag_features = [col for col in df.columns if "_t-" in col]
derived_features = ["holiday_flag", "holiday_score", "grow_score"]
categorical_cols = ["품종코드"]
numeric_features = ["총거래량(kg)", "week_sin", "week_cos"]
change_features = ["총거래량_변화율", "평균단가_전년비"]

# Columns that must be non-null for a row to be kept.
used_features = (
    numeric_features + lag_features + derived_features +
    categorical_cols + change_features + ["직팜산지코드", "year", "week", "week_start"]
)

In [None]:
# Complete rows only, restricted to years up to and including 2024.
df_model = df.dropna(subset=[target_col] + used_features).copy()
df_model = df_model[df_model["year"] <= 2024].copy()

# Integer-encode the region code; `le` is fitted on the training rows only.
le = LabelEncoder()
df_model["직팜산지코드_le"] = le.fit_transform(df_model["직팜산지코드"])

# NOTE(review): with pandas >= 2.0 get_dummies produces bool columns by default,
# which the np.number filter below would not select — confirm the 품종코드
# dummies actually reach X.
df_model = pd.get_dummies(df_model, columns=["품종코드"])

# Target, and numeric float32 features without calendar / raw region columns.
y = df_model[target_col]
X = df_model.drop(columns=["평균단가(원)", "year", "week", "week_start", "직팜산지코드"], errors="ignore")
X = X.select_dtypes(include=[np.number]).astype(np.float32)

# shuffle=False: the first 80 % of rows (in current row order) train, the last
# 20 % validate.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

In [None]:
xgb = XGBRegressor(objective="reg:squarederror", random_state=42)

# Candidate values sampled by the randomized search.
param_dist = dict(
    learning_rate=[0.01, 0.03, 0.05],
    max_depth=[3, 4, 5],
    min_child_weight=[1, 3, 5],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
    gamma=[0, 1, 5],
    n_estimators=[100, 150, 200, 300],
)

# 30 sampled settings, each scored with negated RMSE over 3 time-ordered splits.
tscv = TimeSeriesSplit(n_splits=3)
rand_search = RandomizedSearchCV(
    xgb,
    param_dist,
    n_iter=30,
    scoring="neg_root_mean_squared_error",
    cv=tscv,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
rand_search.fit(X, y)

print("📌 최적 하이퍼파라미터:")
print(rand_search.best_params_)

In [None]:
# Refit with the searched hyperparameters. Training stops once the validation
# RMSE has not improved for 30 rounds. Early stopping is configured on the
# estimator because recent XGBoost releases no longer accept `callbacks` as a
# fit() argument.
best_model = XGBRegressor(
    objective="reg:squarederror",
    eval_metric="rmse",
    early_stopping_rounds=30,
    random_state=42,
    **rand_search.best_params_
)

best_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=True
)

y_val_pred = best_model.predict(X_val)
# RMSE is the square root of the MSE; the `squared=False` shortcut is not
# accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
mae = mean_absolute_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)

print(f"[Validation] MAE: {mae:.2f} | RMSE: {rmse:.2f} | R²: {r2:.4f}")

In [None]:
from sklearn.preprocessing import LabelEncoder

# Re-fit the encoder on the training regions and check whether the 2025 test
# rows contain region codes it has never seen (transform would fail on those).
le = LabelEncoder()
le.fit(df_model["직팜산지코드"])  # df_model holds year <= 2024

test_df = df[df["year"] == 2025].copy()
test_regions = set(test_df["직팜산지코드"].unique())
train_regions = set(le.classes_)

unseen_regions = test_regions - train_regions

if unseen_regions:
    print(f"⚠️ 테스트셋에 학습셋에 없던 LabelEncoder 클래스 존재: {unseen_regions}")
else:
    # The else branch needs a body; a bare `else:` is a SyntaxError.
    print("✅ 테스트셋의 모든 직팜산지코드가 학습셋에 포함되어 있습니다.")

In [None]:
# Rows, columns, and rows-per-column of the training matrix.
n_samples, n_features = X.shape

print(f"📦 학습 데이터 수: {n_samples}")
print(f"🔢 피처 수: {n_features}")
print(f"📐 샘플당 피처 비율: {n_samples / n_features:.2f}")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

plt.rcParams.update({"font.family": "Malgun Gothic", "axes.unicode_minus": False})

# Gain-based importance from the fitted booster, largest first.
importance = best_model.get_booster().get_score(importance_type="gain")
importance_df = (
    pd.DataFrame({
        "feature": list(importance.keys()),
        "importance": list(importance.values()),
    })
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)

# Top-N rows reversed, so the largest bar is drawn at the top of the chart.
top_n = 20
top_features = importance_df.head(top_n).iloc[::-1]

plt.figure(figsize=(10, 6))
plt.barh(top_features["feature"], top_features["importance"])
plt.xlabel("Importance (Gain)")
plt.title(f"Top {top_n} Feature Importance (by Gain)")
plt.tight_layout()
plt.grid(True, axis="x", linestyle="--", alpha=0.5)
plt.show()

In [None]:
# Gain of one feature; 0 when the booster's score dict has no entry for it.
target_feature = "총거래량_변화율"
importance_dict = best_model.get_booster().get_score(importance_type="gain")

importance_value = importance_dict[target_feature] if target_feature in importance_dict else 0

print(f"📌 '총거래량_변화율' 중요도 (Gain): {importance_value:.4f}")

In [None]:
import pandas as pd

# Rank every feature by gain (1 = most important) and look up one feature.
importance_dict = best_model.get_booster().get_score(importance_type="gain")
importance_df = pd.DataFrame({
    "feature": list(importance_dict.keys()),
    "importance": list(importance_dict.values())
}).sort_values(by="importance", ascending=False).reset_index(drop=True)

importance_df["rank"] = importance_df.index + 1

target_feature = "총거래량_변화율"
target_row = importance_df[importance_df["feature"] == target_feature]

if not target_row.empty:
    rank = int(target_row["rank"].values[0])
    gain = float(target_row["importance"].values[0])
    # Report the result; otherwise the success branch has no visible effect.
    print(f"📌 '{target_feature}' 중요도 순위: {rank}위 / {len(importance_df)}개 (Gain: {gain:.4f})")
else:
    print("❌ '총거래량_변화율'은 변수 중요도 목록에 없습니다.")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Evaluate best_model on the 2025 rows.
test_df = df[df["year"] == 2025].copy()

y_test = test_df["평균단가(원)"].copy()
test_df = test_df.drop(columns=["평균단가(원)"])

# Keep only regions the encoder knows (transform raises on unseen labels) and
# keep y_test on the same rows.
known_codes = set(le.classes_)
test_df = test_df[test_df["직팜산지코드"].isin(known_codes)].copy()
y_test = y_test.loc[test_df.index]  # align y_test with the filtered rows
test_df["직팜산지코드_le"] = le.transform(test_df["직팜산지코드"])

test_df = pd.get_dummies(test_df, columns=["품종코드"])

# Add variety one-hot columns that exist in training but not in the test rows.
missing_cols = set(df_model.columns) - set(test_df.columns)
missing_cols = [col for col in missing_cols if col.startswith("품종코드_")]
for col in missing_cols:
    test_df[col] = 0

# Select exactly the model's features, in its order. Selecting by name (not
# by a numeric-dtype filter) keeps one-hot columns even when they are bool,
# and still raises KeyError if a feature is genuinely missing.
X_test = test_df[list(best_model.feature_names_in_)].astype(np.float32)

y_pred = best_model.predict(X_test)

# RMSE is the square root of the MSE; the `squared=False` shortcut is not
# accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"[Test 2025] MAE: {mae:.2f} | RMSE: {rmse:.2f} | R²: {r2:.4f}")

In [None]:
import matplotlib.pyplot as plt

plt.rcParams.update({"font.family": "Malgun Gothic", "axes.unicode_minus": False})

# Positional index on both sides, so predictions and actuals line up row by row.
plot_df = test_df.reset_index(drop=True).assign(
    예측값=y_pred,
    실제값=y_test.reset_index(drop=True),
)

# Mean actual / predicted price per (year, week), labelled 'YYYY-WW'.
weekly_avg = (
    plot_df.groupby(["year", "week"])[["실제값", "예측값"]]
    .mean()
    .reset_index()
)
weekly_avg["연도_주차"] = (
    weekly_avg["year"].astype(str) + "-" + weekly_avg["week"].astype(str).str.zfill(2)
)

plt.figure(figsize=(16, 6))
for value_col, series_label, marker in (
    ("실제값", "실제 평균단가", "o"),
    ("예측값", "예측 평균단가", "x"),
):
    plt.plot(
        weekly_avg["연도_주차"], weekly_avg[value_col],
        label=series_label, marker=marker, linewidth=2,
    )
plt.title("예측 vs 실제 평균단가 (주차별 평균)")
plt.xlabel("연도-주차")
plt.ylabel("평균단가(원)")
plt.xticks(rotation=45, fontsize=9)
plt.grid(True, linestyle="--", alpha=0.5)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

plt.rcParams.update({"font.family": "Malgun Gothic", "axes.unicode_minus": False})

# Part 1: observed 2025 weeks with predictions and actuals, averaged per week.
part1_df = test_df.assign(예측값=y_pred, 실제값=y_test.values)
part1_df["연도_주차"] = part1_df["year"].astype(str) + "-" + part1_df["week"].astype(str).str.zfill(2)

part1_avg = (
    part1_df.groupby(["year", "week", "연도_주차"])[["실제값", "예측값"]]
    .mean()
    .reset_index()
)

# Part 2: weeks 23-52 have no inputs, so the last averaged prediction is simply
# carried forward as a flat line and the actual value is left empty.
last_pred = part1_avg["예측값"].iloc[-1]
weeks_future = pd.DataFrame({
    "year": [2025] * 30,
    "week": list(range(23, 53)),
})
weeks_future = weeks_future.assign(
    연도_주차=weeks_future["year"].astype(str) + "-" + weeks_future["week"].astype(str).str.zfill(2),
    예측값=last_pred,
    실제값=np.nan,  # no actual value available
)

plot_df = pd.concat([part1_avg, weeks_future], ignore_index=True)

plt.figure(figsize=(16, 6))
for value_col, series_label, marker in (
    ("예측값", "예측 평균단가", "o"),
    ("실제값", "실제 평균단가", "x"),
):
    plt.plot(
        plot_df["연도_주차"], plot_df[value_col],
        label=series_label, marker=marker, linewidth=2,
    )
plt.title("2025년 평균단가 (1~22주: 실제+예측 / 23~52주: 정보 없음, 예측 연장)")
plt.xlabel("연도-주차")
plt.ylabel("평균단가(원)")
plt.xticks(rotation=45, fontsize=9)
plt.grid(True, linestyle="--", alpha=0.5)
plt.legend()
plt.tight_layout()
plt.show()