# SE_Project

## 과거 농산물 데이터를 분석해 미래 농산물의 가격을 예측하는 사이트 제작하기

In [1]:
# 기본 라이브러리 추가

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [57]:
# Load the collected wholesale produce price data. The file name and the rows
# shown below indicate 2025 data (sale dates 2025-01-02 through 2025-05-20).

df = pd.read_csv("collected_prices_2025_to_2025_all_items_markets_cumulative.csv")

In [58]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,SALEDATE,WHSALNAME,CMPNAME,LARGENAME_API,MIDNAME_API,SMALLNAME,AVGAMT,STD,TOTQTY,MY_TARGET_ITEM_NAME,MY_TARGET_LARGENAME,MY_TARGET_MIDNAME,WHSALCD
0,20250103,천안,천안청과㈜,엽경채류,배추,저장배추,14250.0,12kg 상자,8916.0,배추,엽경채류,배추,340101
1,20250103,천안,천안청과㈜,엽경채류,배추,저장배추,8600.0,12kg 상자,2760.0,배추,엽경채류,배추,340101
2,20250103,천안,천안농협(공),엽경채류,배추,기타배추,16614.286,1kg 상자,49.0,배추,엽경채류,배추,340101
3,20250103,천안,천안농협(공),엽경채류,배추,우거지,4000.0,1kg 상자,2.0,배추,엽경채류,배추,340101
4,20250103,천안,천안농협(공),엽경채류,배추,쌈배추,14000.0,8kg 상자,24.0,배추,엽경채류,배추,340101


In [59]:
# Inspect the (rows, columns) dimensions of the raw data.
df.shape

(69700, 13)

In [60]:
# Helper that extracts each item's weight (in kg) from its STD (packaging unit) text.

# Both units share the same numeric pattern: digits, optional '.', optional
# fractional digits. Compiled once because the helper is applied to every row.
_KG_PATTERN = re.compile(r'(\d+\.?\d*)\s*kg')
_GRAM_PATTERN = re.compile(r'(\d+\.?\d*)\s*g')


def extract_weight_from_STD(std_string):
    """Return the weight in kilograms parsed from a packaging-unit string.

    The text is lower-cased and searched for a number followed by ``kg``;
    if none is found, a number followed by ``g`` is used and converted to kg.

    Args:
        std_string: Packaging description such as ``"12kg 상자"`` or
            ``"500g"``. Missing values (``None``/``NaN``) are accepted.

    Returns:
        The weight in kilograms as a float, or ``np.nan`` when the value is
        missing or contains no recognizable weight.
    """
    if pd.isna(std_string):
        return np.nan
    text = str(std_string).lower()

    # Kilograms are checked first: every "kg" string also contains "g".
    kg_match = _KG_PATTERN.search(text)
    if kg_match:
        return float(kg_match.group(1))

    # The original gram pattern required a trailing digit after the optional
    # dot, so "5g" was rejected and "12.55g" was read as 55 g.
    g_match = _GRAM_PATTERN.search(text)
    if g_match:
        return float(g_match.group(1)) / 1000
    return np.nan

# Parse every STD entry into a weight in kg, element by element.
df['STD_kg'] = df['STD'].map(extract_weight_from_STD)

In [61]:
# Show the parsed weight column.
df['STD_kg']

0        12.0
1        12.0
2         1.0
3         1.0
4         8.0
         ... 
69695     2.0
69696     2.0
69697     2.0
69698     2.0
69699     2.0
Name: STD_kg, Length: 69700, dtype: float64

In [62]:
# Price per kilogram: the average amount divided by the parsed package weight.
# A zero weight produces +/-inf, which is treated as missing.
#
# The result is assigned back to the column instead of calling
# `df['PRICE_PER_KG'].replace(..., inplace=True)`: an in-place call on a column
# selection is a chained operation that pandas deprecates and that does not
# update `df` when Copy-on-Write is enabled.
df['PRICE_PER_KG'] = (df['AVGAMT'] / df['STD_kg']).replace([np.inf, -np.inf], np.nan)

In [63]:
# Show the computed per-kg prices.
df['PRICE_PER_KG']

0         1187.500000
1          716.666667
2        16614.286000
3         4000.000000
4         1750.000000
             ...     
69695     7912.500000
69696     6646.500000
69697    12396.250000
69698     4000.000000
69699     6543.750000
Name: PRICE_PER_KG, Length: 69700, dtype: float64

In [64]:
# Market/company identifiers and the MY_TARGET_* columns are not used by the
# later cells, so they are listed here for removal.
columns_to_drop = [
    'WHSALNAME',
    'CMPNAME',
    'WHSALCD',
    'MY_TARGET_ITEM_NAME',
    'MY_TARGET_LARGENAME',
    'MY_TARGET_MIDNAME',
]

In [65]:
# Drop the listed columns in place; names that are already absent are ignored,
# so the cell can be re-run safely. `columns=` already selects the column axis.
df.drop(columns=columns_to_drop, errors='ignore', inplace=True)

In [66]:
# Preview the data after dropping the unused columns.
df.head()

Unnamed: 0,SALEDATE,LARGENAME_API,MIDNAME_API,SMALLNAME,AVGAMT,STD,TOTQTY,STD_kg,PRICE_PER_KG
0,20250103,엽경채류,배추,저장배추,14250.0,12kg 상자,8916.0,12.0,1187.5
1,20250103,엽경채류,배추,저장배추,8600.0,12kg 상자,2760.0,12.0,716.666667
2,20250103,엽경채류,배추,기타배추,16614.286,1kg 상자,49.0,1.0,16614.286
3,20250103,엽경채류,배추,우거지,4000.0,1kg 상자,2.0,1.0,4000.0
4,20250103,엽경채류,배추,쌈배추,14000.0,8kg 상자,24.0,8.0,1750.0


In [67]:
# Rows need a sale date, an item name and a per-kg price to take part in the
# daily average; anything missing one of them is discarded.
columns_for_grouping = ['SALEDATE', 'MIDNAME_API', 'PRICE_PER_KG']
grouping_keys = ['SALEDATE', 'MIDNAME_API']

df_cleaned = (
    df.dropna(subset=columns_for_grouping)
      .sort_values(by='SALEDATE')
)

# Mean per-kg price for each (date, item) pair, with the keys kept as columns.
df_daily_avg_price = df_cleaned.groupby(grouping_keys, as_index=False)['PRICE_PER_KG'].mean()

df_daily_avg_price.head()

Unnamed: 0,SALEDATE,MIDNAME_API,PRICE_PER_KG
0,20250102,깻잎,14781.402012
1,20250102,대파,1580.947562
2,20250102,마늘,5843.775
3,20250102,무,1217.558405
4,20250102,배추,4832.750162


In [68]:
# Preview the last rows of the daily averages.
df_daily_avg_price.tail()

Unnamed: 0,SALEDATE,MIDNAME_API,PRICE_PER_KG
798,20250520,마늘,4813.425323
799,20250520,무,1104.042636
800,20250520,배추,838.824857
801,20250520,양파,880.418389
802,20250520,홍고추,5281.225032


In [69]:
pivot_column_name = 'MIDNAME_API'

# Reshape to wide form: one row per sale date, one column per item, each cell
# holding the mean per-kg price for that date and item.
#
# Failures are reported and then re-raised. Swallowing them would leave
# `dataframe` undefined and make the next cell fail with an unrelated NameError.
try:
    dataframe = df_cleaned.pivot_table(index='SALEDATE',
                                       columns=pivot_column_name,
                                       values='PRICE_PER_KG',
                                       aggfunc='mean')
except KeyError as e:
    print(f"Check Column's name: {e}")
    raise
except Exception as e:
    print(f"Other error happened: {e!r}")
    raise

In [70]:
# Remove the column-axis label left by the pivot, then turn the SALEDATE index
# back into an ordinary column.
dataframe.columns = dataframe.columns.rename(None)
dataframe = dataframe.reset_index()

In [71]:
# Round to 2 decimals. Note that `df` is rebound here: from this point on it
# refers to the wide per-date table, no longer the raw rows loaded from the CSV.
df = dataframe.round(2)

In [72]:
# Display the rounded wide table.
df

Unnamed: 0,SALEDATE,깻잎,대파,마늘,무,배추,양파,홍고추
0,20250102,14781.40,1580.95,5843.78,1217.56,4832.75,1408.03,12188.33
1,20250103,11304.91,1683.65,4613.04,2309.07,3110.80,1281.47,12823.14
2,20250104,11444.51,1567.71,4664.59,2308.51,2466.66,1355.17,11395.75
3,20250106,12001.80,1659.86,4721.10,1671.05,3728.57,1308.74,10287.87
4,20250107,10084.76,1837.39,5034.74,1926.03,2640.44,1430.88,8227.55
...,...,...,...,...,...,...,...,...
110,20250515,7504.12,797.14,3806.53,1421.56,1948.11,829.99,8056.35
111,20250516,7484.80,805.38,4228.17,1936.55,2310.72,839.24,5142.38
112,20250517,6750.44,811.21,4385.40,992.69,10743.00,952.11,6541.44
113,20250519,7030.72,1089.97,5659.89,1060.81,7946.91,903.22,4668.99


In [73]:
# Desired left-to-right column layout: the date first, then the items.
column_order = ['SALEDATE', '배추', '무', '마늘', '양파', '대파', '홍고추', '깻잎']

# Select all rows with the columns in that order (a missing name raises KeyError).
df = df.loc[:, column_order]

In [74]:
# Display the table with the reordered columns.
df

Unnamed: 0,SALEDATE,배추,무,마늘,양파,대파,홍고추,깻잎
0,20250102,4832.75,1217.56,5843.78,1408.03,1580.95,12188.33,14781.40
1,20250103,3110.80,2309.07,4613.04,1281.47,1683.65,12823.14,11304.91
2,20250104,2466.66,2308.51,4664.59,1355.17,1567.71,11395.75,11444.51
3,20250106,3728.57,1671.05,4721.10,1308.74,1659.86,10287.87,12001.80
4,20250107,2640.44,1926.03,5034.74,1430.88,1837.39,8227.55,10084.76
...,...,...,...,...,...,...,...,...
110,20250515,1948.11,1421.56,3806.53,829.99,797.14,8056.35,7504.12
111,20250516,2310.72,1936.55,4228.17,839.24,805.38,5142.38,7484.80
112,20250517,10743.00,992.69,4385.40,952.11,811.21,6541.44,6750.44
113,20250519,7946.91,1060.81,5659.89,903.22,1089.97,4668.99,7030.72


In [75]:
# Map the Korean item names (and SALEDATE) to English column names.
# NOTE(review): three of these labels look mistranslated — '대파' is green onion
# (daikon is a radish, i.e. '무'), '홍고추' is red chili pepper (not cilantro),
# and '깻잎' is perilla leaf (not artichoke). They are left unchanged here
# because these names end up in the exported CSV; confirm with whoever consumes
# that file before renaming.
kor_to_eng = {
    'SALEDATE': 'date',
    '배추': 'cabbage',
    '무': 'radish',
    '마늘': 'garlic',
    '양파': 'onion',
    '대파': 'daikon',
    '홍고추': 'cilantro',
    '깻잎': 'artichoke'
}

# Translate every column label through the dict. A label that is missing from
# the mapping would come back as NaN rather than raising.
df.columns = df.columns.map(kor_to_eng)

In [76]:
# Convert the date values to strings (they display as YYYYMMDD above) so they
# can be parsed with an explicit format in the next cell.
df['date'] = df['date'].astype(str)

In [77]:
# Parse the YYYYMMDD strings into datetime values.
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')

In [78]:
# Display the table with English column names and parsed dates.
df

Unnamed: 0,date,cabbage,radish,garlic,onion,daikon,cilantro,artichoke
0,2025-01-02,4832.75,1217.56,5843.78,1408.03,1580.95,12188.33,14781.40
1,2025-01-03,3110.80,2309.07,4613.04,1281.47,1683.65,12823.14,11304.91
2,2025-01-04,2466.66,2308.51,4664.59,1355.17,1567.71,11395.75,11444.51
3,2025-01-06,3728.57,1671.05,4721.10,1308.74,1659.86,10287.87,12001.80
4,2025-01-07,2640.44,1926.03,5034.74,1430.88,1837.39,8227.55,10084.76
...,...,...,...,...,...,...,...,...
110,2025-05-15,1948.11,1421.56,3806.53,829.99,797.14,8056.35,7504.12
111,2025-05-16,2310.72,1936.55,4228.17,839.24,805.38,5142.38,7484.80
112,2025-05-17,10743.00,992.69,4385.40,952.11,811.21,6541.44,6750.44
113,2025-05-19,7946.91,1060.81,5659.89,903.22,1089.97,4668.99,7030.72


In [79]:
# Keep only the rows whose date falls outside May, then preview the last few.
df_without_may = df.loc[df['date'].dt.month.ne(5)]
df_without_may.tail()

Unnamed: 0,date,cabbage,radish,garlic,onion,daikon,cilantro,artichoke
93,2025-04-25,3760.42,2004.59,4116.21,1249.71,670.54,6485.02,5548.51
94,2025-04-26,2156.31,2470.71,3970.21,1284.41,668.11,6097.55,6511.69
95,2025-04-28,32127.52,1485.51,4212.23,1119.54,689.1,5698.7,5424.83
96,2025-04-29,4154.87,1302.11,4651.4,1150.63,767.71,6454.0,6009.32
97,2025-04-30,4740.45,1109.83,4438.43,1149.61,780.9,5330.21,5582.41


In [81]:
# Write the wide table to CSV. This writes `df`, not `df_without_may`, so the
# May rows are included in the file.
# NOTE(review): confirm that test_set.csv is meant to contain the full date
# range. The row index is also written as an unnamed first column unless
# index=False is passed.
df.to_csv('test_set.csv')