In [1]:
import os

import joblib  # save/load fitted models as .pkl or .joblib files
import pandas as pd
import statsmodels.api as sm  # regression models
from sklearn.linear_model import LinearRegression

In [2]:
# Read the onion summary CSV. The file is decoded as cp949, and anything after
# a '#' on a line is treated as a comment and skipped by the parser.
df = pd.read_csv(
    '../data/양파요약데이터_직팜정리.csv',
    encoding='cp949',
    comment='#',
)
# Print column dtypes, non-null counts and memory usage as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028241 entries, 0 to 1028240
Data columns (total 9 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   날짜(YYYY-MM-DD)  1028241 non-null  object 
 1   상품 중분류 이름       1028241 non-null  object 
 2   등급이름            1028241 non-null  object 
 3   총가격(원)          1028241 non-null  float64
 4   단위총물량(kg)       1028241 non-null  float64
 5   산지코드            1028241 non-null  int64  
 6   산지이름            1027412 non-null  object 
 7   직팜산지코드          1027412 non-null  float64
 8   직팜산지이름          1027412 non-null  object 
dtypes: float64(3), int64(1), object(5)
memory usage: 70.6+ MB


In [3]:
# Peek at one row. random_state pins the draw so that Restart & Run All
# displays the same row every time instead of a different one per run.
df.sample(random_state=42)

Unnamed: 0,날짜(YYYY-MM-DD),상품 중분류 이름,등급이름,총가격(원),단위총물량(kg),산지코드,산지이름,직팜산지코드,직팜산지이름
520061,2021-07-06,양파,5등,578000.0,1020.0,526000,전라남도 영암군,1091.0,전라남도 영암군


In [5]:
# Convert the date column to datetime so the .dt accessor can be used on it.
date_col = '날짜(YYYY-MM-DD)'
df[date_col] = pd.to_datetime(df[date_col])

# Derive year / month / day-of-month feature columns from the parsed date.
for new_col, dt_attr in (('연도', 'year'), ('월', 'month'), ('일', 'day')):
    df[new_col] = getattr(df[date_col].dt, dt_attr)

# Show the first rows to confirm the split looks right.
df[[date_col, '연도', '월', '일']].head()

Unnamed: 0,날짜(YYYY-MM-DD),연도,월,일
0,2018-01-03,2018,1,3
1,2018-01-03,2018,1,3
2,2018-01-03,2018,1,3
3,2018-01-03,2018,1,3
4,2018-01-03,2018,1,3


In [6]:
# List the distinct grade labels before encoding them.
grade_labels = df['등급이름'].unique()
print(grade_labels)

# One-hot encode the grade column: each label becomes its own
# '등급이름_<label>' indicator column and the original column is removed.
# NOTE(review): no level is dropped, so the indicators sum to 1 per row and are
# collinear with an intercept term — confirm this is intended.
df_encoded = pd.get_dummies(data=df, columns=['등급이름'])

df_encoded.head()

['특' '상' '등외' '보통' '5등' '6등' '7등' '8등' '4등' '무등급']


Unnamed: 0,날짜(YYYY-MM-DD),상품 중분류 이름,총가격(원),단위총물량(kg),산지코드,산지이름,직팜산지코드,직팜산지이름,연도,월,...,등급이름_4등,등급이름_5등,등급이름_6등,등급이름_7등,등급이름_8등,등급이름_등외,등급이름_무등급,등급이름_보통,등급이름_상,등급이름_특
0,2018-01-03,양파,4893000.0,4200.0,676000,경상남도 함양군,1139.0,경상남도 함양군,2018,1,...,0,0,0,0,0,0,0,0,0,1
1,2018-01-03,양파,864000.0,980.0,630850,경남 마산시 내서읍,1125.0,경상남도 마산시,2018,1,...,0,0,0,0,0,0,0,0,1,0
2,2018-01-03,양파,2633500.0,2420.0,635943,경남 창녕군 대합면 모전리,1126.0,경상남도 창녕군,2018,1,...,0,0,0,0,0,0,0,0,0,1
3,2018-01-03,양파,4187500.0,4740.0,635940,경남 창녕군 대합면,1126.0,경상남도 창녕군,2018,1,...,0,0,0,0,0,0,0,0,0,1
4,2018-01-03,양파,3168000.0,3580.0,630850,경남 마산시 내서읍,1125.0,경상남도 마산시,2018,1,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# Target: the '총가격(원)' (total price, won) column.
y = df_encoded['총가격(원)']

# Features: calendar parts, traded quantity (kg) and the one-hot grade columns.
# The grade indicators are selected by prefix (the '<column>_' prefix that
# get_dummies generates) rather than by substring, so an unrelated column that
# merely contains '등급이름_' somewhere in its name cannot slip into X.
grade_cols = [col for col in df_encoded.columns if col.startswith('등급이름_')]
X = df_encoded[['연도', '월', '일', '단위총물량(kg)'] + grade_cols]

X.head()

Unnamed: 0,연도,월,일,단위총물량(kg),등급이름_4등,등급이름_5등,등급이름_6등,등급이름_7등,등급이름_8등,등급이름_등외,등급이름_무등급,등급이름_보통,등급이름_상,등급이름_특
0,2018,1,3,4200.0,0,0,0,0,0,0,0,0,0,1
1,2018,1,3,980.0,0,0,0,0,0,0,0,0,1,0
2,2018,1,3,2420.0,0,0,0,0,0,0,0,0,0,1
3,2018,1,3,4740.0,0,0,0,0,0,0,0,0,0,1
4,2018,1,3,3580.0,0,0,0,0,0,0,0,0,0,1


In [8]:
# Fit a linear regression of the total price (y) on the feature matrix (X).
# LinearRegression is imported in the notebook's top import cell; joblib is not
# needed in this cell at all.
# NOTE(review): the model is fit on every row, so no held-out data remains to
# evaluate it on — consider a train/test split.
model = LinearRegression()
model.fit(X, y)

print("회귀모델 학습 완료!")

회귀모델 학습 완료!


In [10]:
# Single source of truth for the output path, shared by the dump and the
# confirmation message so the two cannot drift apart.
model_path = "../model/onion_price_regression.joblib"

# Create the target directory if it is missing, so a fresh checkout does not
# fail on the dump just because '../model' has not been created yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)

print(f"모델이 저장완료: {model_path}")

모델이 저장완료: ../model/onion_price_regression.joblib


In [16]:
# Sanity-check the fitted model on one of the rows it was trained on
# (positional index 500000; the double brackets keep it a one-row DataFrame).
sample = X.iloc[[500000]]
predicted_total_price = model.predict(sample)[0]

weight_kg = sample['단위총물량(kg)'].values[0]

print(f"예측 총가격: {int(predicted_total_price):,} 원")

# Guard the per-kg division: with a zero weight the quotient is inf/nan and
# int() on it raises, which would abort the cell after the total was printed.
# NOTE(review): the saved output shows roughly 80,701 won per kg for this row,
# which looks high for a per-kg price — worth checking how the model behaves
# on small quantities.
if weight_kg > 0:
    unit_price = predicted_total_price / weight_kg
    print(f"예측 1kg당 가격: {int(unit_price):,} 원")
else:
    # Keep unit_price defined so later cells that read it do not hit NameError.
    unit_price = float('nan')
    print("예측 1kg당 가격: 계산 불가 (단위총물량(kg)이 0 이하)")

예측 총가격: 242,104 원
예측 1kg당 가격: 80,701 원
