In [1]:
import pandas as pd
# 학습 / 검증 데이터 분할 
from sklearn.model_selection import train_test_split
# 파이프라인 
from imblearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
#결측치 처리 
from sklearn.impute import SimpleImputer 
# 인코딩 / 스케일링 
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
# 교차검증 
from sklearn.model_selection import GridSearchCV
# 학습 알고리즘
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
#from catboost import CatBoostRegressor
#학습된 모델 관리
import pickle 

import matplotlib

In [2]:
# Plot text settings, applied in a single rcParams update:
#   - render text with the "Malgun Gothic" font family
#   - draw the minus sign on axes as an ASCII hyphen instead of the Unicode minus
matplotlib.rcParams.update({
    "font.family": "Malgun Gothic",
    "axes.unicode_minus": False,
})

# 데이터 확인

In [5]:
def _read_euckr_csv(csv_path):
    """Read one EUC-KR encoded CSV file into a DataFrame."""
    return pd.read_csv(csv_path, encoding='euc-kr')


# One DataFrame per product-category file.
made = _read_euckr_csv('./A3_홈쇼핑/made_final.csv')
health = _read_euckr_csv('./A3_홈쇼핑/health_final.csv')
fresh_water = _read_euckr_csv('./A3_홈쇼핑/fresh_water_final.csv')
clothes = _read_euckr_csv('./A3_홈쇼핑/cloth_final.csv')
fresh_farm = _read_euckr_csv('./A3_홈쇼핑/fresh_farm_final.csv')

# 데이터 전처리

In [39]:
# Select the dataset to model.
# Work on a copy: a plain `df = made` only aliases the same DataFrame, so the
# dtype changes below would silently rewrite the columns of `made` as well.
df = made.copy()

# Cast hour and month to strings so they are handled as categorical labels
# (one-hot encoded) instead of as numeric values.
df['hour'] = df['hour'].astype(str)
df['month'] = df['month'].astype(str)

# Explanatory variables (X) and target variable (Y).
X = df[['TF','판매단가','hour','단위시간영업효율','month','crew_score']]
Y = df['실수량']

# Train / test split with a fixed seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1234)

# Per-type preprocessing:
#   numeric     -> impute (SimpleImputer default strategy), then robust scaling
#   categorical -> impute with the most frequent value, then one-hot encoding;
#                  categories unseen during fit are ignored at transform time
numeric_pipe = make_pipeline(SimpleImputer(), RobustScaler())
category_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Split the feature columns by dtype. select_dtypes reads the dtypes directly
# instead of computing summary statistics with describe() just to get names.
# NOTE(review): a column that is neither numeric nor object (e.g. a bool 'TF')
# lands in neither list and would not be passed to either pipeline - confirm
# the dtypes of 'TF' and 'crew_score'.
numeric_list = X.select_dtypes(include='number').columns.tolist()
category_list = X.select_dtypes(include='object').columns.tolist()

# Column transformer that routes each column group to its pipeline.
preprocessing_pipe = make_column_transformer(
    (numeric_pipe, numeric_list),
    (category_pipe, category_list),
)


# 모델구현

In [42]:
# Random forest regressor stacked on the shared preprocessing step.
#
# `criterion` is intentionally left at the library default (mean squared
# error). The explicit 'mse' spelling was deprecated in scikit-learn 1.0 and
# removed in 1.2, where it raises an error; relying on the default selects the
# same criterion on every version. Arguments that only restated library
# defaults were dropped, so only the tuned hyperparameters remain.
pipe_model = make_pipeline(
    preprocessing_pipe,
    RandomForestRegressor(
        n_estimators=300,
        max_depth=3,
        max_features=1.0,            # consider all features at each split
        min_samples_split=10,
        min_samples_leaf=5,
        min_impurity_decrease=0.0005,
        n_jobs=-1,                   # use all available cores
        random_state=1,              # fixed seed for reproducible forests
    ),
)
pipe_model.fit(X_train, Y_train)

  warn(


In [43]:
# Score of the fitted pipeline on the training split and on the held-out
# test split, each printed with three decimals.
train_score = pipe_model.score(X_train, Y_train)
test_score = pipe_model.score(X_test, Y_test)

print("Score on training seg: {:.3f}".format(train_score))
print("Score on test seg: {:.3f}".format(test_score))

Score on training seg: 0.409
Score on test seg: 0.439


In [44]:
# Save the fitted pipeline to disk.
# The context manager closes the file handle even if pickling fails; the
# previous bare open() call left the handle open until garbage collection.
with open('model_생활용품_v5.sav', 'wb') as model_file:
    pickle.dump(pipe_model, model_file)

# 파라미터 세팅

In [10]:
# Candidate hyperparameter values for the LightGBM search: five evenly
# spaced values per list.
_lgbm_steps = range(1, 6)
para_lr = [k * 0.2 for k in _lgbm_steps]       # 0.2 .. 1.0
para_depth = list(range(2, 11, 2))             # 2, 4, 6, 8, 10
para_es = list(range(30, 151, 30))             # 30, 60, 90, 120, 150
para_bytree = [k * 0.2 for k in _lgbm_steps]   # 0.2 .. 1.0
para_sample = [k * 0.2 for k in _lgbm_steps]   # 0.2 .. 1.0

In [8]:
# Candidate hyperparameter values for the gradient boosting search: five
# evenly spaced values per list.
para_lr = [k * 0.2 for k in range(1, 6)]   # 0.2 .. 1.0
para_depth = list(range(2, 11, 2))         # 2, 4, 6, 8, 10
para_n_tree = list(range(30, 151, 30))     # 30, 60, 90, 120, 150
para_split = list(range(5, 26, 5))         # 5, 10, 15, 20, 25
para_leaf = list(range(5, 26, 5))          # 5, 10, 15, 20, 25

In [3]:
# Candidate hyperparameter values for the random forest search: five evenly
# spaced integer values per list.
para_depth = list(range(2, 11, 2))       # 2, 4, 6, 8, 10
para_n_tree = list(range(30, 151, 30))   # 30, 60, 90, 120, 150
para_split = list(range(5, 26, 5))       # 5, 10, 15, 20, 25
para_leaf = list(range(5, 26, 5))        # 5, 10, 15, 20, 25

In [26]:
# Candidate hyperparameter values for the CatBoost search: five evenly
# spaced values per list.
_cat_steps = range(1, 6)
para_depth = list(range(2, 11, 2))         # 2, 4, 6, 8, 10
para_itr = list(range(50, 251, 50))        # 50, 100, 150, 200, 250
para_lr = [k * 0.05 for k in _cat_steps]   # 0.05 .. 0.25
para_leaf = [k * 0.5 for k in _cat_steps]  # 0.5 .. 2.5

# 그리드 서치

In [9]:
# Grid search over a gradient boosting pipeline.
#
# The grid keys carry the 'gradientboostingregressor__' prefix, so the
# searched pipeline must contain a GradientBoostingRegressor step. `pipe_model`
# holds a RandomForestRegressor, and passing it here makes GridSearchCV fail
# with an invalid-parameter error, so a dedicated pipeline is built instead.
# random_state is fixed so repeated searches are reproducible.
gbr_pipe = make_pipeline(preprocessing_pipe, GradientBoostingRegressor(random_state=1))

# Candidate values are defined locally rather than read from the shared
# para_* names: several parameter cells reassign those names with values meant
# for other estimators (e.g. fractional para_leaf for CatBoost), so their
# content depends on which cell happened to run last.
# To search a different estimator, swap both the pipeline's final step and the
# key prefix (e.g. 'randomforestregressor__').
grid_steps = range(1, 6)
params = {
    'gradientboostingregressor__learning_rate': [step * 0.2 for step in grid_steps],
    'gradientboostingregressor__n_estimators': [step * 30 for step in grid_steps],
    'gradientboostingregressor__max_depth': [step * 2 for step in grid_steps],
    'gradientboostingregressor__min_samples_split': [step * 5 for step in grid_steps],
    'gradientboostingregressor__min_samples_leaf': [step * 5 for step in grid_steps],
}

# 3-fold cross-validated exhaustive search using all available cores.
grid_model = GridSearchCV(gbr_pipe, params, n_jobs=-1, cv=3)
grid_model.fit(X_train, Y_train)

In [10]:
# Report the outcome of the grid search: the refitted best pipeline, the
# winning parameter combination, and its mean cross-validated score rounded
# to three decimals.
best_estimator = grid_model.best_estimator_
best_params = grid_model.best_params_
best_score = grid_model.best_score_.round(3)

print("best estimator model : \n{}".format(best_estimator))
print("\nbest parameter: \n{}".format(best_params))
print("\nbest score: \n{}".format(best_score))

best estimator model : 
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('robustscaler',
                                                                   RobustScaler())]),
                                                  ['판매단가', '단위시간영업효율']),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
            

# 학습파일 저장

In [11]:
# Save the best pipeline found by the grid search.
best_model = grid_model.best_estimator_
# The context manager closes the file handle even if pickling fails; the
# previous bare open() call left the handle open until garbage collection.
# NOTE(review): the file name says 의류 (clothes) while the visible
# preprocessing cell loads `made` - confirm the intended dataset / file name.
with open('model_의류_v4.sav', 'wb') as model_file:
    pickle.dump(best_model, model_file)