# 1. dataset 

In [24]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time 
import optuna
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import font_manager

# sklearn 관련
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.inspection import permutation_importance
import statsmodels.api as sm 
from time import time

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


# 전처리 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

# lightgbm 관련
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

In [None]:
# Point matplotlib at a Korean-capable font file so Hangul labels render in plots.
font_path = '/System/Library/Fonts/AppleSDGothicNeo.ttc'
font_props = font_manager.FontProperties(fname=font_path)
font_name = font_props.get_name()
plt.rc('font', **{'family': font_name})

In [25]:
# Load the daily dataset; the CSV is CP949-encoded.
# (A bare `data.head()` used to sit here; its result was discarded because it
# was not the last expression of the cell, so it has been removed.)
data = pd.read_csv('combined_data_day.csv', encoding="cp949")
print(data.shape)  # (352, 12) in the recorded run

(352, 12)


In [26]:
# Checking for null values.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" after the report.
data.info()

# Checking for outliers via the summary statistics.
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 352 entries, 0 to 351
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   reg_date       352 non-null    object 
 1   holiday_yn     352 non-null    object 
 2   rider_cnt      352 non-null    int64  
 3   day_of_reg     352 non-null    object 
 4   rain_c         352 non-null    float64
 5   snow_c         352 non-null    float64
 6   is_rain        352 non-null    int64  
 7   rider_cnt_w_1  352 non-null    int64  
 8   rider_cnt_w_2  352 non-null    int64  
 9   rider_cnt_w_3  352 non-null    int64  
 10  rider_cnt_w_4  352 non-null    int64  
 11  order_cnt_w_1  352 non-null    int64  
dtypes: float64(2), int64(7), object(3)
memory usage: 33.1+ KB
None
          rider_cnt      rain_c      snow_c     is_rain  rider_cnt_w_1   
count    352.000000  352.000000  352.000000  352.000000     352.000000  \
mean   16314.150568    5.401136    0.124432    0.312500   16307

In [27]:
# Parse reg_date into datetimes and order the rows chronologically.
data = (
    data
    .assign(reg_date=pd.to_datetime(data["reg_date"]))
    .sort_values(by="reg_date")
)

In [28]:
# Remove the rain_c and snow_c columns from the working frame.
data = data.drop(['rain_c', 'snow_c'], axis=1)
# print(data.head())

In [29]:
# Missing-value count per column. Kept as the cell's last expression so the
# notebook displays the result.
# data = data.dropna(subset=['rider_cnt_w_4'])
data.isnull().sum()
# print(data.shape)

reg_date         0
holiday_yn       0
rider_cnt        0
day_of_reg       0
is_rain          0
rider_cnt_w_1    0
rider_cnt_w_2    0
rider_cnt_w_3    0
rider_cnt_w_4    0
order_cnt_w_1    0
dtype: int64

In [30]:
# Store the discrete columns as pandas categoricals.
# (Earlier candidate list: pick_rgn2_nm, hour_reg, day_of_reg, is_rain, month, week, is_holiday)
categorical_cols = ['day_of_reg', 'is_rain', 'holiday_yn']
data = data.astype({name: 'category' for name in categorical_cols})

print(data.dtypes)


reg_date         datetime64[ns]
holiday_yn             category
rider_cnt                 int64
day_of_reg             category
is_rain                category
rider_cnt_w_1             int64
rider_cnt_w_2             int64
rider_cnt_w_3             int64
rider_cnt_w_4             int64
order_cnt_w_1             int64
dtype: object


# 2. 데이터 전처리

In [None]:
# numeric 변수 scale 
# scaler = StandardScaler()  #평균 0 , 분산 1로 조정
# #scaler = MinMaxScaler()

# num_vars = ['rider_cnt_2', 'rider_cnt_w_1', 'rider_cnt_w_2', 'rider_cnt_w_3',
#             'rider_cnt_w_4', 'order_cnt_w_1', 'order_cnt_w_2', 'order_cnt_w_3',
#             'order_cnt_w_4']
# data[num_vars] = scaler.fit_transform(data[num_vars])

# print(df.head(3))

## 2-1. one-hot-encoding

In [None]:
# ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
# X = np.array(ct.fit_transform(X))

In [31]:
# One-hot encode the categorical columns and place the indicator columns in
# front of the remaining columns of `data`.
# df = data.drop(columns = ['month'])
var = ['day_of_reg', 'is_rain', 'holiday_yn']
encoder = OneHotEncoder()
encoded = encoder.fit_transform(data[var])
onehot = pd.DataFrame(
    encoded.toarray(),
    columns=encoder.get_feature_names_out(var),
    index=data.index,  # keep the original row labels so concat aligns row by row
)
df = pd.concat([onehot, data.drop(columns=var)], axis=1)

print(df.columns)

Index(['day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_월요일',
       'day_of_reg_일요일', 'day_of_reg_토요일', 'day_of_reg_화요일', 'is_rain_0',
       'is_rain_1', 'holiday_yn_N', 'holiday_yn_Y', 'reg_date', 'rider_cnt',
       'rider_cnt_w_1', 'rider_cnt_w_2', 'rider_cnt_w_3', 'rider_cnt_w_4',
       'order_cnt_w_1'],
      dtype='object')


# 3. train, test set split

In [32]:
# Chronological hold-out split on reg_date: rows before 2023-04-01 train the
# models, rows from 2023-04-01 onwards are held out for testing.  Both filters
# share one boundary so that a row carrying a time-of-day on 2023-03-31 cannot
# fall between the two sets.
#
# Ratio-based alternative, kept for reference:
# train_ratio = 0.8
# total_samples = df.shape[0]
# train_samples = int(train_ratio * total_samples)
# df_train = df[:train_samples]
# df_test = df[train_samples:]

df_train = df[df["reg_date"] < '2023-04-01']
df_test = df[df["reg_date"] >= '2023-04-01']

# reg_date is only needed for the split; it is dropped before modelling.
df_train = df_train.drop(columns=['reg_date'])
df_test = df_test.drop(columns=['reg_date'])
print(df_train.shape, df_test.shape)  # (284, 17) (68, 17) in the recorded run


(284, 17) (68, 17)


In [33]:
# Separate the predictors from the target column `rider_cnt`.
#
# Older positional variant, kept for reference:
# X_train = train.iloc[:, :-1]
# y_train = df_train.iloc[:, -1]
# X_test = df_test.iloc[:, :-1]
# y_test = df_test.iloc[:, -1]

target = 'rider_cnt'

X_train, y_train = df_train.drop(columns=[target]), df_train[target]
X_test, y_test = df_test.drop(columns=[target]), df_test[target]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(284, 16) (284,) (68, 16) (68,)


In [34]:
# List the predictor columns that are fed to the models.
print(X_train.columns)

Index(['day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_월요일',
       'day_of_reg_일요일', 'day_of_reg_토요일', 'day_of_reg_화요일', 'is_rain_0',
       'is_rain_1', 'holiday_yn_N', 'holiday_yn_Y', 'rider_cnt_w_1',
       'rider_cnt_w_2', 'rider_cnt_w_3', 'rider_cnt_w_4', 'order_cnt_w_1'],
      dtype='object')


### numeric_scale 

In [None]:
# # 입력 변수 
# numeric_cols = ['rider_cnt_w_1', 'rider_cnt_w_2', 'rider_cnt_w_3',
#                 'rider_cnt_w_4', 'order_cnt_w_1', 'order_cnt_w_2', 'order_cnt_w_3',
#                 'order_cnt_w_4']

# # scaler 
# scaler_X = StandardScaler()

# # X_train, X_test
# X_train_scaled = scaler_X.fit_transform(X_train[numeric_cols])
# X_test_scaled = scaler_X.transform(X_test[numeric_cols])

# # 스케일링된 결과를 DataFrame으로 변환
# X_train_scaled = pd.DataFrame(X_train_scaled, columns=numeric_cols, index = X_train.index)
# X_test_scaled = pd.DataFrame(X_test_scaled, columns=numeric_cols, index = X_test.index)

# # 원래의 범주형 변수들을 선택
# categorical_cols = [col for col in X_train.columns if col not in numeric_cols]
# X_train_cat = X_train[categorical_cols]
# X_test_cat = X_test[categorical_cols]

# # 스케일링된 DataFrame과 범주형 변수들을 병합
# X_train_final = pd.concat([X_train_scaled, X_train_cat], axis=1)
# X_test_final = pd.concat([X_test_scaled, X_test_cat], axis=1)



In [None]:
# 예측값
# y_train, y_test
# scaler_y = StandardScaler()
# y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
# y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))
# y_train_scaled = y_train_scaled.ravel()
# y_test_scaled=  y_test_scaled.ravel()

# print(y_train_scaled.shape)
# print(y_test_scaled.shape)

# 3. regression - benchmark model

In [35]:
# Benchmark: ordinary least squares on the training predictors plus an intercept.
# NOTE(review): every one-hot level is kept AND a constant is added, so the
# design matrix looks rank-deficient (the recorded summary reports Df Model 13
# for 16 regressors) — individual dummy coefficients should be read with care.
X_train_lm = sm.add_constant(X_train)

ols_spec = sm.OLS(endog=y_train, exog=X_train_lm)
lr_1 = ols_spec.fit()

print(lr_1.summary())

                            OLS Regression Results                            
Dep. Variable:              rider_cnt   R-squared:                       0.416
Model:                            OLS   Adj. R-squared:                  0.388
Method:                 Least Squares   F-statistic:                     14.79
Date:                Thu, 08 Jun 2023   Prob (F-statistic):           5.29e-25
Time:                        11:22:56   Log-Likelihood:                -2316.6
No. Observations:                 284   AIC:                             4661.
Df Residuals:                     270   BIC:                             4712.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const           8977.6301   1004.834      8.

In [None]:
# Fit a plain linear regression on the training data.
model = LinearRegression()
model.fit(X_train, y_train)

# Test-set residuals: actual minus predicted.
y_pred = model.predict(X_test)
residuals = y_test - y_pred

# One scatter plot of the residuals against each predictor.
for column in X_test.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x=X_test[column], y=residuals, ax=ax)
    ax.set_title(f'Residuals vs. {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Residuals')
    plt.show()




# 4. Machine Learning Modeling

## 4-1. 하이퍼파라미터 튜닝 - Grid Search 

### a. LightGBM model 

In [None]:
# classifier = LGBMRegressor()

# parameters = [{'learning_rate': [0.1, 0.05, 0.01, 0.005], 'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7]},
#               {'learning_rate': [0.15, 0.125, 0.1, 0.075], 'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7], 'num_leaves': [16, 32, 64]}]

# grid_search = GridSearchCV(estimator=classifier,
#                            param_grid=parameters,
#                            scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'],
#                            cv=10,
#                            n_jobs=-1,
#                            refit='neg_mean_squared_error')                     

# grid_search.fit(X_train, y_train)
# best_rmse = np.sqrt(-1 * grid_search.cv_results_['mean_test_neg_mean_squared_error'][grid_search.best_index_])
# best_mae = -1 * grid_search.cv_results_['mean_test_neg_mean_absolute_error'][grid_search.best_index_]
# best_parameters = grid_search.best_params_
# print("Best RMSE: {:.2f}".format(best_rmse))
# print("Best MAE: {:.2f}".format(best_mae))
# print("Best Parameters:", best_parameters)


In [None]:
def objective(trial):
    """Optuna objective for LightGBM.

    Samples a hyperparameter set from `trial`, scores an LGBMRegressor on the
    notebook-level X_train / y_train with 10-fold cross-validation, and
    returns the square root of the mean fold MSE (lower is better).
    """
    model = LGBMRegressor(
        learning_rate=trial.suggest_float("learning_rate", 0.005, 0.2),
        n_estimators=trial.suggest_int("n_estimators", 50, 200),
        max_depth=trial.suggest_int("max_depth", 3, 7),
        num_leaves=trial.suggest_int("num_leaves", 16, 64),
    )

    # The scorer reports negated MSE per fold; flip the sign before the root.
    neg_mse_folds = cross_val_score(model, X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
    return np.sqrt(-1 * np.mean(neg_mse_folds))


# Run 50 trials, minimising the cross-validated RMSE.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Best RMSE: {:.2f}".format(study.best_value))
print("Best Parameters:", study.best_params)

# Values noted from an earlier run:
#Best RMSE: 29.84
#Best Parameters:  {'learning_rate': 0.02546270038894073, 'n_estimators': 76, 'max_depth': 5, 'num_leaves': 56}

### b. ridge regression

In [None]:
# Ridge regression: grid-search alpha with 10-fold CV, selecting on negated MSE.
ridge = Ridge()
ridge_param_grid = {'alpha': [0.1, 1.0, 2.0, 5.0, 10.0]}
ridge_grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=ridge_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
ridge_grid_search.fit(X_train, y_train)

# best_score_ is a negated MSE, so negate it before taking the root.
ridge_best_rmse = np.sqrt(-ridge_grid_search.best_score_)
print("Ridge Best RMSE: {:.2f}".format(ridge_best_rmse))
print("Ridge Best Parameters: ", ridge_grid_search.best_params_)

# Values noted from an earlier run:
# Ridge Best RMSE: 27.83
# Ridge Best Parameters:  {'alpha': 10.0}

### c. Lasso regression

In [None]:
# Lasso regression: grid-search alpha with 10-fold CV, selecting on negated MSE.
lasso = Lasso(max_iter=10000)
lasso_param_grid = {'alpha': [0.1, 1.0, 2.0, 5.0, 10.0]}
lasso_grid_search = GridSearchCV(
    estimator=lasso,
    param_grid=lasso_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
lasso_grid_search.fit(X_train, y_train)

# best_score_ is a negated MSE, so negate it before taking the root.
lasso_best_rmse = np.sqrt(-lasso_grid_search.best_score_)
print("Lasso Best RMSE: {:.2f}".format(lasso_best_rmse))
print("Lasso Best Parameters: ", lasso_grid_search.best_params_)

# Values noted from an earlier run:
# Lasso Best RMSE: 28.83
# Lasso Best Parameters:  {'alpha': 0.1}

### d. Support vector regressor

In [None]:
# SVR
# svr = SVR()
# svr_param_grid = {'kernel': ['rbf', 'linear', 'poly', 'sigmoid'], 'C': [0.1, 1.0, 10.0], 'gamma': ['scale', 'auto']}
# svr_grid_search = GridSearchCV(estimator=svr, param_grid=svr_param_grid, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
# svr_grid_search.fit(df_X, df_y)
# print("SVR Best RMSE: {:.2f}".format(np.sqrt(-svr_grid_search.best_score_)))
# print("SVR Best Parameters: ", svr_grid_search.best_params_)

### e. Random Forest Regressor

In [None]:
def objective(trial):
    """Optuna objective for the random forest.

    Samples a hyperparameter set from `trial`, scores a RandomForestRegressor
    on the notebook-level X_train / y_train with 10-fold cross-validation, and
    returns the square root of the mean fold MSE (lower is better).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        # Sample ONE candidate per trial. Passing the whole list [2, 5, 10]
        # straight to the estimator is not a valid min_samples_split value.
        "min_samples_split": trial.suggest_categorical("min_samples_split", [2, 5, 10]),
    }

    model = RandomForestRegressor(**params)

    # The scorer reports negated MSE per fold; flip the sign before the root.
    score = cross_val_score(model, X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
    rmse = np.sqrt(-1 * np.mean(score))

    return rmse


# Run 50 trials, minimising the cross-validated RMSE.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Best RMSE: {:.2f}".format(study.best_value))
print("Best Parameters:", study.best_params)


In [None]:
# Random forest: grid-search tree count, depth and split size with 10-fold CV,
# selecting on negated MSE.
rfr = RandomForestRegressor(random_state=0)
rfr_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
}
rfr_grid_search = GridSearchCV(
    estimator=rfr,
    param_grid=rfr_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
rfr_grid_search.fit(X_train, y_train)

# best_score_ is a negated MSE, so negate it before taking the root.
rfr_best_rmse = np.sqrt(-rfr_grid_search.best_score_)
print("Random Forest Regressor Best RMSE: {:.2f}".format(rfr_best_rmse))
print("Random Forest Regressor Best Parameters: ", rfr_grid_search.best_params_)

# Values noted from an earlier run:
# Random Forest Regressor Best RMSE: 28.9
# Random Forest Regressor Best Parameters:  {'max_depth': 9, 'min_samples_split': 2, 'n_estimators': 200}


### f. Decision Tree Regressor

In [None]:
# Decision tree: grid-search depth and split size with 10-fold CV,
# selecting on negated MSE.
dtr = DecisionTreeRegressor(random_state=0)
dtr_param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
}
dtr_grid_search = GridSearchCV(
    estimator=dtr,
    param_grid=dtr_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
dtr_grid_search.fit(X_train, y_train)

# best_score_ is a negated MSE, so negate it before taking the root.
dtr_best_rmse = np.sqrt(-dtr_grid_search.best_score_)
print("Decision Tree Regressor Best RMSE: {:.2f}".format(dtr_best_rmse))
print("Decision Tree Regressor Best Parameters: ", dtr_grid_search.best_params_)

# Values noted from an earlier run:
# Decision Tree Regressor Best RMSE: 27.85
# Decision Tree Regressor Best Parameters:  {'max_depth': 7, 'min_samples_split': 2}

## 4-2. train, test set 적용 

### a. train, test rmse, mae

In [36]:
def MAPE(y_test, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both arguments are converted to float arrays first, so plain lists and
    tuples are accepted and pandas objects are compared element by element in
    positional order (index labels are ignored).

    Parameters:
        y_test: actual values (array-like).
        y_pred: predicted values (array-like, same length as ``y_test``).

    Returns:
        mean(|y_test - y_pred| / |y_test|) * 100 as a NumPy float.  An actual
        value of 0 makes the corresponding term infinite or NaN.
    """
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100

In [38]:
# Hold-out frames taken from the un-encoded `data`.  Both filters share one
# boundary date so that a row carrying a time-of-day on 2023-03-31 cannot fall
# between the two sets.
train_set = data[data["reg_date"] < '2023-04-01']
test_set = data[data["reg_date"] >= '2023-04-01']


def execute_pipeline(X_train, y_train, X_test, y_test):
    """Cross-validate, fit, evaluate and persist the final set of regressors.

    For each regressor the function runs 5-fold cross-validation on the
    training data, refits on the full training data, predicts the test set,
    and dumps the fitted model to ``model_<ClassName>.joblib``.  Per-row test
    predictions, plus four simple averages of the Lasso / LightGBM /
    random-forest predictions, are written to
    ``prediction_results_test_set_day.csv``.

    The per-row output frame is seeded from the notebook-level ``test_set``
    (not from the arguments), so ``X_test`` must hold the same rows in the
    same order as ``test_set``.

    Parameters:
        X_train, y_train: training predictors and target.
        X_test, y_test: hold-out predictors and target.

    Returns:
        DataFrame indexed by regressor class name with the columns
        cv_rmse, cv_rmse_std, Test RMSE, Test MAE, Test MAPE and Test R2.
        The averaged predictions are saved to the CSV only; they are not
        scored in this table.
    """
    regressors = [
        LinearRegression(),
        Lasso(alpha=0.1, max_iter=5000),
        LGBMRegressor(learning_rate = 0.02546270038894073, n_estimators =  76, max_depth = 5, num_leaves = 56),
        RandomForestRegressor(random_state=0, max_depth=9, min_samples_split=2, n_estimators=200),
    ]

    # The same scorers are used for every model, so build the mapping once
    # instead of on every loop iteration.
    scoring = {
        'rmse' : 'neg_root_mean_squared_error',
        'mae' : 'neg_mean_absolute_error',
        'r2' : 'r2'
    }

    result_test = pd.DataFrame({'reg_date': test_set["reg_date"], 'day_of_reg': test_set["day_of_reg"],
                                'holiday_yn': test_set["holiday_yn"], 'y_test': test_set["rider_cnt"]})

    scores = {}
    predictions = {}

    for reg in regressors:
        reg_name = reg.__class__.__name__

        cv_results = cross_validate(reg, X_train, y_train, cv = 5, scoring = scoring)

        # The scorer returns negated RMSE per fold; negate the mean to report it.
        cv_rmse = -np.mean(cv_results['test_rmse'])
        cv_rmse_std = np.std(cv_results['test_rmse'])

        reg.fit(X_train, y_train)
        y_pred_test = reg.predict(X_test)

        rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
        mae_test = mean_absolute_error(y_test, y_pred_test)
        mape_test = MAPE(y_test, y_pred_test)
        r2_test = r2_score(y_test, y_pred_test)

        # Persist the fitted model.
        model_file = f'model_{reg_name}.joblib'
        joblib.dump(reg, model_file)

        scores[reg_name] = {
            'cv_rmse' : cv_rmse,
            'cv_rmse_std' : cv_rmse_std,
            'Test RMSE': rmse_test,
            'Test MAE': mae_test,
            'Test MAPE': mape_test,
            'Test R2': r2_test
        }

        result_test[f'y_pred_test_{reg_name}'] = y_pred_test

        predictions[reg_name] = y_pred_test

    # Simple (unweighted) averages of the individual model predictions.
    lasso_pred = predictions['Lasso']
    lgbm_pred = predictions['LGBMRegressor']
    rf_pred = predictions['RandomForestRegressor']
    average_three = (lasso_pred + lgbm_pred + rf_pred) / 3
    average_lgbm_rf = (lgbm_pred+rf_pred) /2
    average_lgbm_la = (lgbm_pred + lasso_pred) /2
    average_rf_la = (lasso_pred+rf_pred) / 2

    result_test['y_pred_avg_three'] = average_three
    result_test['y_pred_avg_lgbm_rf'] = average_lgbm_rf
    result_test['y_pred_avg_lgbm_la'] = average_lgbm_la
    result_test['y_pred_avg_rf_la'] = average_rf_la

    scores_df = pd.DataFrame(scores).transpose()

    # Save the per-row test-set predictions (only test rows are written).
    result_test.to_csv('prediction_results_test_set_day.csv', index=False, encoding="cp949")

    return scores_df

scores_df = execute_pipeline(X_train, y_train, X_test, y_test)
print(scores_df)


                          cv_rmse  cv_rmse_std   Test RMSE    Test MAE   
LinearRegression       853.072271   239.626638  786.546614  614.769712  \
Lasso                  852.803157   239.897717  785.201487  613.665010   
LGBMRegressor          906.323730   235.837656  791.607392  601.192139   
RandomForestRegressor  920.137535   230.636252  816.143654  631.095944   

                       Test MAPE   Test R2  
LinearRegression        3.741362  0.315393  
Lasso                   3.734612  0.317733  
LGBMRegressor           3.618362  0.306555  
RandomForestRegressor   3.791393  0.262902  


### MAE 모델별 비교

In [None]:
# # 데이터 프레임에서 각 모델의 예측 값 - 실제 값 계산
# predict = pd.read_csv('prediction_results_test_set_day.csv', encoding = "cp949")

# predict['MAE_LinearRegression'] = abs(predict['y_pred_test_LinearRegression'] - predict['y_test'])
# predict['MAE_Lasso'] = abs(predict['y_pred_test_Lasso'] - predict['y_test'])

# predict['MAE_LGBMRegressor'] = abs(predict['y_pred_test_LGBMRegressor'] - predict['y_test'])
# predict['MAE_RandomForestRegressor'] = abs(predict['y_pred_test_RandomForestRegressor'] - predict['y_test'])

# predict['MAE_avg_three'] = abs(predict['y_pred_avg_three'] - predict['y_test'])
# predict['MAE_avg_lgbm_rf'] = abs(predict['y_pred_avg_lgbm_rf'] - predict['y_test'])
# predict['MAE_avg_lgbm_la'] = abs(predict['y_pred_avg_lgbm_la'] - predict['y_test'])
# predict['MAE_avg_rf_la'] = abs(predict['y_pred_avg_rf_la'] - predict['y_test'])


# result = predict.groupby(['holiday_yn']).agg({
#   'MAE_LinearRegression': np.mean,
#   'MAE_Lasso': np.mean,
#   'MAE_LGBMRegressor': np.mean,
#   'MAE_RandomForestRegressor': np.mean,
#   'MAE_avg_three' : np.mean,
#   'MAE_avg_lgbm_rf' : np.mean,
#   'MAE_avg_lgbm_la' : np.mean,
#   'MAE_avg_rf_la' : np.mean 
# }).reset_index()

# print(result)


In [39]:
# Mean absolute error of every model / ensemble on the saved test predictions.
predict = pd.read_csv('prediction_results_test_set_day.csv', encoding = "cp949")

# Output-label suffix -> prediction column in the CSV (order fixes the row
# order of the printed table).
mae_sources = {
    'LinearRegression': 'y_pred_test_LinearRegression',
    'Lasso': 'y_pred_test_Lasso',
    'LGBMRegressor': 'y_pred_test_LGBMRegressor',
    'RandomForestRegressor': 'y_pred_test_RandomForestRegressor',
    'avg_three': 'y_pred_avg_three',
    'avg_lgbm_rf': 'y_pred_avg_lgbm_rf',
    'avg_lgbm_la': 'y_pred_avg_lgbm_la',
    'avg_rf_la': 'y_pred_avg_rf_la',
}

# Per-row absolute error; the column mean taken below is the MAE.
for label, pred_col in mae_sources.items():
    predict[f'MAE_{label}'] = abs(predict[pred_col] - predict['y_test'])

# The string "mean" is used instead of np.mean: passing NumPy callables to
# DataFrame.agg is deprecated in recent pandas and yields the same result.
result = predict.agg({f'MAE_{label}': 'mean' for label in mae_sources}).reset_index()

print(result)


                       index           0
0       MAE_LinearRegression  614.769712
1                  MAE_Lasso  613.665010
2          MAE_LGBMRegressor  601.192139
3  MAE_RandomForestRegressor  631.095944
4              MAE_avg_three  575.734307
5            MAE_avg_lgbm_rf  607.011034
6            MAE_avg_lgbm_la  573.138024
7              MAE_avg_rf_la  577.539547


### MAPE 모델별 비교

In [40]:
# Mean absolute percentage error of every model / ensemble on the saved test predictions.
predict = pd.read_csv('prediction_results_test_set_day.csv', encoding = "cp949")

# Output-label suffix -> prediction column in the CSV (order fixes the row
# order of the printed table).
mape_sources = {
    'LinearRegression': 'y_pred_test_LinearRegression',
    'Lasso': 'y_pred_test_Lasso',
    'LGBMRegressor': 'y_pred_test_LGBMRegressor',
    'RandomForestRegressor': 'y_pred_test_RandomForestRegressor',
    'avg_three': 'y_pred_avg_three',
    'avg_lgbm_rf': 'y_pred_avg_lgbm_rf',
    'avg_lgbm_la': 'y_pred_avg_lgbm_la',
    'avg_rf_la': 'y_pred_avg_rf_la',
}

# Per-row absolute percentage error; the column mean taken below is the MAPE.
for label, pred_col in mape_sources.items():
    predict[f'MAPE_{label}'] = abs((predict[pred_col] - predict['y_test']) / predict['y_test']) * 100

# The string "mean" is used instead of np.mean: passing NumPy callables to
# DataFrame.agg is deprecated in recent pandas and yields the same result.
result = predict.agg({f'MAPE_{label}': 'mean' for label in mape_sources}).reset_index()

print(result)


                        index         0
0       MAPE_LinearRegression  3.741362
1                  MAPE_Lasso  3.734612
2          MAPE_LGBMRegressor  3.618362
3  MAPE_RandomForestRegressor  3.791393
4              MAPE_avg_three  3.471523
5            MAPE_avg_lgbm_rf  3.647380
6            MAPE_avg_lgbm_la  3.467944
7              MAPE_avg_rf_la  3.485911


### RMSE 모델 비교 

In [41]:
# Root mean squared error of every model / ensemble on the saved test predictions.
predict = pd.read_csv('prediction_results_test_set_day.csv', encoding = "cp949")

# Output-label suffix -> prediction column in the CSV (order fixes the row
# order of the printed table).
rmse_sources = {
    'LinearRegression': 'y_pred_test_LinearRegression',
    'Lasso': 'y_pred_test_Lasso',
    'LGBMRegressor': 'y_pred_test_LGBMRegressor',
    'RandomForestRegressor': 'y_pred_test_RandomForestRegressor',
    'avg_three': 'y_pred_avg_three',
    'avg_lgbm_rf': 'y_pred_avg_lgbm_rf',
    'avg_lgbm_la': 'y_pred_avg_lgbm_la',
    'avg_rf_la': 'y_pred_avg_rf_la',
}

# Per-row squared error.
for label, pred_col in rmse_sources.items():
    predict[f'SquaredError_{label}'] = (predict[pred_col] - predict['y_test'])**2

# Average the squared errors per model, then take the square root.
# The string "mean" is used instead of np.mean: passing NumPy callables to
# DataFrame.agg is deprecated in recent pandas and yields the same result.
mean_squared_errors = predict.agg(
    {f'SquaredError_{label}': 'mean' for label in rmse_sources}
).reset_index()

mean_squared_errors.columns = ['Model', 'MeanSquaredError']

mean_squared_errors['RMSE'] = np.sqrt(mean_squared_errors['MeanSquaredError'])

print(mean_squared_errors)



                                Model  MeanSquaredError        RMSE
0       SquaredError_LinearRegression     618655.576085  786.546614
1                  SquaredError_Lasso     616541.375473  785.201487
2          SquaredError_LGBMRegressor     626642.262371  791.607392
3  SquaredError_RandomForestRegressor     666090.464007  816.143654
4              SquaredError_avg_three     574619.664875  758.036717
5            SquaredError_avg_lgbm_rf     621279.276747  788.212710
6            SquaredError_avg_lgbm_la     572052.283672  756.341380
7              SquaredError_avg_rf_la     576881.211013  759.526965


### 변수 중요도 파악 

### permutation importance -> RandomForest, LGBM

In [None]:
# permutation importance (LightGBM)

def calculate_permutation_importance(model, X_train, y_train, X_test, y_test,
                                     output_path='feature_importances_lgbm.csv'):
    """Compute permutation importances on the train and test sets and save them.

    Parameters:
        model: an already fitted estimator.
        X_train, y_train: training predictors and target.
        X_test, y_test: hold-out predictors and target.
        output_path: CSV file the table is written to (CP949-encoded).
            Defaults to the file name this cell has always used.

    Returns:
        DataFrame with one row per column of ``X_train`` and the columns
        feature, perm_importance_train and perm_importance_test, sorted by the
        training-set importance in descending order.
    """
    # Mean importance over 5 shuffles per feature, on each data set.
    perm_importance_train = permutation_importance(model, X_train, y_train, n_repeats=5, random_state=42)
    perm_importance_test = permutation_importance(model, X_test, y_test, n_repeats=5, random_state=42)

    # Convert the NumPy arrays to plain lists for the DataFrame.
    perm_importances_train = perm_importance_train.importances_mean.tolist()
    perm_importances_test = perm_importance_test.importances_mean.tolist()

    # Pair each feature name with its importance values.
    feature_importances = pd.DataFrame({
        'feature': X_train.columns,
        'perm_importance_train': perm_importances_train,
        'perm_importance_test': perm_importances_test,
    })

    # Most important (on the training set) first.
    feature_importances = feature_importances.sort_values(by='perm_importance_train', ascending=False)

    feature_importances.to_csv(output_path, index=False, encoding="cp949")

    return feature_importances

# Load the persisted LightGBM model.
lgbm_model = joblib.load('model_LGBMRegressor.joblib')

# Compute (and save) its permutation importances.
feature_importances = calculate_permutation_importance(lgbm_model, X_train, y_train, X_test, y_test)

#print(scores_df)
print(feature_importances)

In [None]:
# permutation importance (random forest)

def calculate_permutation_importance(model, X_train, y_train, X_test, y_test,
                                     output_path='feature_importances_Rf.csv'):
    """Compute permutation importances on the train and test sets and save them.

    Parameters:
        model: an already fitted estimator.
        X_train, y_train: training predictors and target.
        X_test, y_test: hold-out predictors and target.
        output_path: CSV file the table is written to (CP949-encoded).
            Defaults to the file name this cell has always used.

    Returns:
        DataFrame with one row per column of ``X_train`` and the columns
        feature, perm_importance_train and perm_importance_test, sorted by the
        training-set importance in descending order.
    """
    # Mean importance over 5 shuffles per feature, on each data set.
    perm_importance_train = permutation_importance(model, X_train, y_train, n_repeats=5, random_state=42)
    perm_importance_test = permutation_importance(model, X_test, y_test, n_repeats=5, random_state=42)

    perm_importances_train = perm_importance_train.importances_mean.tolist()
    perm_importances_test = perm_importance_test.importances_mean.tolist()

    # Pair each feature name with its importance values.
    feature_importances = pd.DataFrame({
        'feature': X_train.columns,
        'perm_importance_train': perm_importances_train,
        'perm_importance_test': perm_importances_test,
    })

    # Most important (on the training set) first.
    feature_importances = feature_importances.sort_values(by='perm_importance_train', ascending=False)

    feature_importances.to_csv(output_path, index=False, encoding="cp949")

    return feature_importances

# Load the persisted random-forest model.  It was previously bound to a
# variable called `lgbm_model`, which mislabelled it as the LightGBM model.
rf_model = joblib.load('model_RandomForestRegressor.joblib')

feature_importances = calculate_permutation_importance(rf_model, X_train, y_train, X_test, y_test)

print(feature_importances)

In [None]:
# Report which predictors the persisted Lasso model kept, i.e. those whose
# coefficient is not exactly zero.
lasso_model = joblib.load('model_Lasso.joblib')
lasso_coef = pd.Series(lasso_model.coef_, index=X_train.columns)
nonzero_mask = lasso_coef.ne(0)
selected_feats_lasso = lasso_coef.loc[nonzero_mask].index

print("Number of features selected by Lasso: ", len(selected_feats_lasso))
print("Features selected by Lasso: ", selected_feats_lasso)


In [None]:
def plot_feature_importances(model, model_name):
    """Draw a horizontal bar chart of a fitted model's built-in feature importances.

    Parameters:
        model: fitted estimator exposing ``feature_importances_``.
        model_name: label shown in the plot title so the charts of different
            models can be told apart.

    Feature names are read from the notebook-level ``X_train`` columns, so the
    model must have been fitted on those columns in that order.
    """
    importances = model.feature_importances_
    feat_names = X_train.columns
    feature_imp = pd.Series(importances, index=feat_names).sort_values(ascending=False)

    #print("Feature importances for ", model_name, " : ")
    #print(feature_imp)

    # Bars are ordered from most to least important.
    plt.figure(figsize = (20,15))
    sns.barplot(x=feature_imp, y=feature_imp.index)
    plt.xlabel('Feature Importance Score')
    plt.ylabel('Features')
    plt.title(f"Visualizing Important Features ({model_name})")
    # No plt.legend(): the bars carry no labels, so the call only produced a
    # "no artists with labels" warning.
    plt.show()

lgbm_model = joblib.load('model_LGBMRegressor.joblib')
plot_feature_importances(lgbm_model, 'LGBMRegressor')

rf_model = joblib.load('model_RandomForestRegressor.joblib')
plot_feature_importances(rf_model, 'RandomForestRegressor')
