# 1. dataset 

In [15]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time 
import optuna
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import font_manager

# sklearn 관련
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.inspection import permutation_importance
import statsmodels.api as sm 
from time import time

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


# 전처리 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

# lightgbm 관련
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

In [16]:
# Configure a Korean-capable font so Hangul labels render in matplotlib figures.
# NOTE(review): the path below is under /System/Library/Fonts, which looks
# macOS-specific — confirm or adjust before running on another platform.
font_path = '/System/Library/Fonts/AppleSDGothicNeo.ttc'
font_name = font_manager.FontProperties(fname=font_path).get_name()
plt.rc('font', family=font_name)

In [17]:
# Load the combined dataset (cp949-encoded CSV).
# The former bare `data.head()` in the middle of this cell was removed: only
# the last expression of a cell is displayed, so its result was discarded.
data = pd.read_csv('combined_data.csv', encoding = "cp949")

# Optional: restrict the analysis to a single district.
#data = data[data['pick_rgn2_nm']=='강남구']
print(data.shape) 

(194625, 13)


In [18]:
# Column dtypes and non-null counts. DataFrame.info() writes its report to
# stdout itself and returns None, so wrapping it in print() only appended a
# stray "None" line to the output.
data.info()

# Summary statistics of the numeric columns, used to eyeball outliers.
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194625 entries, 0 to 194624
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   pick_rgn2_nm   194625 non-null  object 
 1   rider_cnt      194625 non-null  float64
 2   datetime       194625 non-null  object 
 3   hour_reg       194625 non-null  int64  
 4   reg_date       194625 non-null  object 
 5   day_of_reg     194625 non-null  object 
 6   is_holiday     194625 non-null  int64  
 7   is_rain        194625 non-null  int64  
 8   rider_cnt_w_1  194625 non-null  float64
 9   rider_cnt_w_2  194625 non-null  float64
 10  rider_cnt_w_3  194625 non-null  float64
 11  rider_cnt_w_4  194625 non-null  float64
 12  order_cnt_w_1  194625 non-null  int64  
dtypes: float64(5), int64(4), object(4)
memory usage: 19.3+ MB
None
           rider_cnt       hour_reg     is_holiday        is_rain   
count  194625.000000  194625.000000  194625.000000  194625.000000  \
mean    

In [19]:
# Parse the timestamp and date columns (loaded as object dtype) into datetime64.
data["datetime"] = pd.to_datetime(data["datetime"])
data["reg_date"] = pd.to_datetime(data["reg_date"])

# Order rows chronologically; the original index labels are kept.
data = data.sort_values(by="datetime")

In [None]:
# data = data.drop(columns = ['is_crm'])
# print(data.head())

In [None]:
# data = data.dropna(subset=['rider_cnt_w_4'])
# Count missing values per column; as the cell's last expression it is displayed.
data.isna().sum()
#print(data.shape) 

In [20]:
# Columns treated as categorical: day of week, district, hour, holiday flag, rain flag.
data = data.astype({
    name: 'category'
    for name in ['day_of_reg', 'pick_rgn2_nm', 'hour_reg', 'is_holiday', 'is_rain']
})

print(data.dtypes)


pick_rgn2_nm           category
rider_cnt               float64
datetime         datetime64[ns]
hour_reg               category
reg_date         datetime64[ns]
day_of_reg             category
is_holiday             category
is_rain                category
rider_cnt_w_1           float64
rider_cnt_w_2           float64
rider_cnt_w_3           float64
rider_cnt_w_4           float64
order_cnt_w_1             int64
dtype: object


# 2. 데이터 전처리

In [None]:
# numeric 변수 scale 
# scaler = StandardScaler()  #평균 0 , 분산 1로 조정
# #scaler = MinMaxScaler()

# num_vars = ['rider_cnt_2', 'rider_cnt_w_1', 'rider_cnt_w_2', 'rider_cnt_w_3',
#             'rider_cnt_w_4', 'order_cnt_w_1', 'order_cnt_w_2', 'order_cnt_w_3',
#             'order_cnt_w_4']
# data[num_vars] = scaler.fit_transform(data[num_vars])

# print(df.head(3))

## 2-1. one-hot-encoding

In [None]:
# ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
# X = np.array(ct.fit_transform(X))

In [21]:
#df = data.drop(columns = ['rider_cnt_w_3','rider_cnt_w_4'])
df = data
# Categorical columns to expand into one indicator column per category level.
var = ['day_of_reg','pick_rgn2_nm', 'hour_reg', 'is_holiday', 'is_rain']
# Default encoder settings: every level gets its own column (no level is dropped).
encoder = OneHotEncoder()
# Densify the encoder output and reuse data's index so the concat below lines up row-for-row.
onehot = pd.DataFrame(encoder.fit_transform(data[var]).toarray(), columns=encoder.get_feature_names_out(var), index = data.index)
# Indicator columns first, followed by the remaining (non-encoded) original columns.
df = pd.concat([onehot, df.drop(columns=var)], axis=1)
#print(df.head(3))
print(df.columns)

Index(['day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_월요일',
       'day_of_reg_일요일', 'day_of_reg_토요일', 'day_of_reg_화요일',
       'pick_rgn2_nm_강남구', 'pick_rgn2_nm_강동구', 'pick_rgn2_nm_강북구',
       'pick_rgn2_nm_강서구', 'pick_rgn2_nm_관악구', 'pick_rgn2_nm_광진구',
       'pick_rgn2_nm_구로구', 'pick_rgn2_nm_금천구', 'pick_rgn2_nm_노원구',
       'pick_rgn2_nm_도봉구', 'pick_rgn2_nm_동대문구', 'pick_rgn2_nm_동작구',
       'pick_rgn2_nm_마포구', 'pick_rgn2_nm_서대문구', 'pick_rgn2_nm_서초구',
       'pick_rgn2_nm_성동구', 'pick_rgn2_nm_성북구', 'pick_rgn2_nm_송파구',
       'pick_rgn2_nm_양천구', 'pick_rgn2_nm_영등포구', 'pick_rgn2_nm_용산구',
       'pick_rgn2_nm_은평구', 'pick_rgn2_nm_종로구', 'pick_rgn2_nm_중구',
       'pick_rgn2_nm_중랑구', 'hour_reg_9', 'hour_reg_10', 'hour_reg_11',
       'hour_reg_12', 'hour_reg_13', 'hour_reg_14', 'hour_reg_15',
       'hour_reg_16', 'hour_reg_17', 'hour_reg_18', 'hour_reg_19',
       'hour_reg_20', 'hour_reg_21', 'hour_reg_22', 'hour_reg_23',
       'is_holiday_0', 'is_holiday_1', 'is_rain_0'

# 3. train, test set split

In [22]:
# train_ratio = 0.8
# total_samples = df.shape[0]
# train_samples = int(train_ratio * total_samples)
# df_train = df[:train_samples]
# df_test = df[train_samples:]

# Chronological hold-out: everything up to 2023-03-31 trains, 2023-04-01 onward tests.
df_train = df[df["reg_date"]<= '2023-03-31']
df_test = df[df["reg_date"] >= '2023-04-01']

# Sanity-check the date range of each split (values noted are from the recorded run).
print(df_train['reg_date'].min()) # 2022-02-01
print(df_train['reg_date'].max()) # 2023-03-31
print(df_test['reg_date'].min()) # 2023-04-01
print(df_test['reg_date'].max()) # 2023-07-04

# The raw timestamp columns are not used as model features.
df_train = df_train.drop(columns = ['datetime', 'reg_date'])
df_test = df_test.drop(columns = ['datetime', 'reg_date'])
print(df_train.shape, df_test.shape) 

2022-02-01 00:00:00
2023-03-31 00:00:00
2023-04-01 00:00:00
2023-07-04 00:00:00
(159000, 57) (35625, 57)


In [23]:
# X_train, y_train 나누기
 
# X_train = train.iloc[:, :-1]
# y_train = df_train.iloc[:, -1]

# X_test = df_test.iloc[:, :-1]
# y_test = df_test.iloc[:, -1]

# Target is rider_cnt; every other remaining column is a feature.
X_train = df_train.drop(columns=['rider_cnt'])
y_train = df_train['rider_cnt']

X_test = df_test.drop(columns=['rider_cnt'])
y_test = df_test['rider_cnt']

# Feature matrices and targets should agree on row counts within each split.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(159000, 56) (159000,) (35625, 56) (35625,)


In [None]:
# List the final feature names fed to the models.
print(X_train.columns)

### numeric_scale 

In [None]:
# # 입력 변수 
# numeric_cols = ['rider_cnt_w_1', 'rider_cnt_w_2', 'rider_cnt_w_3',
#                 'rider_cnt_w_4', 'order_cnt_w_1', 'order_cnt_w_2', 'order_cnt_w_3',
#                 'order_cnt_w_4']

# # scaler 
# scaler_X = StandardScaler()

# # X_train, X_test
# X_train_scaled = scaler_X.fit_transform(X_train[numeric_cols])
# X_test_scaled = scaler_X.transform(X_test[numeric_cols])

# # 스케일링된 결과를 DataFrame으로 변환
# X_train_scaled = pd.DataFrame(X_train_scaled, columns=numeric_cols, index = X_train.index)
# X_test_scaled = pd.DataFrame(X_test_scaled, columns=numeric_cols, index = X_test.index)

# # 원래의 범주형 변수들을 선택
# categorical_cols = [col for col in X_train.columns if col not in numeric_cols]
# X_train_cat = X_train[categorical_cols]
# X_test_cat = X_test[categorical_cols]

# # 스케일링된 DataFrame과 범주형 변수들을 병합
# X_train_final = pd.concat([X_train_scaled, X_train_cat], axis=1)
# X_test_final = pd.concat([X_test_scaled, X_test_cat], axis=1)



In [None]:
# 예측값
# y_train, y_test
# scaler_y = StandardScaler()
# y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
# y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))
# y_train_scaled = y_train_scaled.ravel()
# y_test_scaled=  y_test_scaled.ravel()

# print(y_train_scaled.shape)
# print(y_test_scaled.shape)

# 3. regression - benchmark model

In [24]:
# Benchmark model: statsmodels OLS with an explicit intercept column.
# NOTE(review): the one-hot encoding keeps every level of each categorical
# variable, so adding a constant presumably makes the design matrix collinear —
# confirm before interpreting individual coefficients.
X_train_lm = sm.add_constant(X_train)

lr_1 = sm.OLS(y_train, X_train_lm).fit()

print(lr_1.summary())

                            OLS Regression Results                            
Dep. Variable:              rider_cnt   R-squared:                       0.964
Model:                            OLS   Adj. R-squared:                  0.964
Method:                 Least Squares   F-statistic:                 8.279e+04
Date:                Wed, 05 Jul 2023   Prob (F-statistic):               0.00
Time:                        12:58:53   Log-Likelihood:            -7.9489e+05
No. Observations:              159000   AIC:                         1.590e+06
Df Residuals:                  158948   BIC:                         1.590e+06
Df Model:                          51                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                17.9467      0.21

In [None]:
# Fit a plain linear regression on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Test-set residuals: actual minus predicted.
y_pred = model.predict(X_test)
residuals = y_test - y_pred

# One residual scatter plot per feature column.
for column in X_test.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x=X_test[column], y=residuals, ax=ax)
    ax.set_title(f'Residuals vs. {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Residuals')
    plt.show()


# 4. Machine Learning Modeling

## 4-1. 하이퍼파라미터 튜닝 - Grid Search 

### a. LightGBM model 

In [None]:
# classifier = LGBMRegressor()

# parameters = [{'learning_rate': [0.1, 0.05, 0.01, 0.005], 'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7]},
#               {'learning_rate': [0.15, 0.125, 0.1, 0.075], 'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7], 'num_leaves': [16, 32, 64]}]

# grid_search = GridSearchCV(estimator=classifier,
#                            param_grid=parameters,
#                            scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'],
#                            cv=10,
#                            n_jobs=-1,
#                            refit='neg_mean_squared_error')                     

# grid_search.fit(X_train, y_train)
# best_rmse = np.sqrt(-1 * grid_search.cv_results_['mean_test_neg_mean_squared_error'][grid_search.best_index_])
# best_mae = -1 * grid_search.cv_results_['mean_test_neg_mean_absolute_error'][grid_search.best_index_]
# best_parameters = grid_search.best_params_
# print("Best RMSE: {:.2f}".format(best_rmse))
# print("Best MAE: {:.2f}".format(best_mae))
# print("Best Parameters:", best_parameters)


In [None]:
def objective(trial):
    """Optuna objective for LightGBM: cross-validated RMSE on the training split.

    Samples learning_rate, n_estimators, max_depth and num_leaves from the
    trial, runs 10-fold cross-validation of an LGBMRegressor on the
    notebook-level X_train / y_train, and returns the square root of the mean
    fold MSE (lower is better).
    """
    search_space = dict(
        learning_rate=trial.suggest_float("learning_rate", 0.005, 0.2),
        n_estimators=trial.suggest_int("n_estimators", 50, 200),
        max_depth=trial.suggest_int("max_depth", 3, 7),
        num_leaves=trial.suggest_int("num_leaves", 16, 64),
    )

    # The scorer returns negated MSE per fold; flip the sign before the root.
    fold_neg_mse = cross_val_score(
        LGBMRegressor(**search_space),
        X_train,
        y_train,
        cv=10,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    return np.sqrt(-1 * np.mean(fold_neg_mse))

# Seed the sampler so the 50-trial search proposes the same hyperparameters on
# every run; without a seed the reported best parameters cannot be reproduced.
study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

print("Best RMSE: {:.2f}".format(study.best_value))
print("Best Parameters:", study.best_params)

#Best RMSE: 32.67
#Best Parameters:  {'learning_rate': 0.03326315420009047, 'n_estimators': 177, 'max_depth': 5, 'num_leaves': 31}

### b. ridge regression

In [None]:
# Ridge regression: 10-fold grid search over the regularisation strength alpha,
# scored by negated MSE.
# NOTE(review): the recorded best alpha (10.0) is the largest value in this
# grid, so the optimum may lie beyond it — consider widening the grid.
ridge = Ridge()
ridge_param_grid = {'alpha': [0.1, 1.0, 2.0, 5.0, 10.0]}
ridge_grid_search = GridSearchCV(estimator=ridge, param_grid=ridge_param_grid, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
ridge_grid_search.fit(X_train, y_train)
# best_score_ is the negated mean MSE of the best alpha; convert it to RMSE.
print("Ridge Best RMSE: {:.2f}".format(np.sqrt(-ridge_grid_search.best_score_)))
print("Ridge Best Parameters: ", ridge_grid_search.best_params_)

# Ridge Best RMSE: 27.83
# Ridge Best Parameters:  {'alpha': 10.0}

### c. Lasso regression

In [None]:
# Lasso regression: 10-fold grid search over alpha, scored by negated MSE.
# NOTE(review): the recorded best alpha (0.1) is the smallest value in this
# grid; also, tuning uses max_iter=10000 while execute_pipeline fits Lasso with
# max_iter=5000 — confirm the difference is intended.
lasso = Lasso(max_iter = 10000)
lasso_param_grid = {'alpha': [0.1, 1.0,2.0, 5.0, 10.0]}
lasso_grid_search = GridSearchCV(estimator=lasso, param_grid=lasso_param_grid, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
lasso_grid_search.fit(X_train, y_train)
# best_score_ is the negated mean MSE of the best alpha; convert it to RMSE.
print("Lasso Best RMSE: {:.2f}".format(np.sqrt(-lasso_grid_search.best_score_)))
print("Lasso Best Parameters: ", lasso_grid_search.best_params_)

# Lasso Best RMSE: 28.83
# Lasso Best Parameters:  {'alpha': 0.1}

### d. Random Forest Regressor

In [None]:
def objective(trial):
    """Optuna objective for RandomForest: cross-validated RMSE on the training split.

    Samples n_estimators, max_depth and min_samples_split from the trial, runs
    10-fold cross-validation of a RandomForestRegressor on the notebook-level
    X_train / y_train, and returns the square root of the mean fold MSE
    (lower is better).

    Note: this definition replaces the LightGBM objective of the same name
    defined earlier in the notebook.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        'min_samples_split': trial.suggest_int("min_samples_split", 2, 10),
    }

    # Fix the forest's seed (same value as the final model in execute_pipeline)
    # so score differences between trials come from the hyperparameters rather
    # than from bootstrap / feature-sampling noise.
    model = RandomForestRegressor(random_state=0, **params)

    # The scorer returns negated MSE per fold; flip the sign before the root.
    score = cross_val_score(model, X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
    rmse = np.sqrt(-1 * np.mean(score))

    return rmse

# Seed the sampler so the 50-trial search proposes the same hyperparameters on
# every run; without a seed the reported best parameters cannot be reproduced.
study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

print("Best RMSE: {:.2f}".format(study.best_value))
print("Best Parameters:", study.best_params)

#Best RMSE: 33.14
#Best Parameters: {'n_estimators': 112, 'max_depth': 7, 'min_samples_split': 4}

In [None]:
# Random Forest Regressor
# rfr = RandomForestRegressor(random_state=0)
# rfr_param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7, 9], 'min_samples_split': [2, 5, 10]}
# rfr_grid_search = GridSearchCV(estimator=rfr, param_grid=rfr_param_grid, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
# rfr_grid_search.fit(X_train, y_train)
# print("Random Forest Regressor Best RMSE: {:.2f}".format(np.sqrt(-rfr_grid_search.best_score_)))
# print("Random Forest Regressor Best Parameters: ", rfr_grid_search.best_params_)


## 4-2. train, test set 적용 

### a. train, test rmse, mae

In [25]:
def MAPE(y_test, y_pred):
    """Mean absolute percentage error, in percent.

    Parameters
    ----------
    y_test : scalar or array-like
        Actual values (the denominator of each percentage error).
    y_pred : scalar or array-like
        Predicted values, matched to ``y_test`` by position.

    Returns
    -------
    numpy.float64
        ``mean(|y_test - y_pred| / |y_test|) * 100``. An actual value of zero
        makes the result ``inf`` (or ``nan`` when the prediction is zero too).
    """
    # Coerce to float arrays so plain Python lists and scalars work as well as
    # numpy arrays and pandas Series.
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Zero actuals are left to produce inf/nan without emitting runtime warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.mean(np.abs((actual - predicted) / actual)) * 100

In [26]:
# Row-level metadata for each split. Filter on reg_date with the same cut-offs
# used to build df_train / df_test, so these frames contain exactly the rows the
# models see. Filtering on the timestamp column instead compares against
# midnight of 2023-03-31 and drops that day's later rows from train_set.
train_set = data[data["reg_date"] <= '2023-03-31']
test_set = data[data["reg_date"] >= '2023-04-01']

def execute_pipeline(X_train, y_train, X_test, y_test):
    """Cross-validate, fit and evaluate the tuned models, then export test predictions.

    For each of Lasso, LGBMRegressor and RandomForestRegressor (hyperparameters
    hard-coded below) this runs 10-fold cross-validation on the training data,
    refits on the full training data, scores the test data and saves the fitted
    estimator to ``model_<ClassName>.joblib``. The per-model predictions plus
    four simple averaging ensembles are written to
    ``prediction_results_test_set.csv`` (cp949-encoded).

    The descriptive columns of the exported CSV come from the notebook-level
    ``test_set`` frame; predictions are attached to its rows by position.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : held-out features and target.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``cv_rmse``, ``cv_rmse_std``,
        ``Test RMSE``, ``Test MAE``, ``Test MAPE`` and ``Test R2``.

    Raises
    ------
    ValueError
        If ``test_set`` and ``X_test`` do not have the same number of rows.
    """
    # Fail fast: predictions are attached to test_set rows by position, so a
    # row-count mismatch must be caught before the expensive CV and fitting.
    if len(test_set) != len(X_test):
        raise ValueError(
            f"test_set has {len(test_set)} rows but X_test has {len(X_test)}; "
            "predictions cannot be attached to the metadata rows."
        )

    regressors = [
        Lasso(alpha=0.1, max_iter=5000),
        LGBMRegressor(learning_rate =  0.03326315420009047, n_estimators =  177, max_depth =  5, num_leaves = 31),
        RandomForestRegressor(random_state=0, max_depth=7, min_samples_split=4, n_estimators=112),
    ]

    # Identical for every model, so it is built once outside the loop.
    scoring = {
        'rmse' : 'neg_root_mean_squared_error',
        'mae' : 'neg_mean_absolute_error',
        'r2' : 'r2'
    }

    # Descriptive columns for the export; note the 'datetime' column holds reg_date.
    result_test = pd.DataFrame({'datetime': test_set["reg_date"],
                                'pick_rgn2_nm': test_set["pick_rgn2_nm"], 'hour_reg': test_set["hour_reg"],
                                'is_rain': test_set["is_rain"], 'day_of_reg': test_set["day_of_reg"], "is_holiday" : test_set["is_holiday"],
                                'y_test': test_set["rider_cnt"]})

    scores = {}
    predictions = {}

    for reg in regressors:
        reg_name = reg.__class__.__name__

        cv_results = cross_validate(reg, X_train, y_train, cv = 10, scoring = scoring)

        # The scorer yields negated RMSE per fold: flip the sign for the mean;
        # the standard deviation is unaffected by the sign.
        cv_rmse = -np.mean(cv_results['test_rmse'])
        cv_rmse_std = np.std(cv_results['test_rmse'])

        # Refit on the full training data and score the hold-out set.
        reg.fit(X_train, y_train)
        y_pred_test = reg.predict(X_test)

        rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
        mae_test = mean_absolute_error(y_test, y_pred_test)
        mape_test = MAPE(y_test, y_pred_test)
        r2_test = r2_score(y_test, y_pred_test)

        # Persist the fitted model for the later importance / coefficient cells.
        model_file = f'model_{reg_name}.joblib'
        joblib.dump(reg, model_file)

        scores[reg_name] = {
            'cv_rmse' : cv_rmse,
            'cv_rmse_std' : cv_rmse_std,
            'Test RMSE': rmse_test,
            'Test MAE': mae_test,
            'Test MAPE': mape_test,
            'Test R2': r2_test
        }

        result_test[f'y_pred_test_{reg_name}'] = y_pred_test

        predictions[reg_name] = y_pred_test

    # Simple averaging ensembles of the three models' test predictions.
    lasso_pred = predictions['Lasso']
    lgbm_pred = predictions['LGBMRegressor']
    rf_pred = predictions['RandomForestRegressor']
    average_three = (lasso_pred + lgbm_pred + rf_pred) / 3
    average_lgbm_rf = (lgbm_pred+rf_pred) /2
    average_lgbm_la = (lgbm_pred + lasso_pred) /2
    average_rf_la = (lasso_pred+rf_pred) / 2

    result_test['y_pred_avg_three'] = average_three
    result_test['y_pred_avg_lgbm_rf'] = average_lgbm_rf
    result_test['y_pred_avg_lgbm_la'] = average_lgbm_la
    result_test['y_pred_avg_rf_la'] = average_rf_la

    scores_df = pd.DataFrame(scores).transpose()

    # Export the test-set predictions for the downstream analysis cells.
    result_test.to_csv('prediction_results_test_set.csv', index=False, encoding="cp949")

    return scores_df

# Run the full pipeline; besides returning the score table it writes the model
# .joblib files and prediction_results_test_set.csv to the working directory.
scores_df = execute_pipeline(X_train, y_train, X_test, y_test)
print(scores_df)


                         cv_rmse  cv_rmse_std  Test RMSE   Test MAE   
Lasso                  35.368576    11.182028  34.239274  22.943499  \
LGBMRegressor          35.192488    10.862759  34.480149  23.061512   
RandomForestRegressor  35.667590    10.847214  35.577825  23.853034   

                       Test MAPE   Test R2  
Lasso                   7.922414  0.969098  
LGBMRegressor           7.954434  0.968661  
RandomForestRegressor   8.348121  0.966634  


In [27]:
# Recompute MAE, RMSE and MAPE for each single model and each averaging
# ensemble from the exported predictions file.
predict = pd.read_csv('prediction_results_test_set.csv', encoding = "cp949")

# NOTE(review): `metrics` is not referenced again in this cell.
metrics = ['MAE', 'RMSE', 'MAPE']
# Suffixes of the y_pred_<suffix> columns in the predictions file.
models = ['test_Lasso', 'test_LGBMRegressor', 'test_RandomForestRegressor', 'avg_three', 'avg_lgbm_rf', 'avg_lgbm_la', 'avg_rf_la']

results = []

# Note: this loop rebinds the notebook-level names `model` and `y_pred`
# that the linear-regression residual cell assigned earlier.
for model in models:
    y_true = predict['y_test']
    y_pred = predict[f'y_pred_{model}']

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = MAPE(y_true, y_pred)

    results.append({
        'model': model,
        'MAE': mae,
        'RMSE': rmse,
        'MAPE': mape
    })

# One row per model / ensemble.
result_df = pd.DataFrame(results)
print(result_df)


                        model        MAE       RMSE      MAPE
0                  test_Lasso  22.943499  34.239274  7.922414
1          test_LGBMRegressor  23.061512  34.480149  7.954434
2  test_RandomForestRegressor  23.853034  35.577825  8.348121
3                   avg_three  22.946941  34.233656  7.925225
4                 avg_lgbm_rf  23.192859  34.673114  8.039537
5                 avg_lgbm_la  22.809012  33.979166  7.860082
6                   avg_rf_la  23.096305  34.450128  7.993081


### 세분화하여 MAE 값 확인

In [None]:
# Per-row absolute error of every model / ensemble, then averaged by hour of day.
predict = pd.read_csv('prediction_results_test_set.csv', encoding = "cp949")

# Output error column -> prediction column it is computed from.
mae_columns = {
    'MAE_Lasso': 'y_pred_test_Lasso',
    'MAE_LGBMRegressor': 'y_pred_test_LGBMRegressor',
    'MAE_RandomForestRegressor': 'y_pred_test_RandomForestRegressor',
    'MAE_avg_three': 'y_pred_avg_three',
    'MAE_avg_lgbm_rf': 'y_pred_avg_lgbm_rf',
    'MAE_avg_lgbm_la': 'y_pred_avg_lgbm_la',
    'MAE_avg_rf_la': 'y_pred_avg_rf_la',
}

for mae_col, pred_col in mae_columns.items():
    predict[mae_col] = (predict[pred_col] - predict['y_test']).abs()

# The string 'mean' selects pandas' own group-wise mean; passing the np.mean
# callable to agg() is deprecated in recent pandas releases.
result = predict.groupby(['hour_reg']).agg(
    {mae_col: 'mean' for mae_col in mae_columns}
).reset_index()

print(result)



In [None]:
# Per-row absolute percentage error of every model / ensemble, averaged by hour of day.
predict = pd.read_csv('prediction_results_test_set.csv', encoding = "cp949")

# Output error column -> prediction column it is computed from.
mape_columns = {
    'MAPE_Lasso': 'y_pred_test_Lasso',
    'MAPE_LGBMRegressor': 'y_pred_test_LGBMRegressor',
    'MAPE_RandomForestRegressor': 'y_pred_test_RandomForestRegressor',
    'MAPE_avg_three': 'y_pred_avg_three',
    'MAPE_avg_lgbm_rf': 'y_pred_avg_lgbm_rf',
    'MAPE_avg_lgbm_la': 'y_pred_avg_lgbm_la',
    'MAPE_avg_rf_la': 'y_pred_avg_rf_la',
}

# Vectorised |actual - predicted| / |actual| * 100: the same value the row-wise
# MAPE call produced for a single observation, without a Python-level loop
# over every row.
for mape_col, pred_col in mape_columns.items():
    predict[mape_col] = ((predict['y_test'] - predict[pred_col]) / predict['y_test']).abs() * 100

# The string 'mean' selects pandas' own group-wise mean; passing the np.mean
# callable to agg() is deprecated in recent pandas releases.
result = predict.groupby(['hour_reg']).agg(
    {mape_col: 'mean' for mape_col in mape_columns}
).reset_index()

print(result)

### 변수 중요도 파악 

### permutation importance -> RandomForest, LGBM

In [None]:
# permutation importance 

def calculate_permutation_importance(model, X_train, y_train, X_test, y_test,
                                     output_path='feature_importances_lgbm.csv'):
    """Compute permutation importance on the train and test sets and save it as CSV.

    Parameters
    ----------
    model : fitted estimator passed to sklearn's ``permutation_importance``.
    X_train, y_train : training features (a DataFrame; its columns label the
        output rows) and target.
    X_test, y_test : held-out features and target.
    output_path : str, optional
        Destination CSV (written cp949-encoded). Defaults to the LightGBM file
        name this function originally hard-coded, so existing calls are
        unaffected; pass another path to reuse the function for other models.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``perm_importance_train`` and
        ``perm_importance_test`` (mean importance over 5 repeats), sorted by
        the training-set importance in descending order.
    """
    # Same repeat count and seed for both splits so the two columns are comparable.
    train_result = permutation_importance(model, X_train, y_train, n_repeats=5, random_state=42)
    test_result = permutation_importance(model, X_test, y_test, n_repeats=5, random_state=42)

    # Map feature names to their mean importances and rank by the train-set value.
    feature_importances = pd.DataFrame({
        'feature': X_train.columns,
        'perm_importance_train': train_result.importances_mean.tolist(),
        'perm_importance_test': test_result.importances_mean.tolist(),
    }).sort_values(by='perm_importance_train', ascending=False)

    feature_importances.to_csv(output_path, index=False, encoding="cp949")

    return feature_importances

# Load the fitted LightGBM model from its joblib file.
lgbm_model = joblib.load('model_LGBMRegressor.joblib')

# Compute permutation importance on both splits (also written to CSV by the helper).
feature_importances = calculate_permutation_importance(lgbm_model, X_train, y_train, X_test, y_test)

#print(scores_df)
print(feature_importances)

In [None]:
# permutation importance 

def calculate_permutation_importance(model, X_train, y_train, X_test, y_test,
                                     output_path='feature_importances_Rf.csv'):
    """Compute permutation importance on the train and test sets and save it as CSV.

    This definition replaces the earlier function of the same name in the
    notebook; the two differed only in the output file name, which is now a
    parameter.

    Parameters
    ----------
    model : fitted estimator passed to sklearn's ``permutation_importance``.
    X_train, y_train : training features (a DataFrame; its columns label the
        output rows) and target.
    X_test, y_test : held-out features and target.
    output_path : str, optional
        Destination CSV (written cp949-encoded). Defaults to the random-forest
        file name this function originally hard-coded, so existing calls are
        unaffected.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``perm_importance_train`` and
        ``perm_importance_test`` (mean importance over 5 repeats), sorted by
        the training-set importance in descending order.
    """
    # Same repeat count and seed for both splits so the two columns are comparable.
    train_result = permutation_importance(model, X_train, y_train, n_repeats=5, random_state=42)
    test_result = permutation_importance(model, X_test, y_test, n_repeats=5, random_state=42)

    # Map feature names to their mean importances and rank by the train-set value.
    feature_importances = pd.DataFrame({
        'feature': X_train.columns,
        'perm_importance_train': train_result.importances_mean.tolist(),
        'perm_importance_test': test_result.importances_mean.tolist(),
    }).sort_values(by='perm_importance_train', ascending=False)

    feature_importances.to_csv(output_path, index=False, encoding="cp949")

    return feature_importances

# Load the fitted random forest saved by execute_pipeline. It is bound to
# rf_model rather than lgbm_model so the LightGBM model loaded in the previous
# cell is not silently replaced by a different estimator under the same name.
rf_model = joblib.load('model_RandomForestRegressor.joblib')

feature_importances = calculate_permutation_importance(rf_model, X_train, y_train, X_test, y_test)

print(feature_importances)

In [None]:
# Load the fitted Lasso model and label its coefficients with the feature names.
lasso_model = joblib.load('model_Lasso.joblib')
lasso_coef = pd.Series(lasso_model.coef_, index=X_train.columns)
# Features whose coefficient is not exactly zero, i.e. the ones Lasso kept.
selected_feats_lasso = lasso_coef[lasso_coef!=0].index

print("Number of features selected by Lasso: ", len(selected_feats_lasso))
print("Features selected by Lasso: ", selected_feats_lasso)


In [None]:
def plot_feature_importances(model, model_name):
    """Draw a horizontal bar chart of a fitted model's built-in feature importances.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    model_name : str
        Label shown in the plot title, so charts for different models can be
        told apart.

    Feature names are taken from the notebook-level ``X_train`` columns, so the
    model is assumed to have been fitted on that frame.
    """
    importances = model.feature_importances_
    feat_names = X_train.columns
    feature_imp = pd.Series(importances, index=feat_names).sort_values(ascending=False)

    # Bar plot with the most important feature at the top.
    plt.figure(figsize = (20,15))
    sns.barplot(x=feature_imp, y=feature_imp.index)
    plt.xlabel('Feature Importance Score')
    plt.ylabel('Features')
    plt.title(f"Visualizing Important Features - {model_name}")
    # No legend: the bars carry no labels, so plt.legend() only raised a
    # "no labelled artists" warning.
    plt.show()

# Plot built-in importances for the two saved tree-based models.
lgbm_model = joblib.load('model_LGBMRegressor.joblib')
plot_feature_importances(lgbm_model, 'LGBMRegressor')

rf_model = joblib.load('model_RandomForestRegressor.joblib')
plot_feature_importances(rf_model, 'RandomForestRegressor')


### stacking model 

In [None]:
from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

# Base learners, all with library-default hyperparameters (not the tuned
# values found earlier). Note: `lasso` rebinds the name used by the grid-search cell.
lasso = Lasso()
lr = LinearRegression()
rf = RandomForestRegressor()

# Meta model fitted on top of the base learners' predictions
gbr = GradientBoostingRegressor()

# Stacking model; use_features_in_secondary=True also feeds the original
# features to the meta model.
# NOTE(review): no random_state is set on the forest, the booster or the
# stacker, so results presumably vary between runs — confirm whether seeds are wanted.
stack = StackingCVRegressor(regressors=(lasso, lr, rf),
                            meta_regressor=gbr,
                            use_features_in_secondary=True)

# Train the stacking regressor on plain numpy arrays
stack.fit(X_train.values, y_train.values)

# Predict on the held-out set
y_pred = stack.predict(X_test.values)

# Evaluate the model
print('Stack Test RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
