# 1. dataset 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time 
import optuna
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import font_manager

# sklearn 관련
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.inspection import permutation_importance
import statsmodels.api as sm 
from time import time

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


# 전처리 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

# lightgbm 관련
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configure a Korean-capable font so Hangul text renders in matplotlib figures.
# NOTE(review): the path below looks like a macOS system font location —
# presumably it needs adjusting on other platforms; confirm before running elsewhere.
font_path = '/System/Library/Fonts/AppleSDGothicNeo.ttc'
font_name = font_manager.FontProperties(fname=font_path).get_name()
plt.rcParams['font.family'] = font_name

In [3]:
# Read the raw CSV (CP949-encoded) and report its dimensions.
data = pd.read_csv("combined_data.csv", encoding="cp949")
data.head()  # not the last expression of the cell, so nothing is displayed
print(data.shape)

(44625, 14)


In [4]:
# Column dtypes and non-null counts. DataFrame.info() writes its report to
# stdout itself and returns None, so it is called directly: wrapping it in
# print() would emit a stray "None" line after the report.
data.info()

# Summary statistics of the numeric columns, used to eyeball outliers.
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44625 entries, 0 to 44624
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   day_of_reg         44625 non-null  object 
 1   pick_rgn2_nm       44625 non-null  object 
 2   rider_cnt          44625 non-null  float64
 3   datetime           44625 non-null  object 
 4   hour_reg           44625 non-null  int64  
 5   reg_date           44625 non-null  object 
 6   rain_group         44625 non-null  object 
 7   holiday_yn         44625 non-null  object 
 8   rider_cnt_w_1_new  44625 non-null  float64
 9   rider_cnt_w_2_new  44625 non-null  float64
 10  rider_cnt_w_3_new  44625 non-null  float64
 11  rider_cnt_w_4_new  44625 non-null  float64
 12  order_cnt_w_1_new  44625 non-null  int64  
 13  group_s            44625 non-null  object 
dtypes: float64(5), int64(2), object(7)
memory usage: 4.8+ MB
None
          rider_cnt      hour_reg  rider_cnt_w_1_new  ri

In [5]:
# Parse both timestamp columns into datetime64, then order the rows chronologically.
for ts_col in ("datetime", "reg_date"):
    data[ts_col] = pd.to_datetime(data[ts_col])

data = data.sort_values("datetime")

In [None]:
# data = data.drop(columns = [''holiday_yn','hour_reg','day_of_reg','rain_group''])
# print(data.head())

In [6]:
# Count missing values per column.
# (Disabled) drop rows that lack the 4-week lag feature:
#   data = data.dropna(subset=['rider_cnt_w_4'])
data.isnull().sum()
# print(data.shape)

day_of_reg           0
pick_rgn2_nm         0
rider_cnt            0
datetime             0
hour_reg             0
reg_date             0
rain_group           0
holiday_yn           0
rider_cnt_w_1_new    0
rider_cnt_w_2_new    0
rider_cnt_w_3_new    0
rider_cnt_w_4_new    0
order_cnt_w_1_new    0
group_s              0
dtype: int64

In [8]:
# Cast the discrete columns to pandas' category dtype in a single astype call.
categorical_cols = ['pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'holiday_yn', 'rain_group', 'group_s']
data = data.astype({name: 'category' for name in categorical_cols})

print(data.dtypes)


day_of_reg                 category
pick_rgn2_nm               category
rider_cnt                   float64
datetime             datetime64[ns]
hour_reg                   category
reg_date             datetime64[ns]
rain_group                 category
holiday_yn                 category
rider_cnt_w_1_new           float64
rider_cnt_w_2_new           float64
rider_cnt_w_3_new           float64
rider_cnt_w_4_new           float64
order_cnt_w_1_new             int64
group_s                    category
dtype: object


# 2. 데이터 전처리

In [None]:
# numeric 변수 scale 
# scaler = StandardScaler()  #평균 0 , 분산 1로 조정
# #scaler = MinMaxScaler()

# num_vars = ['rider_cnt_2', 'rider_cnt_w_1', 'rider_cnt_w_2', 'rider_cnt_w_3',
#             'rider_cnt_w_4', 'order_cnt_w_1', 'order_cnt_w_2', 'order_cnt_w_3',
#             'order_cnt_w_4']
# data[num_vars] = scaler.fit_transform(data[num_vars])

# print(df.head(3))

## 2-1. one-hot-encoding

In [None]:
# ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
# X = np.array(ct.fit_transform(X))

In [9]:
# Modelling frame. One-hot encoding is currently disabled, so `df` is simply a
# second name for `data` (plain assignment, no copy is made).
#df = data.drop(columns = ['is_rain','is_holiday','day_of_reg'])
df = data
# (Disabled) one-hot encoding of the categorical columns:
# var = ['pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'rain_group', 'holiday_yn']
# encoder = OneHotEncoder()
# onehot = pd.DataFrame(encoder.fit_transform(data[var]).toarray(), columns=encoder.get_feature_names_out(var), index = data.index)
# df = pd.concat([onehot, df.drop(columns=var)], axis=1)
#print(df.head(3))
print(df.columns)

Index(['day_of_reg', 'pick_rgn2_nm', 'rider_cnt', 'datetime', 'hour_reg',
       'reg_date', 'rain_group', 'holiday_yn', 'rider_cnt_w_1_new',
       'rider_cnt_w_2_new', 'rider_cnt_w_3_new', 'rider_cnt_w_4_new',
       'order_cnt_w_1_new', 'group_s'],
      dtype='object')


# 3. train, test set split

In [10]:
# (Disabled) ratio-based chronological split:
# train_ratio = 0.8
# total_samples = df.shape[0]
# train_samples = int(train_ratio * total_samples)
# df_train = df[:train_samples]
# df_test = df[train_samples:]

# Date-based split: rows up to the end of April 2023 are meant for training,
# rows from 1 May 2023 onwards for testing.
# NOTE(review): `datetime` is a parsed timestamp that appears to carry a
# time-of-day component, so `<= '2023-04-30'` compares against
# 2023-04-30 00:00:00 and rows later on that day land in NEITHER split.
# The shapes printed by this notebook agree with that: 33,000 + 11,250 =
# 44,250 of the 44,625 loaded rows. Using `< '2023-05-01'` for the training
# mask would keep that day; any other cell that rebuilds the split from
# `data` has to stay consistent with whatever boundary is used here.
df_train = df[df["datetime"]<= '2023-04-30']
df_test = df[df["datetime"] >= '2023-05-01']

# Date ranges recorded from an earlier run:
# print(df_train['reg_date'].min()) #2022-01-29
# print(df_test['reg_date'].min()) #2023-02-15

# print(df_train['reg_date'].max()) #2023-02-15
# print(df_test['reg_date'].max()) #2023-05-21

# Keep only the target and the numeric lag features; timestamps and every
# categorical column are dropped from both splits.
df_train = df_train.drop(columns = ['datetime', 'reg_date', 'pick_rgn2_nm', 'day_of_reg','hour_reg','rain_group','holiday_yn','group_s'])
df_test = df_test.drop(columns = ['datetime', 'reg_date', 'pick_rgn2_nm', 'day_of_reg','hour_reg','rain_group','holiday_yn','group_s'])
print(df_train.shape, df_test.shape) # earlier runs: 126,000 / 54375; current data: 33000 / 11250

(33000, 6) (11250, 6)


In [11]:
# Split each frame into predictors (X) and the `rider_cnt` target (y).
X_train, y_train = df_train.drop(columns=['rider_cnt']), df_train['rider_cnt']
X_test, y_test = df_test.drop(columns=['rider_cnt']), df_test['rider_cnt']

# Shapes of the four pieces, printed on one line.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(33000, 5) (33000,) (11250, 5) (11250,)


In [12]:
# Feature columns that remain in the training matrix.
print(X_train.columns)

Index(['rider_cnt_w_1_new', 'rider_cnt_w_2_new', 'rider_cnt_w_3_new',
       'rider_cnt_w_4_new', 'order_cnt_w_1_new'],
      dtype='object')


### numeric_scale 

In [None]:
# # 입력 변수 
# numeric_cols = ['rider_cnt_w_1', 'rider_cnt_w_2', 'rider_cnt_w_3',
#                 'rider_cnt_w_4', 'order_cnt_w_1', 'order_cnt_w_2', 'order_cnt_w_3',
#                 'order_cnt_w_4']

# # scaler 
# scaler_X = StandardScaler()

# # X_train, X_test
# X_train_scaled = scaler_X.fit_transform(X_train[numeric_cols])
# X_test_scaled = scaler_X.transform(X_test[numeric_cols])

# # 스케일링된 결과를 DataFrame으로 변환
# X_train_scaled = pd.DataFrame(X_train_scaled, columns=numeric_cols, index = X_train.index)
# X_test_scaled = pd.DataFrame(X_test_scaled, columns=numeric_cols, index = X_test.index)

# # 원래의 범주형 변수들을 선택
# categorical_cols = [col for col in X_train.columns if col not in numeric_cols]
# X_train_cat = X_train[categorical_cols]
# X_test_cat = X_test[categorical_cols]

# # 스케일링된 DataFrame과 범주형 변수들을 병합
# X_train_final = pd.concat([X_train_scaled, X_train_cat], axis=1)
# X_test_final = pd.concat([X_test_scaled, X_test_cat], axis=1)



In [None]:
# 예측값
# y_train, y_test
# scaler_y = StandardScaler()
# y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
# y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))
# y_train_scaled = y_train_scaled.ravel()
# y_test_scaled=  y_test_scaled.ravel()

# print(y_train_scaled.shape)
# print(y_test_scaled.shape)

# 3. regression - benchmark model

In [13]:
# Benchmark model: ordinary least squares on the training features.
# statsmodels does not add an intercept by itself, hence add_constant().
X_train_lm = sm.add_constant(X_train)
lr_1 = sm.OLS(endog=y_train, exog=X_train_lm).fit()

print(lr_1.summary())

                            OLS Regression Results                            
Dep. Variable:              rider_cnt   R-squared:                       0.985
Model:                            OLS   Adj. R-squared:                  0.985
Method:                 Least Squares   F-statistic:                 4.313e+05
Date:                Wed, 31 May 2023   Prob (F-statistic):               0.00
Time:                        19:17:06   Log-Likelihood:            -1.4438e+05
No. Observations:               33000   AIC:                         2.888e+05
Df Residuals:                   32994   BIC:                         2.888e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 0.7206      0.21

In [None]:
# Fit a plain linear regression (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Residuals on the test split: actual minus predicted.
y_pred = model.predict(X_test)
residuals = y_test - y_pred

# One scatter plot per feature: residual against the feature value.
for column in X_test.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x=X_test[column], y=residuals, ax=ax)
    ax.set_title(f'Residuals vs. {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Residuals')
    plt.show()




# 4. Machine Learning Modeling

## 4-1. 하이퍼파라미터 튜닝 - Grid Search 

### a. LightGBM model 

In [None]:
# Exhaustive grid search over two LightGBM parameter grids with 10-fold CV.
# Both MSE and MAE are recorded; the best candidate is selected (and refit) by MSE.
classifier = LGBMRegressor()

parameters = [
    {
        'learning_rate': [0.1, 0.05, 0.01, 0.005],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
    },
    {
        'learning_rate': [0.15, 0.125, 0.1, 0.075],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'num_leaves': [16, 32, 64],
    },
]

grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'],
    cv=10,
    n_jobs=-1,
    refit='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# sklearn reports negated errors; flip the sign back for the winning candidate.
cv_table = grid_search.cv_results_
winner = grid_search.best_index_
best_rmse = np.sqrt(-cv_table['mean_test_neg_mean_squared_error'][winner])
best_mae = -cv_table['mean_test_neg_mean_absolute_error'][winner]
best_parameters = grid_search.best_params_
print(f"Best RMSE: {best_rmse:.2f}")
print(f"Best MAE: {best_mae:.2f}")
print("Best Parameters:", best_parameters)

# Results recorded from an earlier run:
# best rmse : 27.91
# best mae : 17.55
# Best Parameters: {'learning_rate': 0.075, 'max_depth': 7, 'n_estimators': 50, 'num_leaves': 64}

In [None]:
def objective(trial):
    """Optuna objective: 10-fold CV RMSE of an LGBMRegressor built from sampled hyper-parameters.

    Args:
        trial: the Optuna trial used to sample learning_rate, n_estimators,
            max_depth and num_leaves.

    Returns:
        Square root of the mean cross-validated MSE on (X_train, y_train).
    """
    search_space = dict(
        learning_rate=trial.suggest_float("learning_rate", 0.005, 0.2),
        n_estimators=trial.suggest_int("n_estimators", 50, 200),
        max_depth=trial.suggest_int("max_depth", 3, 7),
        num_leaves=trial.suggest_int("num_leaves", 16, 64),
    )
    candidate = LGBMRegressor(**search_space)

    # The scorer returns negated MSE per fold; negate the mean before the root.
    neg_mse_per_fold = cross_val_score(
        candidate, X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1
    )
    return np.sqrt(-neg_mse_per_fold.mean())


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print(f"Best RMSE: {study.best_value:.2f}")
print("Best Parameters:", study.best_params)

# Results recorded from an earlier run:
# Best RMSE: 29.84
# Best Parameters: {'learning_rate': 0.08593822799866623, 'n_estimators': 51, 'max_depth': 3, 'num_leaves': 37}

### b. ridge regression

In [None]:
# Ridge regression: grid search over the regularisation strength (10-fold CV, MSE).
ridge = Ridge()
ridge_param_grid = {'alpha': [0.1, 1.0, 2.0, 5.0, 10.0]}
ridge_grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=ridge_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
ridge_grid_search.fit(X_train, y_train)

# best_score_ is a negated MSE, so negate it before taking the root.
print(f"Ridge Best RMSE: {np.sqrt(-ridge_grid_search.best_score_):.2f}")
print("Ridge Best Parameters: ", ridge_grid_search.best_params_)

# Results recorded from an earlier run:
# Ridge Best RMSE: 27.83
# Ridge Best Parameters:  {'alpha': 10.0}

### c. Lasso regression

In [None]:
# Lasso regression: grid search over the regularisation strength (10-fold CV, MSE).
lasso = Lasso(max_iter=10000)
lasso_param_grid = {'alpha': [0.1, 1.0, 2.0, 5.0, 10.0]}
lasso_grid_search = GridSearchCV(
    estimator=lasso,
    param_grid=lasso_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
lasso_grid_search.fit(X_train, y_train)

# best_score_ is a negated MSE, so negate it before taking the root.
print(f"Lasso Best RMSE: {np.sqrt(-lasso_grid_search.best_score_):.2f}")
print("Lasso Best Parameters: ", lasso_grid_search.best_params_)

# Results recorded from an earlier run:
# Lasso Best RMSE: 28.83
# Lasso Best Parameters:  {'alpha': 0.1}

### d. Support vector regressor

In [None]:
# SVR
# svr = SVR()
# svr_param_grid = {'kernel': ['rbf', 'linear', 'poly', 'sigmoid'], 'C': [0.1, 1.0, 10.0], 'gamma': ['scale', 'auto']}
# svr_grid_search = GridSearchCV(estimator=svr, param_grid=svr_param_grid, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
# svr_grid_search.fit(df_X, df_y)
# print("SVR Best RMSE: {:.2f}".format(np.sqrt(-svr_grid_search.best_score_)))
# print("SVR Best Parameters: ", svr_grid_search.best_params_)

### e. Random Forest Regressor

In [None]:
def objective(trial):
    """Optuna objective: 10-fold CV RMSE of a RandomForestRegressor.

    Args:
        trial: the Optuna trial used to sample n_estimators, max_depth and
            min_samples_split.

    Returns:
        Square root of the mean cross-validated MSE on (X_train, y_train).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        # The estimator needs ONE value per trial. Passing the whole list
        # [2, 5, 10] (as a grid-search dict would) is rejected by scikit-learn,
        # so the candidates are sampled through the trial instead.
        "min_samples_split": trial.suggest_categorical("min_samples_split", [2, 5, 10]),
    }

    # Fixed seed (same value as the grid-search forest in this notebook) so that
    # score differences between trials come from the parameters, not from
    # bootstrap randomness.
    model = RandomForestRegressor(random_state=0, **params)

    # The scorer returns negated MSE per fold; negate the mean before the root.
    score = cross_val_score(model, X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
    rmse = np.sqrt(-1 * np.mean(score))

    return rmse


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Best RMSE: {:.2f}".format(study.best_value))
print("Best Parameters:", study.best_params)


In [None]:
# Random forest: grid search over size, depth and split threshold (10-fold CV, MSE).
rfr = RandomForestRegressor(random_state=0)
rfr_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
}
rfr_grid_search = GridSearchCV(
    estimator=rfr,
    param_grid=rfr_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
rfr_grid_search.fit(X_train, y_train)

# best_score_ is a negated MSE, so negate it before taking the root.
print(f"Random Forest Regressor Best RMSE: {np.sqrt(-rfr_grid_search.best_score_):.2f}")
print("Random Forest Regressor Best Parameters: ", rfr_grid_search.best_params_)

# Results recorded from an earlier run:
# Random Forest Regressor Best RMSE: 28.9
# Random Forest Regressor Best Parameters:  {'max_depth': 9, 'min_samples_split': 2, 'n_estimators': 200}


### f. Decision Tree Regressor

In [None]:
# Decision tree: grid search over depth and split threshold (10-fold CV, MSE).
dtr = DecisionTreeRegressor(random_state=0)
dtr_param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
}
dtr_grid_search = GridSearchCV(
    estimator=dtr,
    param_grid=dtr_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
dtr_grid_search.fit(X_train, y_train)

# best_score_ is a negated MSE, so negate it before taking the root.
print(f"Decision Tree Regressor Best RMSE: {np.sqrt(-dtr_grid_search.best_score_):.2f}")
print("Decision Tree Regressor Best Parameters: ", dtr_grid_search.best_params_)

# Results recorded from an earlier run:
# Decision Tree Regressor Best RMSE: 27.85
# Decision Tree Regressor Best Parameters:  {'max_depth': 7, 'min_samples_split': 2}

## 4-2. train, test set 적용 

### a. train, test rmse, mae

In [14]:
def MAPE(y_test, y_pred):
    """Mean absolute percentage error, in percent.

    Args:
        y_test: actual values (list, NumPy array or pandas Series).
        y_pred: predicted values, broadcastable against ``y_test``.

    Returns:
        ``mean(|y_test - y_pred| / |y_test|) * 100`` computed over the
        observations whose actual value is non-zero, or NaN when every actual
        value is zero (the percentage error is undefined there).
    """
    # Convert to plain arrays so lists are accepted and two Series are compared
    # positionally rather than being re-aligned on their index labels.
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Zero targets would produce inf/nan; silence the warnings and mask them out.
    with np.errstate(divide='ignore', invalid='ignore'):
        pct_error = np.abs((actual - predicted) / actual)
    has_nonzero_target = np.broadcast_to(actual != 0, pct_error.shape)

    if not has_nonzero_target.any():
        return float('nan')
    return np.mean(pct_error[has_nonzero_target]) * 100

In [18]:
# Metadata rows (dates, region, flags, target) for exactly the observations in
# X_train / X_test. Selecting by index label keeps these frames row-for-row
# aligned with the feature matrices instead of relying on a second,
# hand-written copy of the date filter.
train_set = data.loc[X_train.index]
test_set = data.loc[X_test.index]


def _metadata_frame(source, columns):
    """Copy `columns` from `source`, renamed to the headers used in the exported CSVs."""
    exported_names = {'rain_group': 'is_rain', 'rider_cnt': 'y_test'}
    return source[columns].rename(columns=exported_names).copy()


def execute_pipeline(X_train, y_train, X_test, y_test):
    """Fit every candidate regressor, score it, and export its predictions.

    For each model the function fits on the training split, predicts both
    splits, saves the fitted model to ``model_<ClassName>.joblib`` and records
    train/test metrics. Predictions are appended to copies of the module-level
    ``train_set`` / ``test_set`` metadata and written to
    ``prediction_results_train_set.csv`` / ``prediction_results_test_set.csv``
    (CP949), together with two simple ensemble averages on the test split.

    Args:
        X_train, y_train: training features and target.
        X_test, y_test: test features and target.

    Returns:
        DataFrame indexed by regressor class name with the columns
        Train RMSE, Train MAE, Train MAPE, Test RMSE, Test MAE, Test MAPE
        and Test R2.
    """
    regressors = [
        LinearRegression(),
        Lasso(alpha=0.1, max_iter=5000),
        LGBMRegressor(learning_rate=0.08593822799866623, max_depth=3, n_estimators=51, num_leaves=37),
        RandomForestRegressor(random_state=0, max_depth=9, min_samples_split=2, n_estimators=200),
    ]

    # Column order below is the column order of the exported CSV files.
    result_train = _metadata_frame(
        train_set,
        ['reg_date', 'day_of_reg', 'pick_rgn2_nm', 'hour_reg',
         'rain_group', 'holiday_yn', 'group_s', 'rider_cnt'],
    )
    result_test = _metadata_frame(
        test_set,
        ['reg_date', 'pick_rgn2_nm', 'hour_reg', 'rain_group',
         'day_of_reg', 'group_s', 'holiday_yn', 'rider_cnt'],
    )

    scores = {}
    predictions = {}

    for reg in regressors:
        reg_name = type(reg).__name__

        # Cross-validation is deliberately not run here: its scores were never
        # part of the returned table, so it only multiplied the fitting time.
        reg.fit(X_train, y_train)
        y_pred_train = reg.predict(X_train)
        y_pred_test = reg.predict(X_test)

        # Persist the fitted model so later cells can reload it.
        joblib.dump(reg, f'model_{reg_name}.joblib')

        scores[reg_name] = {
            'Train RMSE': np.sqrt(mean_squared_error(y_train, y_pred_train)),
            'Train MAE': mean_absolute_error(y_train, y_pred_train),
            'Train MAPE': MAPE(y_train, y_pred_train),
            'Test RMSE': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'Test MAE': mean_absolute_error(y_test, y_pred_test),
            'Test MAPE': MAPE(y_test, y_pred_test),
            'Test R2': r2_score(y_test, y_pred_test),
        }

        result_train[f'y_pred_train_{reg_name}'] = y_pred_train
        result_test[f'y_pred_test_{reg_name}'] = y_pred_test
        predictions[reg_name] = y_pred_test

    # Unweighted ensembles of the test predictions:
    #   avg1 = Lasso + LightGBM + random forest, avg2 = LightGBM + random forest.
    lasso_pred = predictions['Lasso']
    lgbm_pred = predictions['LGBMRegressor']
    rf_pred = predictions['RandomForestRegressor']
    result_test['y_pred_test_avg1'] = (lasso_pred + lgbm_pred + rf_pred) / 3
    result_test['y_pred_test_avg2'] = (lgbm_pred + rf_pred) / 2

    # Export the per-row predictions for the breakdown cells that follow.
    result_train.to_csv('prediction_results_train_set.csv', index=False, encoding="cp949")
    result_test.to_csv('prediction_results_test_set.csv', index=False, encoding="cp949")

    return pd.DataFrame(scores).transpose()


scores_df = execute_pipeline(X_train, y_train, X_test, y_test)
print(scores_df)


                       Train RMSE  Train MAE  Train MAPE  Test RMSE   
LinearRegression        19.226402  12.734823    7.276264  31.635375  \
Lasso                   19.226405  12.734365    7.275994  31.634513   
LGBMRegressor           18.894190  12.732869    7.746841  31.870802   
RandomForestRegressor   15.954123  11.119732    6.454530  31.803560   

                        Test MAE  Test MAPE   Test R2  
LinearRegression       20.823388  10.270980  0.960769  
Lasso                  20.822484  10.270934  0.960772  
LGBMRegressor          20.827192  10.407691  0.960183  
RandomForestRegressor  20.684278  10.188820  0.960351  


### 세분화하여 MAE 값 확인

In [20]:
# Absolute error of every model (and both ensemble averages) per test row,
# then the mean of those errors for each holiday flag.
predict = pd.read_csv('prediction_results_test_set.csv', encoding="cp949")

model_names = ['LinearRegression', 'Lasso', 'LGBMRegressor', 'RandomForestRegressor', 'avg1', 'avg2']
for model_name in model_names:
    predict[f'MAE_{model_name}'] = (predict[f'y_pred_test_{model_name}'] - predict['y_test']).abs()

# The string alias 'mean' is used instead of the np.mean callable, which
# newer pandas versions warn about inside agg().
result = (
    predict.groupby(['holiday_yn'])
    .agg({f'MAE_{model_name}': 'mean' for model_name in model_names})
    .reset_index()
)

print(result)


  holiday_yn  MAE_LinearRegression  MAE_Lasso  MAE_LGBMRegressor   
0          N             19.892954  19.891887          19.826622  \
1          Y             22.994399  22.993878          23.161853   

   MAE_RandomForestRegressor   MAE_avg1   MAE_avg2  
0                  19.667679  19.501239  19.463883  
1                  23.056341  22.968814  23.018640  


In [21]:
# Absolute error of every model (and both ensemble averages) per test row,
# then the mean of those errors for each `group_s` value.
predict = pd.read_csv('prediction_results_test_set.csv', encoding="cp949")

model_names = ['LinearRegression', 'Lasso', 'LGBMRegressor', 'RandomForestRegressor', 'avg1', 'avg2']
for model_name in model_names:
    predict[f'MAE_{model_name}'] = (predict[f'y_pred_test_{model_name}'] - predict['y_test']).abs()

# The string alias 'mean' is used instead of the np.mean callable, which
# newer pandas versions warn about inside agg().
result = (
    predict.groupby(['group_s'])
    .agg({f'MAE_{model_name}': 'mean' for model_name in model_names})
    .reset_index()
)

print(result)


  group_s  MAE_LinearRegression  MAE_Lasso  MAE_LGBMRegressor   
0       A             19.695271  19.694492          19.581014  \
1       B             23.846623  23.839780          24.738797   
2       D             25.953448  25.952061          26.136753   
3       E             22.431017  22.429824          22.609394   
4       F             22.947020  22.947906          23.090985   

   MAE_RandomForestRegressor   MAE_avg1   MAE_avg2  
0                  19.433423  19.266407  19.214868  
1                  24.352806  24.197884  24.444185  
2                  25.446537  25.745324  25.725719  
3                  22.096825  22.288626  22.266485  
4                  23.858802  23.176957  23.369873  


### 세분화하여 MAPE 값 확인

In [23]:
# Absolute percentage error of every model (and both ensemble averages) per
# test row, then the mean for each (rain group, holiday flag) combination.
predict = pd.read_csv('prediction_results_test_set.csv', encoding="cp949")

model_names = ['LinearRegression', 'Lasso', 'LGBMRegressor', 'RandomForestRegressor', 'avg1', 'avg2']
for model_name in model_names:
    relative_error = (predict[f'y_pred_test_{model_name}'] - predict['y_test']) / predict['y_test']
    predict[f'MAPE_{model_name}'] = relative_error.abs() * 100

# The string alias 'mean' is used instead of the np.mean callable, which
# newer pandas versions warn about inside agg().
result = (
    predict.groupby(['is_rain', 'holiday_yn'])
    .agg({f'MAPE_{model_name}': 'mean' for model_name in model_names})
    .reset_index()
)

print(result)


  is_rain holiday_yn  MAPE_LinearRegression  MAPE_Lasso  MAPE_LGBMRegressor   
0      no          N              10.525525   10.525494           10.676970  \
1      no          Y               9.188866    9.188555            9.183857   
2  normal          Y              11.769056   11.770800           12.451111   
3    weak          N              14.076375   14.079295           14.601694   
4    weak          Y               9.960339    9.960009           10.087638   

   MAPE_RandomForestRegressor  MAPE_avg1  MAPE_avg2  
0                   10.373349  10.275223  10.278640  
1                    9.129719   9.114364   9.109426  
2                   12.409498  12.186508  12.407299  
3                   14.494997  14.391995  14.548345  
4                   10.153642   9.968042  10.046717  


In [24]:
# Absolute error of every model (and both ensemble averages) per test row,
# then the mean for each hour of the day. Only a subset of the models is
# reported in this table.
predict = pd.read_csv('prediction_results_test_set.csv', encoding="cp949")

model_names = ['LinearRegression', 'Lasso', 'LGBMRegressor', 'RandomForestRegressor', 'avg1', 'avg2']
for model_name in model_names:
    predict[f'MAE_{model_name}'] = (predict[f'y_pred_test_{model_name}'] - predict['y_test']).abs()

# Models shown in the hourly table (Lasso and avg2 are left out).
reported_models = ['LinearRegression', 'LGBMRegressor', 'RandomForestRegressor', 'avg1']

# The string alias 'mean' is used instead of the np.mean callable, which
# newer pandas versions warn about inside agg().
result = (
    predict.groupby(['hour_reg'])
    .agg({f'MAE_{model_name}': 'mean' for model_name in reported_models})
    .reset_index()
)

print(result)



    hour_reg  MAE_LinearRegression  MAE_LGBMRegressor   
0          9              7.981128           8.377598  \
1         10             14.362534          14.174430   
2         11             22.438103          22.519827   
3         12             26.070571          26.058307   
4         13             23.276154          23.024150   
5         14             18.109090          17.831404   
6         15             25.182044          24.732254   
7         16             29.922451          29.740573   
8         17             25.413069          25.548596   
9         18             24.528547          25.241206   
10        19             25.539450          26.092858   
11        20             24.391195          24.474336   
12        21             18.657794          18.577984   
13        22             14.914803          14.739810   
14        23             11.563883          11.274541   

    MAE_RandomForestRegressor   MAE_avg1  
0                    8.099452   7.875542  
1

In [25]:
# Absolute percentage error of every model (and both ensemble averages) per
# test row, then the mean for each hour of the day. Only a subset of the
# models is reported in this table.
predict = pd.read_csv('prediction_results_test_set.csv', encoding="cp949")

model_names = ['LinearRegression', 'Lasso', 'LGBMRegressor', 'RandomForestRegressor', 'avg1', 'avg2']
for model_name in model_names:
    relative_error = (predict[f'y_pred_test_{model_name}'] - predict['y_test']) / predict['y_test']
    predict[f'MAPE_{model_name}'] = relative_error.abs() * 100

# Models shown in the hourly table (the linear models and avg2 are left out).
reported_models = ['LGBMRegressor', 'RandomForestRegressor', 'avg1']

# The string alias 'mean' is used instead of the np.mean callable, which
# newer pandas versions warn about inside agg().
result = (
    predict.groupby(['hour_reg'])
    .agg({f'MAPE_{model_name}': 'mean' for model_name in reported_models})
    .reset_index()
)

print(result)

    hour_reg  MAPE_LGBMRegressor  MAPE_RandomForestRegressor  MAPE_avg1
0          9           18.533187                   15.315426  15.406791
1         10           11.723156                   11.795857  11.678733
2         11            8.668042                    8.487917   8.527072
3         12            9.327791                    9.183245   9.118059
4         13           10.330048                   10.214789  10.061526
5         14           10.841223                   10.562843  10.551633
6         15           14.030200                   14.133227  13.945063
7         16           13.780959                   13.776001  13.631230
8         17            8.702517                    8.721177   8.646823
9         18            6.664656                    6.583648   6.559165
10        19            6.783724                    6.801368   6.703643
11        20            7.913888                    7.948906   7.861540
12        21            8.845167                    8.832365   8

In [None]:
# Absolute percentage error of every model (and both ensemble averages) per
# test row, then the mean for each `group_s` value.
predict = pd.read_csv('prediction_results_test_set.csv', encoding="cp949")

model_names = ['LinearRegression', 'Lasso', 'LGBMRegressor', 'RandomForestRegressor', 'avg1', 'avg2']
for model_name in model_names:
    relative_error = (predict[f'y_pred_test_{model_name}'] - predict['y_test']) / predict['y_test']
    predict[f'MAPE_{model_name}'] = relative_error.abs() * 100

# The string alias 'mean' is used instead of the np.mean callable, which
# newer pandas versions warn about inside agg().
result = (
    predict.groupby(['group_s'])
    .agg({f'MAPE_{model_name}': 'mean' for model_name in model_names})
    .reset_index()
)

print(result)


### 변수 중요도 파악 

### permutation importance -> RandomForest, LGBM

In [None]:
# permutation importance 

def calculate_permutation_importance(model, X_train, y_train, X_test, y_test,
                                     n_repeats=5, random_state=42,
                                     output_path='feature_importances_lgbm.csv'):
    """Tabulate permutation importance on the train and test splits.

    Parameters
    ----------
    model : fitted estimator
        Passed unchanged to ``sklearn.inspection.permutation_importance``.
    X_train, y_train : training features and target
        ``X_train`` must expose ``.columns``; those names label the rows.
    X_test, y_test : held-out features and target
        Assumed to share ``X_train``'s column order, because the test
        importances are labelled with ``X_train.columns``.
    n_repeats : int, default 5
        Number of shuffles per feature.
    random_state : int, default 42
        Seed forwarded to ``permutation_importance``.
    output_path : str or None, default 'feature_importances_lgbm.csv'
        Destination of the cp949-encoded CSV copy of the table.
        Pass ``None`` to skip writing a file.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``perm_importance_train`` and
        ``perm_importance_test``, sorted by the train importance, descending.
    """
    def _mean_importances(X, y):
        # Mean importance per feature over the repeats, as a plain list.
        outcome = permutation_importance(
            model, X, y, n_repeats=n_repeats, random_state=random_state
        )
        return outcome.importances_mean.tolist()

    # Train is evaluated before test (dict values are built in order).
    feature_importances = pd.DataFrame({
        'feature': X_train.columns,
        'perm_importance_train': _mean_importances(X_train, y_train),
        'perm_importance_test': _mean_importances(X_test, y_test),
    }).sort_values(by='perm_importance_train', ascending=False)

    if output_path is not None:
        feature_importances.to_csv(output_path, index=False, encoding="cp949")

    return feature_importances

# Restore the persisted LightGBM regressor from disk.
lgbm_model = joblib.load('model_LGBMRegressor.joblib')

# Score every feature on both the train and the test split.
feature_importances = calculate_permutation_importance(
    model=lgbm_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

print(feature_importances)

In [None]:
# permutation importance 

def calculate_permutation_importance(model, X_train, y_train, X_test, y_test,
                                     n_repeats=5, random_state=42,
                                     output_path='feature_importances_Rf.csv'):
    """Tabulate permutation importance on the train and test splits.

    NOTE: this redefinition replaces the same-named function from the previous
    cell; only the default output file name differs.

    Parameters
    ----------
    model : fitted estimator
        Passed unchanged to ``sklearn.inspection.permutation_importance``.
    X_train, y_train : training features and target
        ``X_train`` must expose ``.columns``; those names label the rows.
    X_test, y_test : held-out features and target
        Assumed to share ``X_train``'s column order, because the test
        importances are labelled with ``X_train.columns``.
    n_repeats : int, default 5
        Number of shuffles per feature.
    random_state : int, default 42
        Seed forwarded to ``permutation_importance``.
    output_path : str or None, default 'feature_importances_Rf.csv'
        Destination of the cp949-encoded CSV copy of the table.
        Pass ``None`` to skip writing a file.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``perm_importance_train`` and
        ``perm_importance_test``, sorted by the train importance, descending.
    """
    def _mean_importances(X, y):
        # Mean importance per feature over the repeats, as a plain list.
        outcome = permutation_importance(
            model, X, y, n_repeats=n_repeats, random_state=random_state
        )
        return outcome.importances_mean.tolist()

    # Train is evaluated before test (dict values are built in order).
    feature_importances = pd.DataFrame({
        'feature': X_train.columns,
        'perm_importance_train': _mean_importances(X_train, y_train),
        'perm_importance_test': _mean_importances(X_test, y_test),
    }).sort_values(by='perm_importance_train', ascending=False)

    if output_path is not None:
        feature_importances.to_csv(output_path, index=False, encoding="cp949")

    return feature_importances

# Load the persisted random-forest regressor under its own name, so that the
# LightGBM model already held in `lgbm_model` is not silently replaced by a
# different estimator.
rf_model = joblib.load('model_RandomForestRegressor.joblib')

feature_importances = calculate_permutation_importance(rf_model, X_train, y_train, X_test, y_test)

print(feature_importances)

In [None]:
# Restore the fitted Lasso model and label each coefficient with its feature name.
lasso_model = joblib.load('model_Lasso.joblib')
lasso_coef = pd.Series(data=lasso_model.coef_, index=X_train.columns)

# Keep only the features whose coefficient is not exactly zero.
has_nonzero_coef = lasso_coef.ne(0)
selected_feats_lasso = lasso_coef.loc[has_nonzero_coef].index

print("Number of features selected by Lasso: ", len(selected_feats_lasso))
print("Features selected by Lasso: ", selected_feats_lasso)




In [None]:
def plot_feature_importances(model, model_name, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's built-in feature importances.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    model_name : str
        Shown in the plot title so figures for different models can be told apart.
    feature_names : sequence of str, optional
        Labels for the importances, in the model's feature order. When omitted,
        the notebook-level ``X_train.columns`` is used.
    """
    if feature_names is None:
        feature_names = X_train.columns

    feature_imp = pd.Series(
        model.feature_importances_, index=feature_names
    ).sort_values(ascending=False)

    plt.figure(figsize=(20, 15))
    sns.barplot(x=feature_imp, y=feature_imp.index)
    plt.xlabel('Feature Importance Score')
    plt.ylabel('Features')
    plt.title(f"Visualizing Important Features ({model_name})")
    # No plt.legend() here: the bars carry no labels, so the call drew nothing
    # and only emitted a "No artists with labels found" warning.
    plt.show()

# Reload both persisted tree ensembles, then chart each one's built-in importances.
lgbm_model = joblib.load('model_LGBMRegressor.joblib')
rf_model = joblib.load('model_RandomForestRegressor.joblib')

plot_feature_importances(model=lgbm_model, model_name='LGBMRegressor')
plot_feature_importances(model=rf_model, model_name='RandomForestRegressor')
