# 1. dataset 

In [73]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time 

# sklearn 관련
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import ast 
import statsmodels.api as sm 
from time import time

# 전처리 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# lightgbm 관련
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

In [74]:
# Load the combined dataset; the file is read with the cp949 codec.
data = pd.read_csv(
    'combined_data.csv',
    encoding="cp949",
)
data.head()
# Shape recorded on the last run: (186000, 27).
print(data.shape)

(186000, 27)


In [75]:
# Column dtypes and non-null counts (used to check for null values).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
data.info()

# Summary statistics (used to check for outliers).
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186000 entries, 0 to 185999
Data columns (total 27 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   pick_rgn2_nm   186000 non-null  object 
 1   rider_cnt      186000 non-null  int64  
 2   order_cnt      186000 non-null  int64  
 3   datetime       186000 non-null  object 
 4   hour_reg       186000 non-null  int64  
 5   reg_date       186000 non-null  object 
 6   day_of_reg     186000 non-null  object 
 7   temp_c         186000 non-null  float64
 8   rain_c         186000 non-null  float64
 9   snow_c         186000 non-null  float64
 10  is_rain        186000 non-null  int64  
 11  month          186000 non-null  int64  
 12  week           186000 non-null  int64  
 13  is_holiday     186000 non-null  int64  
 14  q1             186000 non-null  float64
 15  q3             186000 non-null  float64
 16  IQR1.5         186000 non-null  float64
 17  outlier        186000 non-nul

In [76]:
# Convert both timestamp columns to datetime64 and order the rows by `datetime`.
data = data.assign(
    datetime=pd.to_datetime(data["datetime"]),
    reg_date=pd.to_datetime(data["reg_date"]),
).sort_values("datetime")

In [77]:
# Columns removed from the frame before modelling.
dropped_cols = [
    'rider_cnt', 'order_cnt',
    'temp_c', 'rain_c', 'snow_c',
    'q1', 'q3', 'IQR1.5', 'outlier',
]
data = data.drop(dropped_cols, axis=1)
print(data.head())

   pick_rgn2_nm            datetime  hour_reg   reg_date day_of_reg  is_rain   
0           강남구 2022-01-01 09:00:00         9 2022-01-01        토요일        0  \
24          중랑구 2022-01-01 09:00:00         9 2022-01-01        토요일        0   
23           중구 2022-01-01 09:00:00         9 2022-01-01        토요일        0   
22          종로구 2022-01-01 09:00:00         9 2022-01-01        토요일        0   
21          은평구 2022-01-01 09:00:00         9 2022-01-01        토요일        0   

    month  week  is_holiday  rider_cnt_2  rider_cnt_w_1  rider_cnt_w_2   
0       1     1           1        215.0            NaN            NaN  \
24      1     1           1         25.0            NaN            NaN   
23      1     1           1         21.0            NaN            NaN   
22      1     1           1         23.0            NaN            NaN   
21      1     1           1         53.0            NaN            NaN   

    rider_cnt_w_3  rider_cnt_w_4  order_cnt_w_1  order_cnt_w_2  order_cnt_

In [78]:
# Keep only the rows whose `rider_cnt_w_4` value is present.
has_w4 = data['rider_cnt_w_4'].notna()
data = data[has_w4]
data.isna().sum()
# Shape recorded on the last run: (165125, 18).
print(data.shape)

(165125, 18)


In [79]:
# Columns treated as categorical: region, hour, weekday, rain flag, month,
# week and holiday flag.
categorical_cols = ['pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'is_rain',
                    'month', 'week', 'is_holiday']
data = data.astype({name: 'category' for name in categorical_cols})

print(data.dtypes)


pick_rgn2_nm           category
datetime         datetime64[ns]
hour_reg               category
reg_date         datetime64[ns]
day_of_reg             category
is_rain                category
month                  category
week                   category
is_holiday             category
rider_cnt_2             float64
rider_cnt_w_1           float64
rider_cnt_w_2           float64
rider_cnt_w_3           float64
rider_cnt_w_4           float64
order_cnt_w_1           float64
order_cnt_w_2           float64
order_cnt_w_3           float64
order_cnt_w_4           float64
dtype: object


# 2. 데이터 전처리 

## 2-1. one-hot-encoding

In [None]:
# ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
# X = np.array(ct.fit_transform(X))

In [80]:
# Categorical columns to expand into indicator columns.
var = ['pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'is_rain', 'month', 'week', 'is_holiday']

encoder = OneHotEncoder()
encoded = encoder.fit_transform(data[var])

# Densify the encoder output and keep the original row index so the indicator
# columns line up with the remaining columns when concatenated.
onehot = pd.DataFrame(
    encoded.toarray(),
    index=data.index,
    columns=encoder.get_feature_names_out(var),
)
remaining = data.drop(columns=var)
df = pd.concat([onehot, remaining], axis=1)
print(df.head(3))


       pick_rgn2_nm_강남구  pick_rgn2_nm_강동구  pick_rgn2_nm_강북구  pick_rgn2_nm_강서구   
10517               0.0               0.0               0.0               0.0  \
10514               0.0               0.0               0.0               0.0   
10515               0.0               0.0               0.0               0.0   

       pick_rgn2_nm_관악구  pick_rgn2_nm_광진구  pick_rgn2_nm_구로구  pick_rgn2_nm_금천구   
10517               0.0               0.0               0.0               0.0  \
10514               0.0               0.0               0.0               0.0   
10515               0.0               0.0               0.0               0.0   

       pick_rgn2_nm_노원구  pick_rgn2_nm_도봉구  ...   reg_date  rider_cnt_2   
10517               0.0               0.0  ... 2022-01-29        132.0  \
10514               0.0               0.0  ... 2022-01-29         63.0   
10515               0.0               0.0  ... 2022-01-29         39.0   

       rider_cnt_w_1  rider_cnt_w_2  rider_cnt_w_3  r

## 2-2. numeric variable scale 

In [None]:
# Scaling of the numeric variables.
# Only the scaler object is created here; the fit_transform step below is
# commented out, so no column is actually rescaled in this cell.
scaler = StandardScaler()  # rescales to mean 0, variance 1
#scaler = MinMaxScaler()

# num_vars = ['rider_cnt_2', 'rider_cnt_w_1', 'rider_cnt_w_2', 'rider_cnt_w_3',
#             'rider_cnt_w_4', 'order_cnt_w_1', 'order_cnt_w_2', 'order_cnt_w_3',
#             'order_cnt_w_4']
# df[num_vars] = scaler.fit_transform(df[num_vars])

# print(df.head(3))


# 3. train/test set split

In [81]:
# Chronological train/test split on `datetime` (the random train_test_split
# alternative is not used).
#
# NOTE(review): the string '2022-12-31' is compared as midnight of that day,
# so rows timestamped later on 2022-12-31 appear to fall in NEITHER split
# (recorded output: 116050 + 48700 = 164750 rows, versus 165125 rows before
# the split). Confirm whether `< '2023-01-01'` was intended. Any later cell
# that rebuilds the same split from `data` must use identical bounds, so
# change them together.
date_cols = ['datetime', 'reg_date']
in_train = df["datetime"] <= '2022-12-31'
in_test = df["datetime"] >= '2023-01-01'

# The timestamp columns are only needed for the split, not as features.
df_train = df.loc[in_train].drop(columns=date_cols)
df_test = df.loc[in_test].drop(columns=date_cols)

# Shapes recorded on the last run: (116050, 77) (48700, 77).
print(df_train.shape, df_test.shape)

(116050, 77) (48700, 77)


In [82]:
# Separate the target column from the feature columns in each split.
target_col = 'rider_cnt_2'

y_train = df_train[target_col]
X_train = df_train.drop(columns=[target_col])

y_test = df_test[target_col]
X_test = df_test.drop(columns=[target_col])

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(116050, 76) (116050,) (48700, 76) (48700,)


# 3. regression - benchmark model

In [None]:
# Benchmark: ordinary least squares fitted with statsmodels.
# add_constant() is applied to the feature matrix before the fit.
X_train_lm = sm.add_constant(X_train)
ols_spec = sm.OLS(y_train, X_train_lm)
lr_1 = ols_spec.fit()

print(lr_1.summary())

# 4. Machine Learning Modeling

## 4-1. cross-validation 

In [None]:
def my_regressor(X_train, y_train, scoring=('neg_mean_squared_error', 'neg_mean_absolute_error')):
    """Cross-validate a fixed set of regressors and rank them by RMSE.

    Parameters
    ----------
    X_train, y_train
        Features and target handed to ``cross_validate`` (10 folds).
    scoring
        Scorer names forwarded to ``cross_validate``. Both default scorers
        must be present, because their per-fold results are read below.

    Returns
    -------
    pandas.DataFrame
        One row per estimator (indexed by estimator class name) with the
        columns 'RMSE Score' and 'MAE Score', sorted by ascending RMSE.
    """
    models = [
        LinearRegression(),
        Ridge(alpha=1.0),
        Lasso(alpha=0.1, max_iter=10000),
        LGBMRegressor(learning_rate=0.15, max_depth=7, n_estimators=200, num_leaves=64),
        RandomForestRegressor(n_estimators=10, random_state=0),
        DecisionTreeRegressor(random_state=0),
    ]

    names = []
    rows = []
    for model in models:
        cv_out = cross_validate(model, X_train, y_train, scoring=scoring, cv=10, n_jobs=-1)
        # The scorers are negated errors, so flip the sign back. RMSE is the
        # square root of the fold-averaged MSE.
        rmse_score = np.sqrt(-np.mean(cv_out['test_neg_mean_squared_error']))
        mae_score = -np.mean(cv_out['test_neg_mean_absolute_error'])
        names.append(type(model).__name__)
        rows.append([rmse_score, mae_score])

    # Build the table in one shot (numeric dtypes) instead of enlarging an
    # empty frame row by row, then rank with a stable sort so ties keep the
    # order in which the models were evaluated.
    result = pd.DataFrame(rows, index=names, columns=['RMSE Score', 'MAE Score'])
    return result.sort_values('RMSE Score', kind='stable')

result_df = my_regressor(X_train, y_train)
print(result_df)

## 4-2. 하이퍼파라미터 튜닝 - Grid Search 

### a. LightGBM model 

In [None]:
# Estimator to tune (a regressor, despite the variable name).
classifier = LGBMRegressor()

# Two sub-grids: the second one additionally varies `num_leaves`.
parameters = [
    {
        'learning_rate': [0.1, 0.05, 0.01, 0.005],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
    },
    {
        'learning_rate': [0.15, 0.125, 0.1, 0.075],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'num_leaves': [16, 32, 64],
    },
]

# Both metrics are recorded; the best candidate is chosen (and refit) on MSE.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'],
    cv=10,
    n_jobs=-1,
    refit='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# Read both metrics at the winning candidate and undo the sign flip of the
# negated scorers.
cv_table = grid_search.cv_results_
winner = grid_search.best_index_
best_rmse = np.sqrt(-cv_table['mean_test_neg_mean_squared_error'][winner])
best_mae = -cv_table['mean_test_neg_mean_absolute_error'][winner]
best_parameters = grid_search.best_params_
print("Best RMSE: {:.2f}".format(best_rmse))
print("Best MAE: {:.2f}".format(best_mae))
print("Best Parameters:", best_parameters)

# Result recorded on the last run:
# best rmse : 24.87
# best mae : 16.59
# Best Parameters: {'learning_rate': 0.15, 'max_depth': 7, 'n_estimators': 50, 'num_leaves': 64}

### b. ridge regression

In [None]:
# The searches below run on a fixed-seed 50% sample of the training rows.
sample_size = len(X_train) // 2
df_X = X_train.sample(n=sample_size, random_state=0)
# Take the targets of exactly the sampled rows, matched by index label.
df_y = y_train.loc[df_X.index]

# Ridge regression: search over the regularisation strength.
ridge = Ridge()
ridge_param_grid = {'alpha': [0.1, 1.0, 10.0]}
ridge_grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=ridge_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
ridge_grid_search.fit(df_X, df_y)
ridge_best_rmse = np.sqrt(-ridge_grid_search.best_score_)
print("Ridge Best RMSE: {:.2f}".format(ridge_best_rmse))
print("Ridge Best Parameters: ", ridge_grid_search.best_params_)

# Result recorded on the last run:
# Ridge Best RMSE: 23.72
# Ridge Best Parameters:  {'alpha': 1.0}

### c. Lasso regression

In [None]:
# Lasso regression: search over the regularisation strength on the sampled
# training rows (`df_X`, `df_y`).
lasso = Lasso(max_iter=10000)
lasso_param_grid = {'alpha': [0.1, 1.0, 10.0]}
lasso_grid_search = GridSearchCV(
    estimator=lasso,
    param_grid=lasso_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
lasso_grid_search.fit(df_X, df_y)
lasso_best_rmse = np.sqrt(-lasso_grid_search.best_score_)
print("Lasso Best RMSE: {:.2f}".format(lasso_best_rmse))
print("Lasso Best Parameters: ", lasso_grid_search.best_params_)

# Result recorded on the last run:
# Lasso Best RMSE: 24.19
# Lasso Best Parameters:  {'alpha': 0.1}

### d. Support vector regressor

In [None]:
# SVR
# svr = SVR()
# svr_param_grid = {'kernel': ['rbf', 'linear', 'poly', 'sigmoid'], 'C': [0.1, 1.0, 10.0], 'gamma': ['scale', 'auto']}
# svr_grid_search = GridSearchCV(estimator=svr, param_grid=svr_param_grid, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
# svr_grid_search.fit(df_X, df_y)
# print("SVR Best RMSE: {:.2f}".format(np.sqrt(-svr_grid_search.best_score_)))
# print("SVR Best Parameters: ", svr_grid_search.best_params_)

### e. Random Forest Regressor

In [None]:
# Random forest: search over forest size, tree depth and split threshold on
# the sampled training rows (`df_X`, `df_y`).
rfr = RandomForestRegressor(random_state=0)
rfr_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
}
rfr_grid_search = GridSearchCV(
    estimator=rfr,
    param_grid=rfr_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
rfr_grid_search.fit(df_X, df_y)
rfr_best_rmse = np.sqrt(-rfr_grid_search.best_score_)
print("Random Forest Regressor Best RMSE: {:.2f}".format(rfr_best_rmse))
print("Random Forest Regressor Best Parameters: ", rfr_grid_search.best_params_)

### f. Decision Tree Regressor

In [None]:
# Decision tree: search over tree depth and split threshold on the sampled
# training rows (`df_X`, `df_y`).
dtr = DecisionTreeRegressor(random_state=0)
dtr_param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
}
dtr_grid_search = GridSearchCV(
    estimator=dtr,
    param_grid=dtr_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
dtr_grid_search.fit(df_X, df_y)
dtr_best_rmse = np.sqrt(-dtr_grid_search.best_score_)
print("Decision Tree Regressor Best RMSE: {:.2f}".format(dtr_best_rmse))
print("Decision Tree Regressor Best Parameters: ", dtr_grid_search.best_params_)


## 4-3. train, test set 적용 

### a. train, test rmse, mae

In [85]:
# Un-encoded metadata rows for each split. They are selected through the
# feature matrices' own index, so they always cover exactly the rows being
# modelled and there is no second copy of the date filter to keep in sync
# with the train/test split.
train_set = data.loc[X_train.index]
test_set = data.loc[X_test.index]

def execute_pipeline(X_train, y_train, X_test, y_test):
    """Cross-validate, fit and score each regressor, and export predictions.

    For every regressor this runs 10-fold cross-validation on the training
    data, refits on the full training data, and computes RMSE / MAE on both
    the training and the test split.

    Reads the module-level frames ``train_set`` and ``test_set`` for the
    descriptive columns that accompany each prediction.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : test features and target.

    Returns
    -------
    pandas.DataFrame
        One row per regressor (indexed by class name) with the columns
        'CV RMSE Mean', 'CV RMSE Std', 'Train RMSE', 'Train MAE',
        'Test RMSE' and 'Test MAE'.

    Side effects
    ------------
    Writes 'prediction_results_train_set.csv' and
    'prediction_results_test_set.csv' (cp949 encoded, no index column).
    """
    regressors = [
        LinearRegression(),
        # Ridge(alpha = 1.0),
        # Lasso(alpha = 0.1, max_iter=10000),
        # RandomForestRegressor(n_estimators=10, random_state=0),
        # DecisionTreeRegressor(random_state=0),
        LGBMRegressor(learning_rate=0.15, max_depth=7, n_estimators=200, num_leaves=64)
    ]

    meta_cols = ['datetime', 'pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'is_rain', 'is_holiday']

    # Pull the metadata in the row order of X_train / X_test. The prediction
    # arrays returned by predict() follow that same order, so attaching them
    # as plain arrays below keeps every prediction on its own row.
    # The target column is named 'y_test' in both files (existing CSV schema).
    result_train = train_set.loc[X_train.index, meta_cols].copy()
    result_train['y_test'] = y_train
    result_test = test_set.loc[X_test.index, meta_cols].copy()
    result_test['y_test'] = y_test

    scores = {}
    for reg in regressors:
        reg_name = reg.__class__.__name__

        # The scorer returns negated RMSE per fold; flip the sign for the mean.
        cv_scores = cross_val_score(reg, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error')
        mean_rmse = -1.0 * np.mean(cv_scores)
        std_rmse = np.std(cv_scores)

        reg.fit(X_train, y_train)
        y_pred_train = reg.predict(X_train)
        y_pred_test = reg.predict(X_test)

        scores[reg_name] = {
            'CV RMSE Mean': mean_rmse,
            'CV RMSE Std': std_rmse,
            'Train RMSE': np.sqrt(mean_squared_error(y_train, y_pred_train)),
            'Train MAE': mean_absolute_error(y_train, y_pred_train),
            'Test RMSE': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'Test MAE': mean_absolute_error(y_test, y_pred_test)
        }
        result_train[f'y_pred_train_{reg_name}'] = y_pred_train
        result_test[f'y_pred_test_{reg_name}'] = y_pred_test

    scores_df = pd.DataFrame(scores).transpose()

    result_train.to_csv('prediction_results_train_set.csv', index=False, encoding="cp949")
    result_test.to_csv('prediction_results_test_set.csv', index=False, encoding="cp949")

    return scores_df

# Example usage
scores_df = execute_pipeline(X_train, y_train, X_test, y_test)
print(scores_df)


                  CV RMSE Mean  CV RMSE Std  Train RMSE  Train MAE  Test RMSE   
LinearRegression     25.420777     5.225326   23.641602  16.015795  22.412126  \
LGBMRegressor        24.629457     4.606525   15.935964  11.480302  22.795690   

                   Test MAE  
LinearRegression  16.016783  
LGBMRegressor     15.501312  


## 다음주 6일 예측하기

In [None]:
# Load the prediction dataset. This rebinds `data`, replacing the frame that
# was loaded from 'combined_data.csv' earlier in the notebook.
data = pd.read_csv(
    'predict_data.csv',
    encoding="cp949",
)
# Row count noted by the author: 188,250.
data.head()

In [None]:
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would add a stray "None" line to the output.
data.info()

In [None]:
# Only `reg_date` is converted here (the `datetime` conversion is disabled);
# the rows are then ordered by it.
data = data.assign(reg_date=pd.to_datetime(data["reg_date"])).sort_values("reg_date")

In [None]:
# Keep only the rows whose `rider_cnt_w_4` value is present.
has_w4 = data['rider_cnt_w_4'].notna()
data = data[has_w4]
data.isna().sum()
# Row count noted by the author: 167,375.
print(data.shape)

In [None]:
# Columns treated as categorical: region, hour, weekday, rain flag, month,
# week and holiday flag.
categorical_cols = ['pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'is_rain',
                    'month', 'week', 'is_holiday']
data = data.astype({name: 'category' for name in categorical_cols})

print(data.dtypes)

In [None]:
# Categorical columns to expand into indicator columns.
var = ['pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'is_rain', 'month', 'week', 'is_holiday']

data = data.sort_values("reg_date")

# The encoder is fitted on the whole frame, i.e. before the train/test split.
encoder = OneHotEncoder()
encoded = encoder.fit_transform(data[var])

# Densify the encoder output and keep the original row index so the indicator
# columns line up with the remaining columns when concatenated.
onehot = pd.DataFrame(
    encoded.toarray(),
    index=data.index,
    columns=encoder.get_feature_names_out(var),
)
remaining = data.drop(columns=var)
df = pd.concat([onehot, remaining], axis=1)
print(df.head(3))


### train/test set split 

In [None]:
# Rows dated before the cutoff train the models; rows on or after it are the
# ones to predict. `reg_date` is only needed for this split.
cutoff_date = '2023-05-16'
before_cutoff = df["reg_date"] < cutoff_date
from_cutoff = df["reg_date"] >= cutoff_date

df_train = df.loc[before_cutoff].drop(columns=['reg_date'])
df_test = df.loc[from_cutoff].drop(columns=['reg_date'])

# Row counts noted by the author: 165,125 and 2,250.
print(df_train.shape, df_test.shape)

In [None]:
# Separate the target from the features. No `y_test` is built for the rows
# that are to be predicted.
target_col = 'rider_cnt_2'

y_train = df_train[target_col]
X_train = df_train.drop(columns=[target_col])

X_test = df_test.drop(columns=[target_col])

print(X_train.shape, y_train.shape, X_test.shape)

In [None]:

# Number of missing values in the training target.
# (The same check can be applied to X_train or X_test.)
y_train.isnull().sum()

In [None]:
train_set = data[data["reg_date"] < '2023-05-16']
train_set = train_set.sort_values(by="reg_date")

def get_predict(X_train, y_train, X_test):
    """Fit each regressor and export its predictions for the TRAINING rows.

    For every regressor the CSV gets the fitted values plus an approximate
    95% band around them.

    Reads the module-level frame ``train_set`` for the descriptive columns.

    NOTE(review): another cell in this notebook defines a function with the
    same name; whichever cell ran last is the one a later call resolves to.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test : accepted for signature compatibility; not used here.

    Returns
    -------
    None. Writes 'prediction_results_total_train.csv' (cp949, no index).
    """
    regressors = [
        LinearRegression(),
        LGBMRegressor(learning_rate=0.15, max_depth=7, n_estimators=200, num_leaves=64)
    ]

    meta_cols = ['reg_date', 'pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'is_rain', 'is_holiday']

    # Pull the metadata in X_train's row order. `train_set` was re-sorted
    # above and the default sort is not stable, so its row order is not
    # guaranteed to match X_train; predict() output follows X_train.
    # The target column is named 'y_test' (existing CSV schema).
    result_df = train_set.loc[X_train.index, meta_cols].copy()
    result_df['y_test'] = y_train

    y_true = np.asarray(y_train, dtype=float)

    for reg in regressors:
        reg_name = reg.__class__.__name__
        reg.fit(X_train, y_train)
        # Column names keep the 'y_pred_test_' prefix for compatibility even
        # though these are predictions on the training rows.
        y_pred = reg.predict(X_train)

        # Half-width of the band: 1.96 x the standard deviation of the
        # residuals (normal approximation). The spread of the predictions
        # themselves says nothing about model error, so it is not used.
        # In-sample residuals tend to understate out-of-sample error, so the
        # band is, if anything, too narrow.
        margin = 1.96 * np.std(y_true - y_pred)

        result_df[f'y_pred_test_{reg_name}'] = y_pred
        result_df[f'lower_bound_{reg_name}'] = y_pred - margin
        result_df[f'upper_bound_{reg_name}'] = y_pred + margin

    result_df.to_csv('prediction_results_total_train.csv', index=False, encoding="cp949")

get_predict(X_train, y_train, X_test)

In [None]:
test_set = data[data["reg_date"] >= '2023-05-16']
test_set = test_set.sort_values(by="reg_date")


def get_predict(X_train, y_train, X_test):
    """Fit each regressor on the training rows and export predictions for X_test.

    For every regressor the CSV gets the predicted value plus an approximate
    95% band around it.

    Reads the module-level frame ``test_set`` for the descriptive columns.

    NOTE(review): this redefines ``get_predict``; an earlier cell defines a
    function with the same name that exports the training-row predictions.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test : features of the rows to predict.

    Returns
    -------
    None. Writes 'prediction_results_latest6days.csv' (cp949, no index).
    """
    regressors = [
        LinearRegression(),
        LGBMRegressor(learning_rate=0.15, max_depth=7, n_estimators=200, num_leaves=64)
    ]

    meta_cols = ['reg_date', 'pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'is_rain', 'is_holiday']

    # Pull the metadata in X_test's row order. `test_set` was re-sorted above
    # and the default sort is not stable, so its row order is not guaranteed
    # to match X_test; predict() output follows X_test.
    result_df = test_set.loc[X_test.index, meta_cols].copy()

    y_true = np.asarray(y_train, dtype=float)

    for reg in regressors:
        reg_name = reg.__class__.__name__
        reg.fit(X_train, y_train)
        y_pred_test = reg.predict(X_test)

        # Half-width of the band: 1.96 x the standard deviation of the
        # training residuals (normal approximation). The spread of the
        # predictions themselves says nothing about model error, so it is
        # not used. In-sample residuals tend to understate out-of-sample
        # error, so the band is, if anything, too narrow.
        margin = 1.96 * np.std(y_true - reg.predict(X_train))

        result_df[f'y_pred_test_{reg_name}'] = y_pred_test
        result_df[f'lower_bound_{reg_name}'] = y_pred_test - margin
        result_df[f'upper_bound_{reg_name}'] = y_pred_test + margin

    result_df.to_csv('prediction_results_latest6days.csv', index=False, encoding="cp949")

get_predict(X_train, y_train, X_test)
