### Import

In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import accuracy_score, classification_report

from lightgbm import LGBMRegressor
      # swap for RandomForestRegressor / XGBRegressor if desired

In [2]:
df_main = pd.read_csv(r'C:\Users\USER\OneDrive\Documents\DSC_Project\20250517_資料集\一般訂單\main.csv')

  df_main = pd.read_csv(r'C:\Users\USER\OneDrive\Documents\DSC_Project\20250517_資料集\一般訂單\main.csv')


### Feature engineering

In [3]:
df_date_only = df_main['createdAt']
df_date_only = pd.to_datetime(df_date_only)
df_date_only =df_date_only.dt.date

df_main['only_date'] = df_date_only
df_main['only_date']

0        2021-12-14
1        2021-12-14
2        2021-12-14
3        2021-12-14
4        2021-12-14
            ...    
27808    2021-12-14
27809    2021-12-14
27810    2021-12-14
27811    2021-12-14
27812    2021-12-14
Name: only_date, Length: 27813, dtype: object

In [4]:
# 透過訂單編號，去計算顧客再下訂單的時間
df_main = df_main.sort_values(['user_oid', 'only_date'])
df_main['only_date'] = pd.to_datetime(df_main['only_date'])
df_main['next_order_date'] =  df_main.groupby('user_oid')['only_date'].shift(-1)
df_main['next_order_date'] = pd.to_datetime(df_main['next_order_date'])

df_main['days_until_next_order'] = (df_main['next_order_date'] - df_main['only_date']).dt.days
df_main['days_until_next_order']

25380    3.0
25384    3.0
25399    1.0
25403    4.0
25446    2.0
        ... 
19822    NaN
19823    NaN
19824    NaN
19825    NaN
19828    NaN
Name: days_until_next_order, Length: 27813, dtype: float64

### Data preperation

In [None]:
#list the columns you plan to learn from
num_cols = ['subtotal', 'travelingExpense', 'discountNum']

cat_cols = ['defaultPaymentMethod', 'mode', 'serviceClassificationName', 'state',
            'address_county', 'address_district', 'discount_type', 'isServicePackageBooking']

X = df_main[num_cols + cat_cols].copy()      # features only
y = df_main['days_until_next_order'].copy()
# y = y.notna().astype(int)
# y = (df_main["days_until_next_order"] > 0).astype(int)
y = y.apply(lambda x: 0 if pd.isna(x) else 1)
groups = df_main["user_oid"]


### Minimal preprocessing block

In [6]:
# ⬇️ assumes you have already defined `num_cols` and `cat_cols`
prep = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), num_cols),
        ('cat', Pipeline(steps=[
            ('imp', SimpleImputer(strategy='constant', fill_value='missing')),
            ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
        ]), cat_cols)
    ],
    remainder='drop'      # drop any column not listed above
)

### Model and Pipline

In [7]:
lgbm = LGBMRegressor(objective='binary',
                     n_estimators=400,
                     learning_rate=0.05,
                     num_leaves=63,
                     random_state=42)

pipe = Pipeline([('prep', prep), ('model', lgbm)])

### Train / Test split

In [8]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

### Fit and Eval

In [9]:
pipe.fit(X.iloc[train_idx], y.iloc[train_idx])

pred = pipe.predict(X.iloc[test_idx])
print(f"RMSE : {mean_squared_error(y.iloc[test_idx], pred):.2f}")
print(f"MAE  : {mean_absolute_error(y.iloc[test_idx], pred):.2f}")


[LightGBM] [Info] Number of positive: 8018, number of negative: 14236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001921 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 572
[LightGBM] [Info] Number of data points in the train set: 22254, number of used features: 149
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.360295 -> initscore=-0.574085
[LightGBM] [Info] Start training from score -0.574085
RMSE : 0.19
MAE  : 0.36


