In [1]:
import pandas as pd
final_dataset_path ="C:\\Users\\Administrator\\Desktop\\OLIST!\\olist_final_analysis_dataset.csv"
df_final = pd.read_csv(
    final_dataset_path,
    parse_dates=[
        'order_purchase_timestamp',
        'order_approved_at',
        'order_delivered_carrier_date',
        'order_delivered_customer_date',
        'shipping_limit_date'],
    dtype={
        'customer_zip_code_prefix': 'object',
        'seller_zip_code_prefix': 'object',
        'payment_sequential': 'object',
        'payment_installments': 'object',})
print("--- “数据泰坦”已成功，载入“新的战场”！ ---")
print(f"当前维度: {df_final.shape}")
df_final.info()

--- “数据泰坦”已成功，载入“新的战场”！ ---
当前维度: (118434, 40)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118434 entries, 0 to 118433
Data columns (total 40 columns):
 #   Column                                Non-Null Count   Dtype         
---  ------                                --------------   -----         
 0   order_id                              118434 non-null  object        
 1   customer_id                           118434 non-null  object        
 2   order_status                          118434 non-null  object        
 3   order_purchase_timestamp              118434 non-null  datetime64[ns]
 4   order_approved_at                     118258 non-null  datetime64[ns]
 5   order_delivered_carrier_date          116360 non-null  datetime64[ns]
 6   order_delivered_customer_date         115037 non-null  datetime64[ns]
 7   order_estimated_delivery_date         118434 non-null  object        
 8   payment_sequential                    118431 non-null  object        
 9   payment_type

In [2]:
print(f"清洗前，我们的舰队总规模为: {len(df_final)} 行")
df_cleaned = df_final[df_final['order_status'] == 'delivered'].copy()
df_cleaned.dropna(subset=['order_delivered_customer_date'], inplace=True)
print(f"清洗后，我们的核心主力舰队规模为: {len(df_cleaned)} 行")
print("\n清洗后，订单状态分布情况:")
print(df_cleaned['order_status'].value_counts())

清洗前，我们的舰队总规模为: 118434 行
清洗后，我们的核心主力舰队规模为: 115030 行

清洗后，订单状态分布情况:
order_status
delivered    115030
Name: count, dtype: int64


In [3]:
#客户感知的“总等待时长”（单位：天
df_cleaned['total_wait_time'] = (df_cleaned['order_delivered_customer_date'] - df_cleaned[
    'order_purchase_timestamp']).dt.total_seconds() / (24 * 3600)
## 特征X1: 支付确认时长（单位：天）
df_cleaned['payment_processing_time'] = (df_cleaned['order_approved_at'] - df_cleaned[
    'order_purchase_timestamp']).dt.total_seconds() / (24 * 3600)
#特征X2: 卖家发货时长（单位：天）
df_cleaned['seller_dispatch_time'] = (df_cleaned['order_delivered_carrier_date'] - df_cleaned[
    'order_approved_at']).dt.total_seconds() / (24 * 3600)
## 特征X3: 物流运输时长（单位：天）
df_cleaned['carrier_delivery_time'] = (df_cleaned['order_delivered_customer_date'] - df_cleaned[
    'order_delivered_carrier_date']).dt.total_seconds() / (24 * 3600)
# 创造“时间模式特征”
df_cleaned['purchase_month'] = df_cleaned['order_purchase_timestamp'].dt.month
df_cleaned['purchase_day_of_week'] = df_cleaned['order_purchase_timestamp'].dt.dayofweek
df_cleaned['purchase_hour'] = df_cleaned['order_purchase_timestamp'].dt.hour
print("--- “时间特征”(终极版)已成功创造并装载！ ---")
print(df_cleaned[[
    'total_wait_time',
    'payment_processing_time',
    'seller_dispatch_time',
    'carrier_delivery_time']].describe())

--- “时间特征”(终极版)已成功创造并装载！ ---
       total_wait_time  payment_processing_time  seller_dispatch_time  \
count    115030.000000            115015.000000         115014.000000   
mean         12.484484                 0.434623              2.848934   
std           9.455074                 0.872448              3.585229   
min           0.533414                 0.000000           -171.219005   
25%           6.748252                 0.008993              0.885035   
50%          10.196956                 0.014444              1.842373   
75%          15.544230                 0.621626              3.640055   
max         209.628611                30.893484            125.762569   

       carrier_delivery_time  
count          115029.000000  
mean                9.200396  
std                 8.638785  
min               -16.096169  
25%                 4.063727  
50%                 7.062257  
75%                11.927234  
max               205.190972  


In [4]:
import numpy as np
def haversine_distance(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371
    return c * r
df_cleaned['distance_km'] = haversine_distance(
    df_cleaned['geolocation_lat_customer'],
    df_cleaned['geolocation_lng_customer'],
    df_cleaned['geolocation_lat_seller'],
    df_cleaned['geolocation_lng_seller'])
df_cleaned['is_same_state'] = np.where(df_cleaned['customer_state'] == df_cleaned['seller_state'], 1, 0)
print("--- “空间特征”已成功，创造并装载！ ---")
print(df_cleaned[[
    'customer_state',
    'seller_state',
    'is_same_state',
    'distance_km']].head())
print("\n新特征的统计摘要:")
print(df_cleaned[['distance_km', 'is_same_state']].describe())

--- “空间特征”已成功，创造并装载！ ---
  customer_state seller_state  is_same_state  distance_km
0             SP           SP              1    18.576110
1             SP           SP              1    18.576110
2             SP           SP              1    18.576110
3             BA           SP              0   851.495069
4             GO           SP              0   514.410666

新特征的统计摘要:
         distance_km  is_same_state
count  114476.000000  115030.000000
mean      596.865139       0.360993
std       588.425951       0.480291
min         0.000000       0.000000
25%       187.957386       0.000000
50%       432.208957       0.000000
75%       791.960222       1.000000
max      8677.911622       1.000000


通过对买家与卖家地理位置的量化与分析，我们得到了以下三个足以颠覆我们初步认知的战略级洞察：
1. “超长半径”的商业版图：
   * 数据事实: 我们计算出的distance_km（买卖物理距离）的平均值，高达惊人的596公里。
   * 商业洞察:
     这彻底打破了我们对“本地化电商”的幻想。我们所运营的，是一个以“远距离、跨区域”交易为绝对主流的、“全国性
     ”，甚至，“全球性”的商业平台。这个客观事实，是我们理解其“高昂物流成本”与“漫长履约时长”的最根本的前提。
2. “跨州交易”的绝对主导：
    * 数据事实: 我们创造的is_same_state（是否同州）特征的均值，仅为0.36。
    * 商业洞察: 这意味着，我们高达64%的订单，都是需要进行“跨州”的长途运输！这进一步印证了我们商业模式的“广
      域性”，也为我们那“平均12.48天”的“高昂时间成本”，提供了最直接的解释。    
3. “全球化”的蛛丝马迹：
    * 数据事实: distance_km的最大值，达到了恐怖的8677公里。
    * 商业洞察: 这个极端异常值，不再是“噪音”。它是一个强烈的信号，暗示着Olist的业务触角，可能已经伸向了海外
      。这为我们未来去进行“用户国别分析”或“国际物流挑战”等更宏大的命题，埋下了最有趣的伏笔。  

In [7]:
df_cleaned['product_volume_cm3'] = df_cleaned['product_length_cm'].fillna(0) * \
                                  df_cleaned['product_height_cm'].fillna(0) * \
                                  df_cleaned['product_width_cm'].fillna(0)
#这里要进行数据降维，因为我们的数据粒度是订单-商品-支付，但是在数据拼接的时候发现有一对多的情况，不是单一对应关系
agg_logic = {
    'product_weight_g': 'sum',
    'product_volume_cm3': 'sum',
    'payment_value': 'sum',
    'order_item_id': 'max'}
df_order_aggregated = df_cleaned.groupby('order_id').agg(agg_logic).reset_index()
df_order_aggregated.rename(columns={
    'product_weight_g': 'total_weight_g',
    'product_volume_cm3': 'total_volume_cm3',
    'payment_value': 'total_payment_value',
    'order_item_id': 'total_order_items'}, inplace=True)
print("--- “商业微观特征”已成功，创造并聚合！ ---")
print(f"聚合后的订单级宽表维度: {df_order_aggregated.shape}")
print(df_order_aggregated.head())
print("\n新特征的统计摘要:")
print(df_order_aggregated[['total_weight_g', 'total_volume_cm3', 'total_payment_value','total_order_items']].describe())
                           

--- “商业微观特征”已成功，创造并聚合！ ---
聚合后的订单级宽表维度: (96470, 5)
                           order_id  total_weight_g  total_volume_cm3  \
0  00010242fe8c5a6d1ba2dd792cb16214           650.0            3528.0   
1  00018f77f2f0320c557190d7a144bdd3         30000.0           60000.0   
2  000229ec398224ef6ca0657da4fc703e          3050.0           14157.0   
3  00024acbcdf0a6daa1e931b038114c75           200.0            2400.0   
4  00042b26cf59d7ce69dfabb4e55b4fd9          3750.0           42000.0   

   total_payment_value  total_order_items  
0                72.19                1.0  
1               259.83                1.0  
2               216.87                1.0  
3                25.78                1.0  
4               218.04                1.0  

新特征的统计摘要:
       total_weight_g  total_volume_cm3  total_payment_value  \
count    96470.000000      9.647000e+04         96470.000000   
mean      2512.791324      1.824703e+04           204.983741   
std       5669.329910      3.664595e+04    

In [8]:
df_order_level_features = df_cleaned[[
    'order_id', 'customer_id', 'customer_unique_id', 'customer_state',
    'seller_id', 'seller_state',
    'total_wait_time', 'payment_processing_time',
    'seller_dispatch_time', 'carrier_delivery_time',
    'purchase_month', 'purchase_day_of_week', 'purchase_hour',
    'distance_km', 'is_same_state']].drop_duplicates(subset=['order_id'])
df_final_analysis = pd.merge(
    left=df_order_level_features,
    right=df_order_aggregated,
    on='order_id',
    how='inner')
print("--- “究极数据母舰”已，组装，完毕！ ---")
print(f"最终分析宽表的维度: {df_final_analysis.shape}")
print("\n最终分析宽表，数据信息:")
df_final_analysis.info()

--- “究极数据母舰”已，组装，完毕！ ---
最终分析宽表的维度: (96470, 19)

最终分析宽表，数据信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96470 entries, 0 to 96469
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   order_id                 96470 non-null  object 
 1   customer_id              96470 non-null  object 
 2   customer_unique_id       96470 non-null  object 
 3   customer_state           96470 non-null  object 
 4   seller_id                96470 non-null  object 
 5   seller_state             96470 non-null  object 
 6   total_wait_time          96470 non-null  float64
 7   payment_processing_time  96456 non-null  float64
 8   seller_dispatch_time     96455 non-null  float64
 9   carrier_delivery_time    96469 non-null  float64
 10  purchase_month           96470 non-null  int32  
 11  purchase_day_of_week     96470 non-null  int32  
 12  purchase_hour            96470 non-null  int32  
 13  distance_km   

In [9]:
print("--- 清洗前，各特征缺失值报告 ---")
missing_values = df_final_analysis.isnull().sum()
missing_values = missing_values[missing_values > 0].sort_values(ascending=False)
print(missing_values)

--- 清洗前，各特征缺失值报告 ---
distance_km                478
seller_dispatch_time        15
payment_processing_time     14
carrier_delivery_time        1
dtype: int64


In [10]:
columns_to_drop_na = [
    'total_wait_time',
    'payment_processing_time',
    'seller_dispatch_time',
    'carrier_delivery_time',
    'distance_km']
df_cleaned_final = df_final_analysis.dropna(subset=columns_to_drop_na)
print(f"\n--- “核心缺失值”处理完毕！ ---")
print(f"处理前，我们的舰队规模为: {len(df_final_analysis)} 行")
print(f"处理后，我们的最终主力舰队规模为: {len(df_cleaned_final)} 行")
print("\n处理后，剩余“伤病”（缺失值）报告:")
remaining_missing = df_cleaned_final.isnull().sum()
print(remaining_missing[remaining_missing > 0])


--- “核心缺失值”处理完毕！ ---
处理前，我们的舰队规模为: 96470 行
处理后，我们的最终主力舰队规模为: 95977 行

处理后，剩余“伤病”（缺失值）报告:
Series([], dtype: int64)


In [11]:
features_to_encode = [
    'customer_state',
    'seller_state',
    'purchase_month',
    'purchase_day_of_week',
    'purchase_hour',
    'is_same_state',
    'total_weight_g',
    'total_volume_cm3',
    'total_payment_value',
    'total_order_items',
    'payment_processing_time',
    'seller_dispatch_time',
    'carrier_delivery_time',
    'distance_km']
target_variable = 'total_wait_time'
X = df_cleaned_final[features_to_encode]
y = df_cleaned_final[target_variable]
X_encoded = pd.get_dummies(X, drop_first=True, dtype=float)
print("--- “最终分析宽表”已，完成“数字化”与“编码”！ ---")
print(f"编码后，我们的特征矩阵X的维度: {X_encoded.shape}")
print("\n编码后，特征矩阵X，数据信息:")
X_encoded.info()

--- “最终分析宽表”已，完成“数字化”与“编码”！ ---
编码后，我们的特征矩阵X的维度: (95977, 59)

编码后，特征矩阵X，数据信息:
<class 'pandas.core.frame.DataFrame'>
Index: 95977 entries, 0 to 96469
Data columns (total 59 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   purchase_month           95977 non-null  int32  
 1   purchase_day_of_week     95977 non-null  int32  
 2   purchase_hour            95977 non-null  int32  
 3   is_same_state            95977 non-null  int64  
 4   total_weight_g           95977 non-null  float64
 5   total_volume_cm3         95977 non-null  float64
 6   total_payment_value      95977 non-null  float64
 7   total_order_items        95977 non-null  float64
 8   payment_processing_time  95977 non-null  float64
 9   seller_dispatch_time     95977 non-null  float64
 10  carrier_delivery_time    95977 non-null  float64
 11  distance_km              95977 non-null  float64
 12  customer_state_AL        95977 non-null  float64
 13  cus

In [14]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import mlflow
import mlflow.lightgbm
mlflow.set_experiment("Olist_Fulfillment_Time_Prediction")
with mlflow.start_run(run_name="Baseline_LightGBM_Manual_Features"):
    print("\n--- 正在召唤‘LightGBM’，并，在‘MLFlow的监督’下，进行‘学习’... ---")
    params = {
        'n_estimators': 1000,
        'learning_rate': 0.05,
        'num_leaves': 31,
        'random_state': 42}
    mlflow.log_params(params)
    lgbm = LGBMRegressor(**params)
    lgbm.fit(X_train, y_train)
    print("--- “模型”已，完成“学习”！ ---")
    print("\n--- 正在，对‘测试集’，进行‘预测’... ---")
    y_pred = lgbm.predict(X_test)
    print("\n--- “基准模型”性能评估报告 ---")
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    print(f"平均绝对误差 (MAE): {mae:.4f} 天")
    print(f"均方误差 (MSE): {mse:.4f}")
    print(f"均方根误差 (RMSE): {rmse:.4f} 天")
    mlflow.lightgbm.log_model(lgbm, "baseline_lgbm_model")
    print("\n--- MLFlow记录仪式，已，全部，完成！---")
    print(f"本次实验的所有‘参数’、‘性能’与‘模型’，都，已，被，忠实地，记录在案！")
    print(f"您，可以，在‘命令行’中，输入‘mlflow ui’，来，启动‘可视化界面’，进行‘查阅’！")

2025/08/06 03:54:20 INFO mlflow.tracking.fluent: Experiment with name 'Olist_Fulfillment_Time_Prediction' does not exist. Creating a new experiment.



--- 正在召唤‘LightGBM’，并，在‘MLFlow的监督’下，进行‘学习’... ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002103 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1930
[LightGBM] [Info] Number of data points in the train set: 76781, number of used features: 55
[LightGBM] [Info] Start training from score 12.551481




--- “模型”已，完成“学习”！ ---

--- 正在，对‘测试集’，进行‘预测’... ---

--- “基准模型”性能评估报告 ---
平均绝对误差 (MAE): 0.2558 天
均方误差 (MSE): 4.9601
均方根误差 (RMSE): 2.2271 天





--- MLFlow记录仪式，已，全部，完成！---
本次实验的所有‘参数’、‘性能’与‘模型’，都，已，被，忠实地，记录在案！
您，可以，在‘命令行’中，输入‘mlflow ui’，来，启动‘可视化界面’，进行‘查阅’！


惊人的结果，这意味着我们的初步模型已经足够优秀，和实际预测值仅有六小时误差

In [16]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
input_dim = X_train_scaled.shape[1]
encoding_dim = 10
input_layer = Input(shape=(input_dim,), name="Encoder_Input")
encoder_layer_1 = Dense(32, activation='relu', name="Encoder_Layer_1")(input_layer)
bottleneck_layer = Dense(encoding_dim, activation='relu', name="Bottleneck")(encoder_layer_1)
decoder_layer_1 = Dense(32, activation='relu', name="Decoder_Layer_1")(bottleneck_layer)
output_layer = Dense(input_dim, activation='sigmoid', name="Decoder_Output")(decoder_layer_1)
autoencoder = Model(inputs=input_layer, outputs=output_layer, name="Autoencoder")
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
print("--- “自编码器”网络结构预览 ---")
autoencoder.summary()
print("\n--- 正在训练“自编码器”... 这，可能，需要，几分钟的时间... ---")
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
autoencoder.fit(
    X_train_scaled, X_train_scaled,
    epochs=100,
    batch_size=32,
    shuffle=True,
    validation_data=(X_test_scaled, X_test_scaled),
    callbacks=[early_stopping],
    verbose=1)
print("\n--- “自编码器”已，完成“训练”！ ---")

--- “自编码器”网络结构预览 ---



--- 正在训练“自编码器”... 这，可能，需要，几分钟的时间... ---
Epoch 1/100
[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 917us/step - loss: 0.0165 - val_loss: 0.0060
Epoch 2/100
[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 838us/step - loss: 0.0046 - val_loss: 0.0040
Epoch 3/100
[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 836us/step - loss: 0.0036 - val_loss: 0.0034
Epoch 4/100
[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 852us/step - loss: 0.0033 - val_loss: 0.0033
Epoch 5/100
[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 846us/step - loss: 0.0033 - val_loss: 0.0033
Epoch 6/100
[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 849us/step - loss: 0.0033 - val_loss: 0.0033
Epoch 7/100
[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 854us/step - loss: 0.0032 - val_loss: 0.0032
Epoch 8/100
[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 854u

In [18]:
encoder = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('Bottleneck').output,name="Encoder")
print("--- “编码器（特征提取器）”网络结构预览 ---")
encoder.summary()
X_train_encoded_features = encoder.predict(X_train_scaled)
X_test_encoded_features = encoder.predict(X_test_scaled)
print("\n--- “神级特征”已，成功，提取！ ---")
print(f"原始训练集特征维度: {X_train_scaled.shape}")
print(f"提取后的神级特征维度: {X_train_encoded_features.shape}")
print("\n神级特征（前5行）预览:")
import pandas as pd
print(pd.DataFrame(X_train_encoded_features).head())

--- “编码器（特征提取器）”网络结构预览 ---


[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 413us/step
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 441us/step

--- “神级特征”已，成功，提取！ ---
原始训练集特征维度: (76781, 59)
提取后的神级特征维度: (76781, 10)

神级特征（前5行）预览:
           0         1         2          3          4    5          6  \
0   2.026418  8.920732  3.599184  10.311538  10.316717  0.0   9.350719   
1   4.240238  4.772567  7.750415   7.243033   4.772855  0.0  10.204630   
2  11.111782  8.507308  6.096081   5.641565   8.534621  0.0  14.207093   
3   3.785465  7.779017  5.559419  12.288781   8.915378  0.0   5.787151   
4  10.255264  6.155926  1.358073  11.344276   8.120808  0.0   7.956497   

           7         8          9  
0  10.675953  8.727247   4.805638  
1   4.269438  1.291077  23.755634  
2   8.841950  4.070943   1.706980  
3  13.122069  9.165938   1.698428  
4   6.325791  3.755145   4.562996  


In [19]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
import mlflow
import mlflow.lightgbm
df_X_train_encoded_features = pd.DataFrame(X_train_encoded_features, index=X_train.index,
                                           columns=[f"autoencoder_{i}" for i in range(10)])
df_X_test_encoded_features = pd.DataFrame(X_test_encoded_features, index=X_test.index, columns=[
                                           f"autoencoder_{i}" for i in range(10)])
X_train_ultimate = pd.concat([X_train, df_X_train_encoded_features], axis=1)
X_test_ultimate = pd.concat([X_test, df_X_test_encoded_features], axis=1)
print("--- “究极特征军团”，已，组建完毕！ ---")
print(f"究极训练集维度: {X_train_ultimate.shape}")
with mlflow.start_run(run_name="LGBM_With_Autoencoder_Features"):
    print("\n--- 正在召唤‘王者模型’，并，开始，最终的‘学习’... ---")
    params = {
        'n_estimators': 1000,
        'learning_rate': 0.03,
        'num_leaves': 41,
        'random_state': 42
    }
    mlflow.log_params(params)
    mlflow.log_param("features_used", "manual_and_autoencoder")
    lgbm_ultimate = LGBMRegressor(**params)
    lgbm_ultimate.fit(X_train_ultimate, y_train)
    print("--- “王者模型”已，完成“学习”！ ---")
    print("\n--- 正在，对‘测试集’，进行‘最终预测’... ---")
    y_pred_ultimate = lgbm_ultimate.predict(X_test_ultimate)
    print("\n--- “王者模型”性能评估报告 ---")
    mae_ultimate = mean_absolute_error(y_test, y_pred_ultimate)
    mlflow.log_metric("mae", mae_ultimate)
    print(f"王者模型的平均绝对误差 (MAE): {mae_ultimate:.4f} 天")
    baseline_mae = 0.2558
    improvement = ((baseline_mae - mae_ultimate) / baseline_mae) * 100
    print(f"\n--- 最终对决结果 ---")
    print(f"基准模型 MAE: {baseline_mae:.4f} 天")
    print(f"王者模型 MAE: {mae_ultimate:.4f} 天")
    print(f"性能提升百分比: {improvement:.2f}%！")
    mlflow.lightgbm.log_model(lgbm_ultimate, "ultimate_lgbm_model")

--- “究极特征军团”，已，组建完毕！ ---
究极训练集维度: (76781, 69)

--- 正在召唤‘王者模型’，并，开始，最终的‘学习’... ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002625 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4225
[LightGBM] [Info] Number of data points in the train set: 76781, number of used features: 64
[LightGBM] [Info] Start training from score 12.551481




--- “王者模型”已，完成“学习”！ ---

--- 正在，对‘测试集’，进行‘最终预测’... ---

--- “王者模型”性能评估报告 ---
王者模型的平均绝对误差 (MAE): 0.2179 天

--- 最终对决结果 ---
基准模型 MAE: 0.2558 天
王者模型 MAE: 0.2179 天
性能提升百分比: 14.80%！




我们把6小时的误差通过深度学习特征提取压低到了5.2小时，精度再次提升！