In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

# Load the training data.
df_train = pd.read_csv("dataframe_train.csv")

# Columns used as the initial feature set.
selected_features = [
    'courier_id', 'wave_index', 'courier_wave_start_lng',
    'courier_wave_start_lat', 'date', 'group', 'level',
    'speed', 'max_load', 'weather_grade', 'aoi_id', 'shop_id',
    'source_type', 'source_tracking_id', 'source_lng', 'source_lat',
    'target_lng', 'target_lat', 'grid_distance',
    'urgency',
]

# Separate the feature frame from the regression target.
# The explicit .copy() gives X its own data: the preprocessing step assigns
# new columns on this frame, and doing that on a bare column selection of
# df_train triggers pandas' SettingWithCopyWarning.
X = df_train[selected_features].copy()
y = df_train['expected_use_time']

In [12]:
def pre(X):
    """Turn the raw selected-feature frame into a numeric model-input frame.

    Steps, in order:
      1. Parse ``date`` (format ``%Y%m%d``) and derive ``day_of_week``
         (0 = Monday, 6 = Sunday) and ``is_weekend`` (1 for Saturday/Sunday,
         otherwise 0); the ``date`` column itself is dropped.
      2. Replace ``aoi_id`` and ``shop_id`` by their relative frequency within
         ``X`` (new columns ``aoi_freq`` / ``shop_freq``) and drop the ids.
      3. Drop ``courier_id``.
      4. Encode ``weather_grade`` as its position in ``weather_order`` and
         ``source_type`` through a fixed lookup table. Values that are absent
         from either table become NaN.

    The frequencies in step 2 are computed from the frame that is passed in,
    so two different frames (e.g. train and test) each get frequencies based
    on their own rows.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns ``date``, ``aoi_id``, ``shop_id``,
        ``courier_id``, ``weather_grade`` and ``source_type``. The frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the transformed columns; every other column is
        passed through unchanged.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If a ``date`` value cannot be parsed with ``%Y%m%d``.
    """
    # Work on a private copy: the column assignments below would otherwise
    # write into the caller's frame (and raise SettingWithCopyWarning when the
    # caller passed a column selection of another frame).
    X = X.copy()

    X['date'] = pd.to_datetime(X['date'], format='%Y%m%d')

    # Calendar features derived from the date.
    X['day_of_week'] = X['date'].dt.dayofweek  # 0 = Monday, 6 = Sunday
    X['is_weekend'] = (X['day_of_week'] >= 5).astype('int64')  # vectorized, no row-wise apply

    # The raw date is no longer needed.
    X = X.drop(columns=['date'])

    # Frequency-encode the high-cardinality ids (share of rows per id).
    aoi_freq = X['aoi_id'].value_counts(normalize=True).to_dict()
    X['aoi_freq'] = X['aoi_id'].map(aoi_freq)

    shop_freq = X['shop_id'].value_counts(normalize=True).to_dict()
    X['shop_freq'] = X['shop_id'].map(shop_freq)

    # Drop the original high-cardinality columns.
    X = X.drop(columns=['aoi_id', 'shop_id'])
    X = X.drop(columns=['courier_id'])

    # Ordinal encoding of the weather grade; list position is the encoded value.
    weather_order = ['Normal Weather', 'Slightly Bad Weather', 'Bad Weather', 'Very Bad Weather']
    X['weather_grade'] = X['weather_grade'].map({k: v for v, k in enumerate(weather_order)})

    # Fixed integer codes for the source type.
    mapping = {
        'DELIVERY': 1,
        'PICKUP': 0,
        'ASSIGN': 2
    }

    X['source_type'] = X['source_type'].map(mapping)
    return X
# Build the model-input frame from df_train rather than from X itself, so that
# re-running this cell does not fail on the already-dropped 'date' column.
X = pre(df_train[selected_features])

In [4]:
# Draw a reproducible 40,000-row subsample; features and target are joined
# first so that both are sampled with the same rows.
features_and_target = pd.concat([X, y], axis=1)
df_sampled = features_and_target.sample(n=40000, random_state=42)

# Split the sampled frame back into target and features.
y_sampled = df_sampled['expected_use_time']
X_sampled = df_sampled.drop(columns='expected_use_time')

In [15]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
# Hold out 20% of the rows for evaluation (80% train / 20% test).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, so the held-out rows do not
# influence the statistics used for standardization; then apply it to both parts.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Standardized version of the full feature frame, kept under its original name.
X_scaled = scaler.transform(X)

# Random-forest regressor with unrestricted tree depth.
rf_regressor = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)

# Train on the training split.
rf_regressor.fit(X_train, y_train)

# Predict on the held-out split.
y_pred = rf_regressor.predict(X_test)

# Evaluate with mean absolute error.
mae = mean_absolute_error(y_test, y_pred)
print("平均绝对误差 (MAE):", mae)

In [17]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt



# Hold out 20% of the rows for evaluation (80% train / 20% test).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training rows only so
# that the held-out rows do not leak into the standardization statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Standardized version of the full feature frame; a later cell splits X_scaled
# again with the same seed, which reproduces the same train/test rows.
X_scaled = scaler.transform(X)

# XGBoost regressor with default hyper-parameters.
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Train on the training split.
xgb_regressor.fit(X_train, y_train)

# Predict on the held-out split.
y_pred = xgb_regressor.predict(X_test)

# Evaluate with mean absolute error (lower means more accurate predictions).
mae = mean_absolute_error(y_test, y_pred)
print("平均绝对误差 (MAE):", mae)


In [20]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# 80/20 train/test split of the standardized features.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Hyper-parameters for the regularized XGBoost model, gathered in one place.
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=300,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=10,
    gamma=0.1,
    reg_alpha=0.5,
    reg_lambda=1.0,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb_regressor = xgb.XGBRegressor(**xgb_params)

# Fit on the training split; the held-out split is passed as eval_set so the
# per-round evaluation metric is printed while training.
xgb_regressor.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=True,
)

# Score the held-out split with mean absolute error.
y_pred = xgb_regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("平均绝对误差 (MAE):", mae)

In [7]:
# Read the test file and run the selected columns through the same
# preprocessing function that was applied to the training data.
raw_test = pd.read_csv("dataframe_test.csv")
df_test = pre(raw_test[selected_features])

In [21]:
# The regressor was fitted on StandardScaler output, so the test features must
# pass through the same fitted scaler before prediction; feeding the unscaled
# frame would put every feature on a different scale than the model saw.
pred_test = xgb_regressor.predict(scaler.transform(df_test))

In [22]:
# Row position in the test frame is used as the order id.
order_test = list(range(len(df_test)))

# Assemble the result frame and sort it by ascending order id.
result_df = pd.DataFrame(
    {
        'order': order_test,
        'expected_use_time': pred_test,
    }
).sort_values(by='order')

# Write the predictions to CSV without the index column.
result_df.to_csv('prediction_results1.csv', index=False)