In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore") #过滤掉警告的意思

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [2]:
data=pd.read_excel(r'training.xlsx')
data.head()


Unnamed: 0,Date,Vehicle Category,GVWR Class,Fuel Type,Model Year,Fuel Technology,Electric Mile Range,Number of Vehicles Registered at the Same Address,Region,Vehicle Population
0,2020,BS,Not Applicable,Gasoline,2023.0,ICE,Not Applicable,1,Statewide,1
1,2019,T3,Unknown,Gasoline,2022.0,ICE,Not Applicable,2,Statewide,1
2,2020,MH,Not Applicable,Gasoline,2023.0,ICE,Not Applicable,2,Statewide,1
3,2019,BS,Not Applicable,Diesel,2022.0,ICE,Not Applicable,≥4,Statewide,26
4,2019,MH,Not Applicable,Gasoline,2022.0,ICE,Not Applicable,≥4,Statewide,55


In [6]:
def clean(data):
    df = pd.DataFrame(data)
    df.columns = df.columns.str.strip().str.lower().str.replace(r'\W+', '_', regex=True)
    df.drop(columns=['region'], inplace=True)
    df['model_year'] = df.groupby(['fuel_type', 'gvwr_class'])['model_year'].transform(lambda x: x.fillna(x.median())).astype(int)
    df['age'] = df['date']-df['model_year']
    
    fossil_fuel_types = ['Gasoline', 'Diesel', 'Natural Gas']
    electric_hydrogen_types = ['Electric', 'Hydrogen']
    df['fuel_type_'] = df['fuel_type'].apply(lambda x: 0 if x in fossil_fuel_types else (1 if x in electric_hydrogen_types else None))

    return df

In [7]:
df = clean(data)
df.head()

Unnamed: 0,date,vehicle_category,gvwr_class,fuel_type,model_year,fuel_technology,electric_mile_range,number_of_vehicles_registered_at_the_same_address,vehicle_population,age,fuel_type_
0,2020,BS,Not Applicable,Gasoline,2023,ICE,Not Applicable,1,1,-3,0.0
1,2019,T3,Unknown,Gasoline,2022,ICE,Not Applicable,2,1,-3,0.0
2,2020,MH,Not Applicable,Gasoline,2023,ICE,Not Applicable,2,1,-3,0.0
3,2019,BS,Not Applicable,Diesel,2022,ICE,Not Applicable,≥4,26,-3,0.0
4,2019,MH,Not Applicable,Gasoline,2022,ICE,Not Applicable,≥4,55,-3,0.0


In [8]:
X = df.drop(columns=['vehicle_population'])  # 特征
y = df['vehicle_population']  # 目标变量


In [9]:
scoring_data = pd.read_excel(r'scoring.xlsx')
scoring_data = clean(scoring_data)

In [11]:
categorical_features = ['fuel_technology', 'fuel_type','model_year', 'electric_mile_range','date', 'vehicle_category','gvwr_class','number_of_vehicles_registered_at_the_same_address','fuel_type_']
numeric_features = ['age']
X[categorical_features] = X[categorical_features].astype(str)
X[numeric_features] = X[numeric_features].astype(int)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),  # 数值特征直接保留
         ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)  # 分类特征进行独热编码
    ]
)

# 4. 创建决策树回归模型
model = Pipeline(steps=[
    ('preprocessor', preprocessor),  # 数据预处理
    ('regressor', DecisionTreeRegressor(random_state=42))  # 决策树回归
])

# 5. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train, y_train = X,y


In [12]:
X_train

Unnamed: 0,date,vehicle_category,gvwr_class,fuel_type,model_year,fuel_technology,electric_mile_range,number_of_vehicles_registered_at_the_same_address,age,fuel_type_
35746,2019,T3,2,Diesel,1981,ICE,Not Applicable,1,38,0.0
32643,2021,T3,2,Gasoline,1988,ICE,Not Applicable,3,33,0.0
21317,2021,P,Not Applicable,Gasoline,2002,ICE,Not Applicable,2,19,0.0
22883,2022,MC,Not Applicable,Diesel,2001,ICE,Not Applicable,1,21,0.0
12902,2020,MC,Not Applicable,Gasoline,2010,ICE,Not Applicable,3,10,0.0
...,...,...,...,...,...,...,...,...,...,...
6265,2023,T6,6,Diesel,2020,ICE,Not Applicable,≥4,3,0.0
11284,2021,T4,2,Gasoline,2013,ICE,Not Applicable,≥4,8,0.0
38158,2020,T1,1,Diesel,1979,ICE,Not Applicable,≥4,41,0.0
860,2022,T7,8,Natural Gas,2023,ICE,Not Applicable,1,-1,0.0


In [13]:
y_train

35746        1
32643      257
21317    45906
22883        1
12902     8585
         ...  
6265      3540
11284     9634
38158        2
860         25
15795        1
Name: vehicle_population, Length: 32842, dtype: int64

In [15]:

# 6. 训练模型
model.fit(X_train, y_train)

# 7. 评估模型（可选）
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE on test set: {rmse}')

# 8. 加载评分数据（scoring.csv）
# 假设评分数据已经加载到 DataFrame `scoring_df` 中
# 这里用示例数据代替，你需要替换为实际的 `scoring.csv`

scoring_df = pd.DataFrame(scoring_data)
scoring_df[categorical_features] = scoring_df[categorical_features].astype(str)
# 9. 预测车辆数目
predictions = model.predict(scoring_df)

# 10. 将预测结果保存到 submission_file.xlsx
# 假设 submission_format.csv 已经加载到 DataFrame `submission_df` 中
# 这里用示例数据代替，你需要替换为实际的 `submission_format.csv`
submission_data = {
    'Predictions': predictions  # 预测结果
}
submission_df = pd.DataFrame(submission_data)

# 保存为 Excel 文件
submission_df.to_excel('submission_file_class.xlsx', index=False, sheet_name='Predictions')

print("Predictions saved to submission_file.xlsx")

RMSE on test set: 5096.523642304908
Predictions saved to submission_file.xlsx


In [16]:
baseline_pred = np.full_like(y_test, y_train.mean())
baseline_rmse = mean_squared_error(y_test, baseline_pred, squared=False)
print(f'Baseline RMSE: {baseline_rmse}')

# 比较模型 RMSE 和基线 RMSE
if rmse < baseline_rmse:
    print("模型性能优于基线模型。")
else:
    print("模型性能需要改进。")

Baseline RMSE: 20342.63507166346
模型性能优于基线模型。


In [17]:
mean_vehicles = df['vehicle_population'].mean()
std_vehicles = df['vehicle_population'].std()

print(f'目标变量 Vehicle Population 的平均值: {mean_vehicles:.2f}')
print(f'目标变量 Vehicle Population 的标准差: {std_vehicles:.2f}')

目标变量 Vehicle Population 的平均值: 3463.93
目标变量 Vehicle Population 的标准差: 18833.84


- 看一下在scoring上的效果

In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# 1. 加载预测结果
# 假设预测结果已经保存在 submission_file.xlsx 的 Predictions 工作表中
submission_df = pd.read_excel('submission_file_class.xlsx', sheet_name='Predictions')
predictions = submission_df['Predictions']  # 预测值

# 2. 加载真实值
# 假设真实值已经保存在 scoring.csv 中
scoring_df = pd.read_excel('scoring.xlsx')
scoring_df=clean(scoring_df)
true_values = scoring_df['vehicle_population']  # 真实值


# 3. 计算 RMSE
rmse = mean_squared_error(true_values, predictions, squared=False)
print(f'RMSE: {rmse:.2f}')

RMSE: 5381.77


In [22]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print(f'R^2 on test set: {r2:.2f}')

R^2 on test set: 0.94


In [24]:
import numpy as np
import pandas as pd

# 获取 one-hot 编码后的特征名
onehot_feature_names = list(model.named_steps['preprocessor']
                                       .named_transformers_['cat']
                                       .get_feature_names_out(categorical_features))

# 获取所有特征名（数值特征 + one-hot 之后的特征）
feature_names = numeric_features + onehot_feature_names

# 获取特征重要性
importances = model.named_steps['regressor'].feature_importances_

# 组装成 DataFrame 方便分析
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

# 处理 one-hot 编码的特征，把相同原始特征的 one-hot 重要性相加
importance_df['Original Feature'] = importance_df['Feature'].apply(lambda x: x.split('_')[0] if '_' in x else x)
importance_summary = importance_df.groupby('Original Feature')['Importance'].sum().reset_index()

# 按重要性排序
importance_summary = importance_summary.sort_values(by='Importance', ascending=False)

# 打印最终特征重要性
print(importance_summary)


  Original Feature  Importance
3             fuel    0.434802
0              age    0.322212
7          vehicle    0.100642
6           number    0.060266
5            model    0.024691
4             gvwr    0.022249
2         electric    0.017640
1             date    0.017497


- 试一下随机森林
  

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1. 加载训练数据
# 假设数据已经加载到 DataFrame `df` 中
# 这里用示例数据代替，你需要替换为实际的 `training.csv`

# 随机森林回归模型
rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),  # 数据预处理
    ('regressor', RandomForestRegressor(random_state=42))  # 随机森林回归
])

# 梯度提升树回归模型
gb_model = Pipeline(steps=[
    ('preprocessor', preprocessor),  # 数据预处理
    ('regressor', GradientBoostingRegressor(random_state=42))  # 梯度提升树回归
])

# 5. 划分训练集和测试集
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. 训练和评估随机森林模型
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
rf_rmse = mean_squared_error(y_test, rf_y_pred, squared=False)
print(f'随机森林 RMSE: {rf_rmse:.2f}')

# 7. 训练和评估梯度提升树模型
gb_model.fit(X_train, y_train)
gb_y_pred = gb_model.predict(X_test)
gb_rmse = mean_squared_error(y_test, gb_y_pred, squared=False)
print(f'梯度提升树 RMSE: {gb_rmse:.2f}')

# 8. 加载评分数据（scoring.csv）
# 假设评分数据已经加载到 DataFrame `scoring_df` 中
# 这里用示例数据代替，你需要替换为实际的 `scoring.csv`
# 确保评分数据的分类特征是字符串类型
scoring_df[categorical_features] = scoring_df[categorical_features].astype(str)

# 9. 使用最佳模型预测车辆数目
# 选择 RMSE 较小的模型
if rf_rmse < gb_rmse:
    best_model = rf_model
    print("选择随机森林模型进行预测。")
else:
    best_model = gb_model
    print("选择梯度提升树模型进行预测。")

predictions = best_model.predict(scoring_df)

# 10. 将预测结果保存到 submission_file.xlsx
# 假设 submission_format.csv 已经加载到 DataFrame `submission_df` 中
# 这里用示例数据代替，你需要替换为实际的 `submission_format.csv`
submission_data_ = {
    'Predictions': predictions  # 预测结果
}
submission_df = pd.DataFrame(submission_data_)

# 保存为 Excel 文件
submission_df.to_excel('submission_file_updated_class.xlsx', index=False, sheet_name='Predictions')

print("Predictions saved to submission_file_updated_class.xlsx")

随机森林 RMSE: 3499.65
梯度提升树 RMSE: 11223.15
选择随机森林模型进行预测。
Predictions saved to submission_file_updated_class.xlsx


- 对比scoring真实数据

In [21]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# 1. 加载预测结果
# 假设预测结果已经保存在 submission_file.xlsx 的 Predictions 工作表中
submission_df = pd.read_excel('submission_file_updated_class.xlsx', sheet_name='Predictions')
predictions = submission_df['Predictions']  # 预测值

# 2. 加载真实值
# 假设真实值已经保存在 scoring.csv 中
scoring_df = pd.read_excel('scoring.xlsx')
true_values = scoring_df['Vehicle Population']  # 真实值

# 3. 计算 RMSE
rmse = mean_squared_error(true_values, predictions, squared=False)
print(f'RMSE: {rmse:.2f}')

RMSE: 4738.51


In [25]:
import numpy as np
import pandas as pd

# 获取 one-hot 编码后的特征名
onehot_feature_names = list(rf_model.named_steps['preprocessor']
                                       .named_transformers_['cat']
                                       .get_feature_names_out(categorical_features))

# 获取所有特征名（数值特征 + one-hot 之后的特征）
feature_names = numeric_features + onehot_feature_names

# 获取特征重要性
importances = rf_model.named_steps['regressor'].feature_importances_

# 组装成 DataFrame 方便分析
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

# 处理 one-hot 编码的特征，把相同原始特征的 one-hot 重要性相加
importance_df['Original Feature'] = importance_df['Feature'].apply(lambda x: x.split('_')[0] if '_' in x else x)
importance_summary = importance_df.groupby('Original Feature')['Importance'].sum().reset_index()

# 按重要性排序
importance_summary = importance_summary.sort_values(by='Importance', ascending=False)

# 打印最终特征重要性
print(importance_summary)


  Original Feature  Importance
3             fuel    0.344023
0              age    0.314537
2         electric    0.117266
7          vehicle    0.100629
6           number    0.058834
5            model    0.026141
4             gvwr    0.022151
1             date    0.016419
