In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore") 

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [16]:
# Read the training workbook and preview its first rows.
data = pd.read_excel(io=r'training.xlsx')
data.head()


Unnamed: 0,Date,Vehicle Category,GVWR Class,Fuel Type,Model Year,Fuel Technology,Electric Mile Range,Number of Vehicles Registered at the Same Address,Region,Vehicle Population
0,2020,BS,Not Applicable,Gasoline,2023.0,ICE,Not Applicable,1,Statewide,1
1,2019,T3,Unknown,Gasoline,2022.0,ICE,Not Applicable,2,Statewide,1
2,2020,MH,Not Applicable,Gasoline,2023.0,ICE,Not Applicable,2,Statewide,1
3,2019,BS,Not Applicable,Diesel,2022.0,ICE,Not Applicable,≥4,Statewide,26
4,2019,MH,Not Applicable,Gasoline,2022.0,ICE,Not Applicable,≥4,Statewide,55


In [17]:
# Wrap the loaded training data in the frame the rest of the notebook works on.
df = pd.DataFrame(data=data)

In [18]:
# Separate the regression target from the predictor columns.
target_column = 'Vehicle Population'
y = df[target_column]
X = df.drop(columns=[target_column])


In [19]:
# Read the scoring workbook that the fitted models will predict on.
scoring_data = pd.read_excel(io=r'scoring.xlsx')

In [20]:
# Every predictor listed here is treated as a categorical label (including the
# year-like columns): it is cast to string and then one-hot encoded.
# NOTE(review): 'Model Year' shows up as a float in the preview (e.g. 2023.0),
# so its string form is '2023.0' - confirm scoring.xlsx yields the same
# representation, otherwise those categories will not match at predict time.
categorical_features = [
    'Fuel Technology',
    'Fuel Type',
    'Model Year',
    'Electric Mile Range',
    'Date',
    'Vehicle Category',
    'GVWR Class',
    'Number of Vehicles Registered at the Same Address',
    'Region',
]
X[categorical_features] = X[categorical_features].astype(str)

# Categories that were not seen during fit are ignored instead of raising.
one_hot = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot, categorical_features)]
)

# 4. model: preprocessing followed by a single decision tree
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42)),
])

# 5. split data: hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [21]:
# Display the training features produced by the split above.
X_train

Unnamed: 0,Date,Vehicle Category,GVWR Class,Fuel Type,Model Year,Fuel Technology,Electric Mile Range,Number of Vehicles Registered at the Same Address,Region
35746,2019,T3,2,Diesel,1981.0,ICE,Not Applicable,1,Statewide
32643,2021,T3,2,Gasoline,1988.0,ICE,Not Applicable,3,Statewide
21317,2021,P,Not Applicable,Gasoline,2002.0,ICE,Not Applicable,2,Statewide
22883,2022,MC,Not Applicable,Diesel,2001.0,ICE,Not Applicable,1,Statewide
12902,2020,MC,Not Applicable,Gasoline,2010.0,ICE,Not Applicable,3,Statewide
...,...,...,...,...,...,...,...,...,...
6265,2023,T6,6,Diesel,2020.0,ICE,Not Applicable,≥4,Statewide
11284,2021,T4,2,Gasoline,2013.0,ICE,Not Applicable,≥4,Statewide
38158,2020,T1,1,Diesel,1979.0,ICE,Not Applicable,≥4,Statewide
860,2022,T7,8,Natural Gas,2023.0,ICE,Not Applicable,1,Statewide


In [22]:
# Display the training target values produced by the split above.
y_train

35746        1
32643      257
21317    45906
22883        1
12902     8585
         ...  
6265      3540
11284     9634
38158        2
860         25
15795        1
Name: Vehicle Population, Length: 32842, dtype: int64

In [23]:

# 6. train: fit the preprocessing + decision-tree pipeline on the training split
model.fit(X_train, y_train)

# 7. evaluate on the held-out split
y_pred = model.predict(X_test)
# RMSE is computed as sqrt(MSE); the `squared=False` keyword is deprecated and
# no longer accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE on test set: {rmse}')

# 8. predict on the scoring data
# Work on a copy so the string cast below cannot modify `scoring_data`.
scoring_df = scoring_data.copy()
scoring_df[categorical_features] = scoring_df[categorical_features].astype(str)
predictions = model.predict(scoring_df)


submission_data = {
    'Predictions': predictions
}
submission_df = pd.DataFrame(submission_data)

# Actually write the file: the message below claims it was saved, and the
# evaluation cell further down reads it back from disk.
submission_df.to_excel('submission_file.xlsx', index=False, sheet_name='Predictions')

print("Predictions saved to submission_file.xlsx")

RMSE on test set: 6666.739887772301
Predictions saved to submission_file.xlsx


In [33]:
from sklearn.metrics import r2_score

# Coefficient of determination of the decision-tree predictions on the hold-out split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f'R^2 on test set: {r2:.2f}')

R^2 on test set: 0.89


In [24]:
# Pull the fitted one-hot encoder and the fitted tree out of the pipeline.
fitted_encoder = model.named_steps['preprocessor'].named_transformers_['cat']
fitted_tree = model.named_steps['regressor']

feature_names = list(fitted_encoder.get_feature_names_out(categorical_features))
importances = fitted_tree.feature_importances_

# Print one line per one-hot column, in the encoder's output order.
for idx, name in enumerate(feature_names):
    print(f'{name}: {importances[idx]:.4f}')


Fuel Technology_BEV: 0.0000
Fuel Technology_FCEV: 0.0001
Fuel Technology_ICE: 0.0719
Fuel Technology_PHEV: 0.0080
Fuel Type_Diesel: 0.0002
Fuel Type_Electric: 0.0000
Fuel Type_Gasoline: 0.2064
Fuel Type_Hydrogen: 0.0000
Fuel Type_Natural Gas: 0.0000
Fuel Type_Unknown: 0.0000
Model Year_1975.0: 0.0018
Model Year_1976.0: 0.0012
Model Year_1977.0: 0.0002
Model Year_1978.0: 0.0006
Model Year_1979.0: 0.0001
Model Year_1980.0: 0.0000
Model Year_1981.0: 0.0000
Model Year_1982.0: 0.0000
Model Year_1983.0: 0.0000
Model Year_1984.0: 0.0000
Model Year_1985.0: 0.0000
Model Year_1986.0: 0.0000
Model Year_1987.0: 0.0000
Model Year_1988.0: 0.0000
Model Year_1989.0: 0.0000
Model Year_1990.0: 0.0000
Model Year_1991.0: 0.0000
Model Year_1992.0: 0.0000
Model Year_1993.0: 0.0001
Model Year_1994.0: 0.0001
Model Year_1995.0: 0.0002
Model Year_1996.0: 0.0002
Model Year_1997.0: 0.0003
Model Year_1998.0: 0.0004
Model Year_1999.0: 0.0005
Model Year_2000.0: 0.0012
Model Year_2001.0: 0.0014
Model Year_2002.0: 0.0

In [25]:
import numpy as np
import pandas as pd


# Collapse the decision tree's per-category importances back onto the
# original input columns.
encoder = model.named_steps['preprocessor'].named_transformers_['cat']
onehot_feature_names = list(encoder.get_feature_names_out(categorical_features))
feature_names = onehot_feature_names
importances = model.named_steps['regressor'].feature_importances_

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
# One-hot names look like '<column>_<category>'; the text before the first
# underscore is taken as the source column (this relies on the column names
# themselves containing no underscore).
importance_df['Original Feature'] = importance_df['Feature'].str.split('_').str[0]

# Sum per source column and list the most important columns first.
importance_summary = (
    importance_df
    .groupby('Original Feature')['Importance']
    .sum()
    .reset_index()
    .sort_values(by='Importance', ascending=False)
)
print(importance_summary)


                                    Original Feature  Importance
5                                         Model Year    0.465689
3                                          Fuel Type    0.206709
8                                   Vehicle Category    0.086627
2                                    Fuel Technology    0.080081
0                                               Date    0.072865
6  Number of Vehicles Registered at the Same Address    0.060953
4                                         GVWR Class    0.022217
1                                Electric Mile Range    0.004859
7                                             Region    0.000000


In [26]:
# Baseline: always predict the mean of the training target.
# np.full_like(y_test, ...) would inherit the integer dtype of y_test and
# silently truncate the mean, so a float array is built explicitly.
baseline_pred = np.full(len(y_test), y_train.mean(), dtype=float)
# RMSE is computed as sqrt(MSE); the `squared=False` keyword is deprecated and
# no longer accepted by recent scikit-learn releases.
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_pred))
print(f'Baseline RMSE: {baseline_rmse}')

# compare with baseline
if rmse < baseline_rmse:
    print("our model is better than the baseline.")
else:
    print("the baseline is better than our model.")

Baseline RMSE: 20342.63507166346
our model is better than the baseline.


In [27]:
# Summary statistics of the target column, useful for putting the RMSE values in context.
vehicle_population = df['Vehicle Population']
mean_vehicles = vehicle_population.mean()
std_vehicles = vehicle_population.std()

print(f'the mean of Vehicle Population: {mean_vehicles:.2f}')
print(f'the std of Vehicle Population: {std_vehicles:.2f}')

the mean of Vehicle Population: 3463.93
the std of Vehicle Population: 18833.84


# Use scoring.xlsx to evaluate

In [28]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# 1. load the predictions stored in submission_file.xlsx
submission_df = pd.read_excel('submission_file.xlsx', sheet_name='Predictions')
predictions = submission_df['Predictions']

# 2. load the true target values from the scoring workbook
scoring_df = pd.read_excel('scoring.xlsx')
true_values = scoring_df['Vehicle Population']

# 3. RMSE, computed as sqrt(MSE); the `squared=False` keyword is deprecated
# and no longer accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(true_values, predictions))
print(f'RMSE: {rmse:.2f}')

RMSE: 8001.78


# Random forest vs. gradient boosting
  

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.base import clone

# 1. build the two candidate pipelines
# Each pipeline gets its own unfitted clone of the preprocessor. Passing the
# same ColumnTransformer instance to several pipelines would make every fit
# call refit one shared object.

# random forest
rf_model = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# gradient boosting
gb_model = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])

# 6. fit and evaluate the random forest on the existing train/test split.
# RMSE is computed as sqrt(MSE); the `squared=False` keyword is deprecated and
# no longer accepted by recent scikit-learn releases.
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_y_pred))
print(f'rf RMSE: {rf_rmse:.2f}')

# 7. fit and evaluate gradient boosting on the same split
gb_model.fit(X_train, y_train)
gb_y_pred = gb_model.predict(X_test)
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_y_pred))
print(f'gb RMSE: {gb_rmse:.2f}')

# 8. cast the scoring features to string, as was done for the training features
scoring_df[categorical_features] = scoring_df[categorical_features].astype(str)

# 9. keep whichever model has the lower hold-out RMSE
if rf_rmse < gb_rmse:
    best_model = rf_model
    print("choose random forest model for prediction.")
else:
    best_model = gb_model
    print("choose gradient boosting model for prediction.")

predictions = best_model.predict(scoring_df)

# 10. assemble the submission table
submission_data_ = {
    'Predictions': predictions  # predicted values
}
submission_df = pd.DataFrame(submission_data_)

# Actually write the file: the message below claims it was saved, and the
# final evaluation cell reads it back from disk.
submission_df.to_excel('submission_file_updated.xlsx', index=False, sheet_name='Predictions')

print("Predictions saved to submission_file_updated.xlsx")

rf RMSE: 6349.73
gb RMSE: 14501.63
choose random forest model for prediction.
Predictions saved to submission_file_updated.xlsx


In [32]:
from sklearn.metrics import r2_score

# Coefficient of determination of the random-forest predictions on the hold-out split.
r2 = r2_score(y_true=y_test, y_pred=rf_y_pred)
print(f'R^2 on test set: {r2:.2f}')

R^2 on test set: 0.90


# Feature Importance

In [34]:
import numpy as np
import pandas as pd


# Aggregate the random forest's one-hot importances per original input column.
rf_encoder = rf_model.named_steps['preprocessor'].named_transformers_['cat']
rf_regressor = rf_model.named_steps['regressor']

onehot_feature_names = list(rf_encoder.get_feature_names_out(categorical_features))
feature_names = onehot_feature_names
importances = rf_regressor.feature_importances_

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
# One-hot names look like '<column>_<category>'; keep the text before the
# first underscore (this relies on the column names containing no underscore).
importance_df['Original Feature'] = [
    encoded_name.split('_')[0] for encoded_name in importance_df['Feature']
]

# Total importance per source column, largest first.
per_column = importance_df.groupby('Original Feature')['Importance'].sum()
importance_summary = per_column.reset_index().sort_values(by='Importance', ascending=False)
print(importance_summary)


                                    Original Feature  Importance
5                                         Model Year    0.480187
3                                          Fuel Type    0.207300
8                                   Vehicle Category    0.087046
0                                               Date    0.064794
6  Number of Vehicles Registered at the Same Address    0.055943
1                                Electric Mile Range    0.041487
2                                    Fuel Technology    0.041202
4                                         GVWR Class    0.022041
7                                             Region    0.000000


# Evaluate the model using scoring.xlsx

In [31]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# 1. load the predictions stored in submission_file_updated.xlsx
submission_df = pd.read_excel('submission_file_updated.xlsx', sheet_name='Predictions')
predictions = submission_df['Predictions']  # predicted values

# 2. load the true target values from the scoring workbook
scoring_df = pd.read_excel('scoring.xlsx')
true_values = scoring_df['Vehicle Population']  # true values

# 3. RMSE, computed as sqrt(MSE); the `squared=False` keyword is deprecated
# and no longer accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(true_values, predictions))
print(f'RMSE: {rmse:.2f}')

RMSE: 7573.58
