## Initialization

In [2]:
import numpy as np
import pandas as pd
import torch

In [3]:
data = pd.read_csv('training.csv')
data.describe()

Unnamed: 0,Date,Model Year,Vehicle Population
count,41053.0,40450.0,41053.0
mean,2020.985312,2002.075871,3463.929506
std,1.39847,13.462658,18833.839302
min,2019.0,1975.0,1.0
25%,2020.0,1991.0,5.0
50%,2021.0,2003.0,39.0
75%,2022.0,2014.0,390.0
max,2023.0,2026.0,395883.0


In [4]:
def clean(data):
    data = data.dropna(subset=['Number of Vehicles Registered at the Same Address'], inplace=False)

    data["time_idx"] = data["Date"] - 2018
    data["Model Year"] = data["Model Year"].astype('Int64')
    data["Model Year"] = data["Model Year"].fillna(value=1900, inplace=False)
    data["Model Year"] = data["Model Year"].astype(str)
    # data = data.fillna(value={'Model Year': "Unknown"}, inplace=False)
    data["Vehicle Population"] = data["Vehicle Population"].astype(float)
    
    # data["new"] = data["Date"] - data["Model Year"]
    # data = data.fillna(value={'new': 45}, inplace=False)

    # data["Fuel"] = data["Fuel Type"] + "_" + data["Fuel Technology"]
    # data = data.drop('Fuel Type', axis=1)
    # data = data.drop('Fuel Technology', axis=1)

    # data["Vehicle Category"] = data["Vehicle Category"] + ("_" + data["GVWR Class"]).where(
    #     data["GVWR Class"] != "Not Applicable", 
    #     other=""
    # )
    # data = data.drop('GVWR Class', axis=1)

    data = data.drop('Region', axis=1)
    data = data.drop('Date', axis=1)

    return data

In [5]:
def clean_combined(data):
    data = data.dropna(subset=['Number of Vehicles Registered at the Same Address'], inplace=False)

    data["new"] = data["Date"] - data["Model Year"]
    data["time_idx"] = data["Date"] - 2018
    data["Model Year"] = data["Model Year"].astype('Int64')
    data["Model Year"] = data["Model Year"].fillna(value=1900, inplace=False)
    data["Model Year"] = data["Model Year"].astype(str)
    # data = data.fillna(value={'Model Year': "Unknown"}, inplace=False)
    data["Vehicle Population"] = data["Vehicle Population"].astype(float)

    data["Fuel"] = data["Fuel Type"] + "_" + data["Fuel Technology"]
    data = data.drop('Fuel Type', axis=1)
    data = data.drop('Fuel Technology', axis=1)

    data["Vehicle Category"] = data["Vehicle Category"] + ("_" + data["GVWR Class"]).where(
        data["GVWR Class"] != "Not Applicable", 
        other=""
    )
    data = data.drop('GVWR Class', axis=1)

    data = data.drop('Region', axis=1)
    data = data.drop('Date', axis=1)

    return data

In [6]:
train_data = pd.read_csv('training.csv')
train_data = clean_combined(train_data)
train_data.head(20)

Unnamed: 0,Vehicle Category,Model Year,Electric Mile Range,Number of Vehicles Registered at the Same Address,Vehicle Population,new,time_idx,Fuel
0,P,2020,Not Applicable,≥4,395883.0,-1.0,1,Gasoline_ICE
1,P,2020,Not Applicable,1,370954.0,0.0,2,Gasoline_ICE
2,P,2020,Not Applicable,1,349406.0,1.0,3,Gasoline_ICE
3,P,2019,Not Applicable,≥4,348475.0,0.0,1,Gasoline_ICE
4,P,2018,Not Applicable,≥4,333296.0,1.0,1,Gasoline_ICE
5,P,2021,Not Applicable,1,332578.0,0.0,3,Gasoline_ICE
6,P,2021,Not Applicable,1,329591.0,1.0,4,Gasoline_ICE
7,P,2021,Not Applicable,≥4,325602.0,0.0,3,Gasoline_ICE
8,P,2021,Not Applicable,≥4,324732.0,-1.0,2,Gasoline_ICE
9,P,2020,Not Applicable,≥4,322706.0,0.0,2,Gasoline_ICE


In [7]:
valid_data = pd.read_csv('scoring.csv')
valid_data = clean_combined(valid_data)
valid_data.describe()

Unnamed: 0,Vehicle Population,new,time_idx
count,7546.0,7440.0,7546.0
mean,3783.340445,19.105645,6.0
std,19467.716296,13.385406,0.0
min,1.0,-2.0,6.0
25%,4.0,7.0,6.0
50%,38.0,18.0,6.0
75%,384.0,30.0,6.0
max,316065.0,44.0,6.0


## Training

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import root_mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [9]:
y = train_data["Vehicle Population"]
X = train_data.drop(columns=["Vehicle Population", "time_idx"])


X = X.replace("≥4", "4")
X = X.replace(">150 miles", "150 miles")

encoder = OneHotEncoder(drop="first", sparse_output=False)
X_encoded = encoder.fit_transform(X)
X_encoded = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(X.columns))
X_encoded.index = X.index
X_encoded["time_idx"] = train_data["time_idx"].astype(int)

X_train, X_valid, y_train, y_valid = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


# test set
y_test = valid_data["Vehicle Population"]
X_test = valid_data.drop(columns=["Vehicle Population", "time_idx"])

X_test = X_test.replace("≥4", "4")
X_test = X_test.replace(">150 miles", "150 miles")

X_test_encoded = encoder.transform(X_test)

X_test_encoded = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(X_test.columns))
X_test_encoded.index = X_test.index
X_test_encoded["time_idx"] = valid_data["time_idx"].astype(int)




In [None]:
xgb_reg = xgb.XGBRegressor(
    objective="reg:squarederror", 
    n_estimators=300, 
    learning_rate=0.1, 
    random_state=42
)


xgb_reg.fit(X_train, y_train)

y_pred = xgb_reg.predict(X_valid)

# 计算 MSE 误差
rmse = root_mean_squared_error(y_valid, y_pred)
print(f"Validation RMSE: {rmse}")

y_pred = xgb_reg.predict(X_test_encoded)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"Testing RMSE: {rmse}")

Validation RMSE: 4496.156242021308
Testing RMS: 5028.326576456106


### Hyperparameter Tuning

In [11]:
def estimators_tuning(n):

    xgb_reg = xgb.XGBRegressor(
            objective="reg:squarederror", 
            n_estimators=n, 
            learning_rate=0.1, 
            random_state=42
        )


    xgb_reg.fit(X_train, y_train)

    y_pred = xgb_reg.predict(X_valid)

    rmse = root_mean_squared_error(y_valid, y_pred)
    print(f"Validation RMSE: {rmse}")

    y_pred = xgb_reg.predict(X_test_encoded)
    rmse = root_mean_squared_error(y_test, y_pred)
    print(f"Testing RMSE: {rmse}")

In [58]:
for i in range(200, 4000, 200):
    print(i)
    estimators_tuning(i)

200
Validation RMSE: 5013.685607489045
Testing RMSE: 5601.663699633834
400
Validation RMSE: 4271.631171730337
Testing RMSE: 4750.975298410478
600
Validation RMSE: 4018.5343649535625
Testing RMSE: 4472.090045313119
800
Validation RMSE: 3839.7694774142883
Testing RMSE: 4318.509505199218
1000
Validation RMSE: 3762.2931555359673
Testing RMSE: 4268.845378692976
1200
Validation RMSE: 3661.7607453974383
Testing RMSE: 4235.543989424033
1400
Validation RMSE: 3631.3943880203246
Testing RMSE: 4205.731408952108
1600
Validation RMSE: 3577.6316989075576
Testing RMSE: 4173.042166316038
1800
Validation RMSE: 3552.0875643581207
Testing RMSE: 4145.885390738533
2000
Validation RMSE: 3548.1380415524864
Testing RMSE: 4137.4564930816805
2200
Validation RMSE: 3528.802659925992
Testing RMSE: 4128.419357340212
2400
Validation RMSE: 3516.5021032937248
Testing RMSE: 4134.571653728798
2600
Validation RMSE: 3527.743736553915
Testing RMSE: 4154.40012485066
2800
Validation RMSE: 3531.2463441606847
Testing RMSE: 4156

In [12]:
def lr_tuning(lr):

    xgb_reg = xgb.XGBRegressor(
            objective="reg:squarederror", 
            n_estimators=2200, 
            learning_rate=lr, 
            random_state=42
        )


    xgb_reg.fit(X_train, y_train)

    y_pred = xgb_reg.predict(X_valid)

    rmse = root_mean_squared_error(y_valid, y_pred)
    print(f"Validation RMSE: {rmse}")

    y_pred = xgb_reg.predict(X_test_encoded)
    rmse = root_mean_squared_error(y_test, y_pred)
    print(f"Testing RMSE: {rmse}")

In [19]:
for i in range(1, 31, 2):
    print(i/100.0)
    lr_tuning(i/100.0)

0.01
Validation RMSE: 5066.121309881912
Testing RMSE: 5705.348021413523
0.03
Validation RMSE: 3992.1009676789704
Testing RMSE: 4341.9043255711495
0.05
Validation RMSE: 3687.472485868722
Testing RMSE: 4148.424368002978
0.07
Validation RMSE: 3590.7544169859307
Testing RMSE: 4088.592153667384
0.09
Validation RMSE: 3640.7785354875805
Testing RMSE: 4149.371305869101
0.11
Validation RMSE: 3604.428651847425
Testing RMSE: 4150.715772198071
0.13
Validation RMSE: 3355.0123580553986
Testing RMSE: 4044.5839615708314
0.15
Validation RMSE: 3477.6805644385295
Testing RMSE: 4190.361719259014
0.17
Validation RMSE: 3584.7130018818325
Testing RMSE: 4074.2415129605433
0.19
Validation RMSE: 3641.3282408468235
Testing RMSE: 4074.220145271252
0.21
Validation RMSE: 3391.9095291183044
Testing RMSE: 3979.500896151266
0.23
Validation RMSE: 3508.18837455451
Testing RMSE: 4206.67401483019
0.25
Validation RMSE: 3520.4964105567624
Testing RMSE: 4143.740277148536
0.27
Validation RMSE: 3699.905051823357
Testing RMSE: 

In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "max_depth": [i for i in range(3, 16, 2)],
    "min_child_weight": [i for i in range(1, 10, 2)]
}

grid_search = GridSearchCV(
    xgb.XGBRegressor(
        n_estimators=2400, 
        learning_rate=0.21
    ), param_grid, scoring="neg_mean_squared_error", cv=5)
grid_search.fit(X_encoded, y)

print(f"Best Params: {grid_search.best_params_}")


Best Params: {'max_depth': 15, 'min_child_weight': 3}


In [45]:
def depth_child_tuning(max_depth, min_child_weight):

    xgb_reg = xgb.XGBRegressor(
        objective="reg:squarederror", 
        n_estimators=2400, 
        learning_rate=0.21, 
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        random_state=42
    )


    xgb_reg.fit(X_train, y_train)

    y_pred = xgb_reg.predict(X_valid)

    rmse = root_mean_squared_error(y_valid, y_pred)
    print(f"Validation RMSE: {rmse}")

    y_pred = xgb_reg.predict(X_test_encoded)
    rmse = root_mean_squared_error(y_test, y_pred)
    print(f"Testing RMSE: {rmse}")

In [47]:
for i in [3, 5, 7]:
    for j in [1, 3, 5]:
        print(i, j)
        depth_child_tuning(i, j)


3 1
Validation RMSE: 5411.467147737633
Testing RMS: 5945.803371655715
3 3
Validation RMSE: 5418.925219652216
Testing RMS: 5890.595035493807
3 5
Validation RMSE: 5587.377461067627
Testing RMS: 6098.194066590922
5 1
Validation RMSE: 3287.446798683518
Testing RMS: 3965.2961224384344
5 3
Validation RMSE: 3355.7299662977066
Testing RMS: 4054.187099633212
5 5
Validation RMSE: 3620.6857083676555
Testing RMS: 4185.7430818967205
7 1
Validation RMSE: 3446.740297026966
Testing RMS: 4163.665339112095
7 3
Validation RMSE: 3247.6984969441046
Testing RMS: 4170.86855860097
7 5
Validation RMSE: 3344.1158903263677
Testing RMS: 4141.903322108585


In [49]:
def subsample_colsample_tuning(subsample, colsample):

    xgb_reg = xgb.XGBRegressor(
        objective="reg:squarederror", 
        n_estimators=2400, 
        learning_rate=0.21, 
        max_depth=5,
        min_child_weight=1,
        subsample=subsample,
        colsample_bytree=colsample,
        random_state=42
    )


    xgb_reg.fit(X_train, y_train)

    y_pred = xgb_reg.predict(X_valid)

    rmse = root_mean_squared_error(y_valid, y_pred)
    print(f"Validation RMSE: {rmse}")

    y_pred = xgb_reg.predict(X_test_encoded)
    rmse = root_mean_squared_error(y_test, y_pred)
    print(f"Testing RMSE: {rmse}")

In [50]:
for i in [0.6, 0.7, 0.8, 0.9, 1.0]:
    for j in [0.6, 0.7, 0.8, 0.9, 1.0]:
        print(i, j)
        subsample_colsample_tuning(i, j)


0.6 0.6
Validation RMSE: 3665.005958422882
Testing RMSE: 4683.719861769161
0.6 0.7
Validation RMSE: 3615.1950641020608
Testing RMSE: 4397.590034442283
0.6 0.8
Validation RMSE: 3671.2030859313923
Testing RMSE: 4265.95396441618
0.6 0.9
Validation RMSE: 3636.11570809115
Testing RMSE: 4281.827626531105
0.6 1.0
Validation RMSE: 3721.8109773920414
Testing RMSE: 4500.499752023027
0.7 0.6
Validation RMSE: 3641.588522263835
Testing RMSE: 4338.021147520145
0.7 0.7
Validation RMSE: 3560.356220432679
Testing RMSE: 4317.333517210113
0.7 0.8
Validation RMSE: 3621.6238720254255
Testing RMSE: 4020.5884265219543
0.7 0.9
Validation RMSE: 3578.674148474
Testing RMSE: 4130.931453616677
0.7 1.0
Validation RMSE: 3499.861191465097
Testing RMSE: 4388.60118715627
0.8 0.6
Validation RMSE: 3627.7534195231647
Testing RMSE: 4329.298083050738
0.8 0.7
Validation RMSE: 3498.7875263568917
Testing RMSE: 4237.58354201145
0.8 0.8
Validation RMSE: 3592.77700655565
Testing RMSE: 4191.530322427125
0.8 0.9
Validation RMSE: 3

In [51]:
def regularization_tuning(alpha, lambda_):

    xgb_reg = xgb.XGBRegressor(
        objective="reg:squarederror", 
        n_estimators=2400, 
        learning_rate=0.21, 
        max_depth=5,
        min_child_weight=1,
        subsample=1.0,
        colsample_bytree=0.9,
        reg_alpha=alpha,
        reg_lambda=lambda_,
        random_state=42
    )


    xgb_reg.fit(X_train, y_train)

    y_pred = xgb_reg.predict(X_valid)

    rmse = root_mean_squared_error(y_valid, y_pred)
    print(f"Validation RMSE: {rmse}")

    y_pred = xgb_reg.predict(X_test_encoded)
    rmse = root_mean_squared_error(y_test, y_pred)
    print(f"Testing RMSE: {rmse}")

In [67]:
for alpha in [0.0, 0.02, 0.3]:
    for lambda_ in [0.0, 0.02, 0.3]:
        print(alpha, lambda_)
        regularization_tuning(alpha, lambda_)

0.0 0.0
Validation RMSE: 3392.052283518731
Testing RMSE: 4103.243471555578
0.0 0.02
Validation RMSE: 3332.672567222682
Testing RMSE: 3961.556035946388
0.0 0.3
Validation RMSE: 3252.398854644209
Testing RMSE: 3895.527300480161
0.02 0.0
Validation RMSE: 3409.0894307370663
Testing RMSE: 4111.994258973225
0.02 0.02
Validation RMSE: 3320.6244029454747
Testing RMSE: 3967.758490253882
0.02 0.3
Validation RMSE: 3252.399091395016
Testing RMSE: 3895.5272627432614
0.3 0.0
Validation RMSE: 3416.8132251002376
Testing RMSE: 4091.0462064877256
0.3 0.02
Validation RMSE: 3391.743849148163
Testing RMSE: 3959.6471242700313
0.3 0.3
Validation RMSE: 3212.240004416559
Testing RMSE: 3960.31460932489


In [76]:

xgb_reg = xgb.XGBRegressor(
    objective="reg:squarederror", 
    n_estimators=2400, 
    learning_rate=0.21, 
    max_depth=5,
    min_child_weight=1,
    subsample=1.0,
    colsample_bytree=0.9,
    reg_alpha=0.02,
    reg_lambda=0.3,
    random_state=42
)


xgb_reg.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],)

y_pred = xgb_reg.predict(X_valid)

rmse = root_mean_squared_error(y_valid, y_pred)
print(f"Validation RMSE: {rmse}")

y_pred = xgb_reg.predict(X_test_encoded)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"Testing RMSE: {rmse}")

[0]	validation_0-rmse:16808.10507
[1]	validation_0-rmse:15401.57741
[2]	validation_0-rmse:14329.37969
[3]	validation_0-rmse:13613.62240
[4]	validation_0-rmse:13045.81383
[5]	validation_0-rmse:12601.73723
[6]	validation_0-rmse:12341.53805
[7]	validation_0-rmse:12248.99879
[8]	validation_0-rmse:11799.97217
[9]	validation_0-rmse:11451.71165
[10]	validation_0-rmse:11366.22346
[11]	validation_0-rmse:11237.29892
[12]	validation_0-rmse:11175.51460
[13]	validation_0-rmse:11032.48191
[14]	validation_0-rmse:10856.43737
[15]	validation_0-rmse:10733.92646
[16]	validation_0-rmse:10432.81900
[17]	validation_0-rmse:10354.98258
[18]	validation_0-rmse:10288.49981
[19]	validation_0-rmse:10137.30033
[20]	validation_0-rmse:10082.42965
[21]	validation_0-rmse:9878.44739
[22]	validation_0-rmse:9770.27273
[23]	validation_0-rmse:9712.15963
[24]	validation_0-rmse:9590.51925
[25]	validation_0-rmse:9564.32795
[26]	validation_0-rmse:9515.42174
[27]	validation_0-rmse:9497.98487
[28]	validation_0-rmse:9349.87371
[29

In [93]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print(f'R^2 on test set: {r2:.2f}')

R^2 on test set: 0.96


In [92]:
df = pd.DataFrame(y_pred, columns=["Vehicle Population"])
df["Vehicle Population"] = df["Vehicle Population"].astype(int)
df.loc[df["Vehicle Population"] < 0, "Vehicle Population"] = 0

df.to_csv("submission.csv", index=False)