In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
import numpy as np

In [2]:
# Load data/processed.csv; the file's first column is used as the row index.
df = pd.read_csv(
    "data/processed.csv",
    index_col=0,
)

In [3]:
# Feature matrix: every column of df except the 'Vehicle Population' target.
X = df.drop(columns=['Vehicle Population'])

In [4]:
# Regression target: the 'Vehicle Population' column of df.
y = df.loc[:, 'Vehicle Population']

In [5]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [6]:
# Train on log(1 + y) instead of the raw target; predictions are mapped back to
# the original scale with np.expm1 before they are scored.
# NOTE: this cell reassigns y_train from itself, so re-running it without first
# re-running the train/test split applies log1p twice.
y_train = np.log1p(y_train)

In [7]:
# Decision-tree regressor: squared-error split criterion, depth capped at 100,
# and a fixed seed so repeated fits give the same tree.
model = DecisionTreeRegressor(criterion='squared_error', max_depth=100, random_state=42)

In [8]:
# Fit the tree on the training features and the training target.
model.fit(X=X_train, y=y_train)

In [9]:
# Predict for the held-out features (output is on the scale the model was fit on).
y_pred = model.predict(X=X_test)

In [10]:
# Map the model's log1p-scale predictions back to the original target scale.
# Prediction and inversion happen in one expression so the cell is idempotent:
# re-running it can never apply expm1 to values that were already inverted.
y_pred = np.expm1(model.predict(X_test))

In [11]:
# Both metrics compare the held-out y_test with predictions made for X_test,
# so they are test-set scores and are labelled as such (not "Training").
test_rmse = root_mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
print(f"Test RMSE: {test_rmse}")
print(f"Test R²: {test_r2}")

Training RMSE: 4922.387454902594
Training R²: 0.9314351903646558


In [12]:
# Load the scoring workbook into a DataFrame.
scoring_df = pd.read_excel(
    io="../data/Chevron Challenge Materials/scoring.xlsx",
)

In [13]:
# Per-column flag: True where the column holds at least one missing value.
scoring_df.isnull().any()

Date                                                 False
Vehicle Category                                     False
GVWR Class                                           False
Fuel Type                                            False
Model Year                                            True
Fuel Technology                                      False
Electric Mile Range                                  False
Number of Vehicles Registered at the Same Address    False
Region                                               False
Vehicle Population                                   False
dtype: bool

In [14]:
# Show the scoring rows whose 'Model Year' is missing.
scoring_df.loc[scoring_df['Model Year'].isnull()]

Unnamed: 0,Date,Vehicle Category,GVWR Class,Fuel Type,Model Year,Fuel Technology,Electric Mile Range,Number of Vehicles Registered at the Same Address,Region,Vehicle Population
698,2024,MC,Not Applicable,Gasoline,,ICE,Not Applicable,≥4,Statewide,3740
812,2024,P,Not Applicable,Gasoline,,ICE,Not Applicable,≥4,Statewide,2871
1264,2024,T7,8,Gasoline,,ICE,Not Applicable,≥4,Statewide,1105
1328,2024,MC,Not Applicable,Gasoline,,ICE,Not Applicable,3,Statewide,969
1344,2024,P,Not Applicable,Gasoline,,ICE,Not Applicable,3,Statewide,922
...,...,...,...,...,...,...,...,...,...,...
7461,2024,T1,Unknown,Diesel,,ICE,Not Applicable,2,Statewide,1
7462,2024,T1,Unknown,Diesel,,ICE,Not Applicable,3,Statewide,1
7469,2024,P,Not Applicable,Diesel,,ICE,Not Applicable,1,Statewide,1
7537,2024,B,Not Applicable,Diesel,,ICE,Not Applicable,2,Statewide,1


In [15]:
# (number of rows, number of columns) of the raw scoring frame.
scoring_df.shape

(7546, 10)

In [16]:
# Preprocess the scoring dataset with the helpers from the `convert` module
from convert import clean_encode_columns, impute_df

In [17]:
# Run the project's clean_encode_columns helper over the scoring frame.
# NOTE: this overwrites scoring_df, so re-running the cell feeds already-encoded
# data back into the helper; reload the workbook first when re-running.
scoring_df = scoring_df.pipe(clean_encode_columns)

In [18]:
# Run the project's impute_df helper; the check that follows shows no column
# has missing values afterwards.
scoring_df = scoring_df.pipe(impute_df)

In [19]:
# Re-check for missing values after cleaning and imputation.
scoring_df.isnull().any()

Date                                                 False
Model Year                                           False
Electric Mile Range                                  False
Number of Vehicles Registered at the Same Address    False
Vehicle Population                                   False
GVWR_Not Applicable                                  False
GVWR_Unknown                                         False
Vehicle Category_BS                                  False
Vehicle Category_BT                                  False
Vehicle Category_MC                                  False
Vehicle Category_MH                                  False
Vehicle Category_P                                   False
Vehicle Category_T1                                  False
Vehicle Category_T2                                  False
Vehicle Category_T3                                  False
Vehicle Category_T4                                  False
Vehicle Category_T5                                  Fal

In [22]:
# Scoring features: every column except the 'Vehicle Population' target.
scoring_X = scoring_df.drop(columns=['Vehicle Population'])

In [23]:
# Scoring target: the 'Vehicle Population' column on its original scale.
scoring_y = scoring_df.loc[:, 'Vehicle Population']

In [24]:
# Training feature columns that the scoring frame does not have (kept as a set
# so it can still be inspected after this cell).
missing_columns = set(X.columns) - set(scoring_X.columns)

# Align scoring_X with the training schema in one step instead of inserting
# columns one at a time in a loop:
#   * columns absent from the scoring data are created and filled with False
#     (use 0 instead if a missing column is numerical rather than boolean),
#   * columns the model was never trained on are dropped,
#   * column order follows X.columns, as model.predict requires.
scoring_X = scoring_X.reindex(columns=X.columns, fill_value=False)

In [28]:
# Predict for the aligned scoring features (output is on the scale the model was fit on).
scoring_predictions = model.predict(X=scoring_X)

In [29]:
# Map the model's log1p-scale predictions back to the original target scale.
# Prediction and inversion happen in one expression so the cell is idempotent:
# re-running it can never apply expm1 to values that were already inverted.
scoring_predictions = np.expm1(model.predict(scoring_X))

In [31]:
# Score the back-transformed predictions against the scoring target.
scoring_rmse = root_mean_squared_error(scoring_y, scoring_predictions)
scoring_r2 = r2_score(scoring_y, scoring_predictions)
print(f"Scoring RMSE: {scoring_rmse}")
print(f"Scoring R²: {scoring_r2}")

Scoring RMSE: 10414.962459745335
Scoring R²: 0.7137516728571114
