In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# step 1: import dataset
df = pd.read_excel("../data/Chevron Challenge Materials/scoring.xlsx",index_col=0)

In [3]:
df.head()

Unnamed: 0_level_0,Vehicle Category,GVWR Class,Fuel Type,Model Year,Fuel Technology,Electric Mile Range,Number of Vehicles Registered at the Same Address,Region,Vehicle Population
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024,P,Not Applicable,Gasoline,2020.0,ICE,Not Applicable,≥4,Statewide,316065
2024,P,Not Applicable,Gasoline,2021.0,ICE,Not Applicable,≥4,Statewide,315986
2024,P,Not Applicable,Gasoline,2022.0,ICE,Not Applicable,≥4,Statewide,306487
2024,P,Not Applicable,Gasoline,2024.0,ICE,Not Applicable,≥4,Statewide,284754
2024,P,Not Applicable,Gasoline,2023.0,ICE,Not Applicable,≥4,Statewide,284153


In [4]:
# perform cleaning to scoring dataset to produce predictions
df.isna().any()

Vehicle Category                                     False
GVWR Class                                           False
Fuel Type                                            False
Model Year                                            True
Fuel Technology                                      False
Electric Mile Range                                  False
Number of Vehicles Registered at the Same Address    False
Region                                               False
Vehicle Population                                   False
dtype: bool

In [5]:
df.dropna(inplace=True)

In [6]:
df['Model Year'] = df['Model Year'].astype('Int64')

In [7]:
df = df.drop(['Region'],axis=1)

In [8]:
gvwr_map = {
    '1': 'Light-Duty',
    '2': 'Light-Duty',
    '3': 'Medium-Duty',
    '4': 'Medium-Duty',
    '5': 'Medium-Duty',
    '6': 'Medium-Duty',
    '7': 'Heavy-Duty',
    '8': 'Heavy-Duty',
    'Not Applicable': 'Not Applicable',
    'Unknown': 'Unknown'
}

df['GVWR Group'] = df['GVWR Class'].map(gvwr_map)
df = pd.get_dummies(df, columns=['GVWR Group'], prefix='GVWR')

In [9]:
df = df.drop(['GVWR Class'],axis=1)

In [10]:
# Define mappings for ordinal encoding
mile_range_mapping = {
    'Not Applicable': 0,
    '0 to 50 miles': 1,
    '51 to 100 miles': 2,
    '101 to 150 miles': 3,
    '>150 miles': 4,
    'Unknown': -1
}

vehicle_count_mapping = {
    '1': 1,
    '2': 2,
    '3': 3,
    '≥4': 4,
    'Unknown': -1
}

# Apply ordinal encoding
df['Electric Mile Range'] = df['Electric Mile Range'].map(mile_range_mapping)
df['Number of Vehicles Registered at the Same Address'] = df['Number of Vehicles Registered at the Same Address'].map(vehicle_count_mapping)

In [11]:
df = pd.get_dummies(df, columns=['Vehicle Category', 'Fuel Type', 'Fuel Technology'], drop_first=True)

In [12]:
df.columns

Index(['Model Year', 'Electric Mile Range',
       'Number of Vehicles Registered at the Same Address',
       'Vehicle Population', 'GVWR_Not Applicable', 'GVWR_Unknown',
       'Vehicle Category_BS', 'Vehicle Category_BT', 'Vehicle Category_MC',
       'Vehicle Category_MH', 'Vehicle Category_P', 'Vehicle Category_T1',
       'Vehicle Category_T2', 'Vehicle Category_T3', 'Vehicle Category_T4',
       'Vehicle Category_T5', 'Vehicle Category_T6', 'Vehicle Category_T7',
       'Fuel Type_Electric', 'Fuel Type_Gasoline', 'Fuel Type_Hydrogen',
       'Fuel Type_Natural Gas', 'Fuel Technology_FCEV', 'Fuel Technology_ICE',
       'Fuel Technology_PHEV'],
      dtype='object')

In [13]:
df.to_csv("../data/encoded-scoring.csv")