In [1]:
!pip install -r ../../requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np

import random
from itertools import product

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error

from tqdm import tqdm
from joblib import Parallel, delayed
from tqdm_joblib import tqdm_joblib

import os
import shutil
import tempfile

import optuna
from openpyxl import load_workbook

from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

# Seed the stdlib and NumPy global RNGs so the random.sample / random.randint calls
# used later in the notebook start from a fixed state.
random.seed(0)
np.random.seed(0)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from tqdm.autonotebook import tqdm


### download data

In [3]:
# Load the data dictionary workbook
data_dict_path = '../../data/data_dictionary.xlsx'
data_dictionary = pd.read_excel(data_dict_path)

# Load the scoring file -- the input data the final model is run on
scoring_path = '../../data/scoring.xlsx'
scoring = pd.read_excel(scoring_path)

# Load the submission format -- the layout the model output should follow
submission_format_path = '../../data/submission_format.csv'
submission_format = pd.read_csv(submission_format_path)

# Load the training data; rows containing any missing value are dropped
training_path = '../../data/training.xlsx'
training = pd.read_excel(training_path).dropna()

# Used-car CPI series from FRED: https://fred.stlouisfed.org/series/CUSR0000SETA02
cpi_used_cars_path = '../../data/CPI_UsedCars_US.xlsx'
cpi_used_cars = pd.read_excel(cpi_used_cars_path, sheet_name='Monthly')
# Rename the two columns of the 'Monthly' sheet, then derive the calendar year of each observation
cpi_used_cars.columns = ['Date', 'CPI']
cpi_used_cars['Year'] = cpi_used_cars['Date'].dt.year
# Average CPI per year, keyed by 'Model Year' so it can be merged onto the vehicle data
average_cpi_by_year = cpi_used_cars.groupby('Year')['CPI'].mean().reset_index().rename(columns={'Year': 'Model Year'})
# since average cpi by year is missing 2025 and 2026, forecast them with a polynomial regression
def forecast_cpi_polynomial(average_cpi_by_year, forecast_periods=5, degree=2):
    """Extend a yearly CPI table with polynomial-regression forecasts.

    Parameters
    ----------
    average_cpi_by_year : pd.DataFrame
        Table with a 'Model Year' column and a 'CPI' column.
    forecast_periods : int, default 5
        Number of years to forecast after the last year in the table.
        Zero or a negative value means "no forecast".
    degree : int, default 2
        Degree of the polynomial fitted to CPI as a function of 'Model Year'.

    Returns
    -------
    pd.DataFrame
        The input rows sorted by 'Model Year', followed by one forecast row per
        future year, with a fresh 0..n-1 index. The input frame is not modified.
    """
    # Ensure data is sorted by 'Model Year'
    average_cpi_by_year = average_cpi_by_year.sort_values(by='Model Year')

    # Nothing to forecast: return the sorted history instead of asking the
    # regressor to predict on an empty array.
    if forecast_periods <= 0:
        return average_cpi_by_year.reset_index(drop=True)

    # Extract features and target variable
    X = average_cpi_by_year[['Model Year']]
    y = average_cpi_by_year['CPI']

    # Generate polynomial features
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly.fit_transform(X)

    # Fit the polynomial regression model
    model = LinearRegression()
    model.fit(X_poly, y)

    # Generate future 'Model Year' values
    last_year = X['Model Year'].max()
    future_years = np.arange(last_year + 1, last_year + 1 + forecast_periods)

    # Keep the column name so the transformer receives the same feature name it was fitted with
    future_features = pd.DataFrame({'Model Year': future_years})

    # Predict future CPI values
    future_cpi = model.predict(poly.transform(future_features))

    # Create DataFrame for future predictions
    future_df = pd.DataFrame({
        'Model Year': future_years,
        'CPI': future_cpi
    })

    # Combine original and forecasted data
    return pd.concat([average_cpi_by_year, future_df], ignore_index=True)

# Append 2 forecast years (forecast_periods=2) using a degree-4 polynomial fit
average_cpi_by_year = forecast_cpi_polynomial(average_cpi_by_year, 2, 4)




In [4]:
def clean_input_data(data, cpi_by_year=None):
    """Attach yearly CPI to a vehicle table and normalise 'Model Year'.

    Parameters
    ----------
    data : pd.DataFrame
        Table containing a 'Model Year' column.
    cpi_by_year : pd.DataFrame, optional
        Table with 'Model Year' and 'CPI' columns to merge in. When omitted,
        the notebook-level ``average_cpi_by_year`` is used (the original behaviour).

    Returns
    -------
    pd.DataFrame
        A new frame with a 'CPI' column added, 'Model Year' converted to
        strings (missing years become "Missing"), and missing CPI values
        replaced by the mean CPI of this frame.
    """
    if cpi_by_year is None:
        # Backward-compatible default: fall back to the notebook-level CPI table
        cpi_by_year = average_cpi_by_year

    # add the CPI of Used Cars by City Average
    clean_data = data.merge(cpi_by_year, on='Model Year', how='left')
    clean_data['Model Year'] = clean_data['Model Year'].astype(str)

    # handle nulls: astype(str) turns missing years into the text "nan";
    # convert that back to a real null, then label it explicitly
    clean_data = clean_data.replace({"nan": np.nan})
    clean_data['Model Year'] = clean_data['Model Year'].fillna("Missing")

    # fill missing CPI's with average CPI
    clean_data['CPI'] = clean_data['CPI'].fillna(clean_data['CPI'].mean())

    return clean_data

### model functions

In [5]:
def fit_linear_regression(X, y):
    """Fit a LinearRegression on features X and target y and return the fitted estimator."""
    # fit() hands back the estimator itself, so construction and fitting chain into one expression
    return LinearRegression().fit(X, y)

class ConstantModel:
    """Fallback "model" whose prediction is always the same stored value."""

    def __init__(self, value):
        # The constant that predict() hands back
        self.value = value

    def predict(self, X):
        """Return the constant once per element of X, or bare when X has no length."""
        if hasattr(X, '__len__'):
            # Sized input: one copy of the constant per element
            return [self.value for _ in range(len(X))]
        # Inputs without a length (e.g. a scalar) get the bare value
        return self.value


def fit_catboost(X, y):
    """Fit a CatBoostRegressor on X/y, falling back to a constant model on failure.

    Every column of X except 'CPI' is passed to CatBoost as a categorical feature.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame.
    y : pd.Series or sequence
        Target values.

    Returns
    -------
    CatBoostRegressor or ConstantModel
        The fitted regressor, or a ConstantModel predicting the first target
        value when construction/fitting raises an ordinary exception.
    """
    try:
        # Every feature except the 'CPI' column is treated as categorical
        categorical_cols = [col for col in X.columns if col != 'CPI']

        model = CatBoostRegressor(
            iterations=100,
            learning_rate=0.05,
            depth=6,
            cat_features=categorical_cols,
            random_seed=42,
            allow_writing_files=False,  # Prevents CatBoost from creating the catboost_info directory
            verbose=0
        )

        model.fit(X, y)

        # Clear CatBoost temporary files
        # NOTE(review): this walks the whole system temp directory on every fit and removes
        # any directory whose name contains "catboost" -- confirm that scope is intended.
        catboost_tmp_dir = tempfile.gettempdir()  # Get system temp dir
        for root, dirs, files in os.walk(catboost_tmp_dir):
            for dir_name in dirs:
                if "catboost" in dir_name:
                    shutil.rmtree(os.path.join(root, dir_name), ignore_errors=True)

        return model
    except Exception:
        # Catch ordinary failures only: a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit and silently hand back a constant model.
        singular_value = y.iloc[0] if hasattr(y, 'iloc') else y[0]
        return ConstantModel(singular_value)
    


### download train and test data

In [6]:
# Attach CPI and normalise 'Model Year' for both data sets
clean_training = clean_input_data(training)
clean_scoring = clean_input_data(scoring)

# Stack training rows first, then scoring rows, and renumber the index 0..n-1
car_data = pd.concat([clean_training, clean_scoring]).reset_index(drop=True)
car_data['Model Year'] = car_data['Model Year'].astype(str)

# One-hot encode every object/category column (the representation used when `encode` is True)
categorical_cols = car_data.select_dtypes(include=['object', 'category']).columns.tolist()
encoded_car_data =  pd.get_dummies(car_data, columns=categorical_cols)

# Because of the concat order above, the first len(clean_training) labels are training rows
# and the remaining labels are scoring rows
train_indices = range(0, len(clean_training))
test_indices = range(len(clean_training), len(car_data))

# Independent copies of the raw train/test partitions
train = car_data.loc[train_indices].copy()
test = car_data.loc[test_indices].copy()

# Independent copies of the one-hot encoded train/test partitions
train_encoded = encoded_car_data.loc[train_indices].copy()
test_encoded = encoded_car_data.loc[test_indices].copy()

### model testing functions

In [7]:
def get_models_by_basis(car_data, encoded_car_data, train_indices, basis_columns, prediction_columns, target_col, model_func, encode):
    """Train one model per combination of basis-column values.

    Parameters
    ----------
    car_data, encoded_car_data : pd.DataFrame
        Raw and one-hot encoded frames sharing the same index.
    train_indices : iterable of index labels
        Rows of the frames that may be used for training.
    basis_columns : list of str
        Columns whose value combinations each get their own model.
    prediction_columns : list of str
        Feature columns handed to ``model_func``.
    target_col : str
        Target column name.
    model_func : callable
        ``model_func(X, y)`` returning a fitted model.
    encode : bool
        If True, features and target are taken from the encoded frame.

    Returns
    -------
    dict
        Maps each basis-value tuple to ``(model, safe)``. ``safe`` is True when
        the model was trained on that combination's own rows (more than 10 of
        them); otherwise the model is a fallback trained on all training rows.
    """
    train_data = car_data.loc[train_indices]
    encoded_train = encoded_car_data.loc[train_indices]

    # The frame the models are trained on does not depend on the basis key, so pick it once
    model_data = (encoded_train if encode else train_data).copy()

    # Define the columns used to create different models
    basis_columns = list(basis_columns)
    basis_values = [train_data[col].unique().tolist() for col in basis_columns]  # potential values for each basis

    # Every combination of basis values, e.g. ('2020', 'Electric') for (Model Year, Fuel Type)
    basis_combinations = [tuple(values) for values in product(*basis_values)]

    # The fallback model is trained on identical data for every sparse combination,
    # so it is fitted lazily once and shared instead of being refitted per key.
    fallback_model = None

    # Dictionary to store trained models, using the basis-value tuple as the key
    trained_models = {}
    for basis_key in basis_combinations:
        basis_dict = dict(zip(basis_columns, basis_key))

        # Rows of the training data that match this combination of basis values
        matches = train_data[basis_columns].eq(pd.Series(basis_dict)).all(axis=1)
        basis_indices = train_data.index[matches]

        X = model_data.loc[basis_indices, prediction_columns]
        y = model_data.loc[basis_indices, target_col]

        if len(X) > 10:  # more than 10 rows: enough data for a dedicated model
            model = model_func(X, y)
            safe = True
        else:
            if fallback_model is None:
                fallback_model = model_func(model_data[prediction_columns], model_data[target_col])
            model = fallback_model
            safe = False

        # Store trained model using the basis-value tuple as the key
        trained_models[basis_key] = (model, safe)

    return trained_models

In [8]:
def get_models_by_basis_with_hyperparams(car_data, encoded_car_data, basis_columns, prediction_columns, target_col, model_func, encode, params_dict, train_indices=None):
    """Train one CatBoost model per basis combination with pre-selected hyperparameters.

    Parameters
    ----------
    car_data, encoded_car_data : pd.DataFrame
        Raw and one-hot encoded frames sharing the same index.
    basis_columns : list of str
        Columns whose value combinations each get their own model.
    prediction_columns : list of str
        Feature columns used for training.
    target_col : str
        Target column name.
    model_func : callable
        Unused here (every model is a CatBoostRegressor); kept for signature
        parity with the other model builders.
    encode : bool
        If True, features and target are taken from the encoded frame.
    params_dict : dict
        Maps every basis-value tuple to a dict of CatBoostRegressor keyword arguments.
    train_indices : iterable of index labels, optional
        Rows usable for training. When omitted, the notebook-level
        ``train_indices`` global is used (the original behaviour).

    Returns
    -------
    dict
        Maps each basis-value tuple to ``(model, safe)``; ``safe`` is False when
        the combination had 10 or fewer rows and the model was fitted on all
        training rows instead.

    Raises
    ------
    KeyError
        If ``params_dict`` lacks an entry for any basis combination.
    """
    if train_indices is None:
        # Backward-compatible default: previously this function silently read the global
        train_indices = globals()['train_indices']

    train_data = car_data.loc[train_indices]
    encoded_train = encoded_car_data.loc[train_indices]

    # The training frame does not depend on the basis key, so pick it once
    model_data = (encoded_train if encode else train_data).copy()

    # Define the columns used to create different models
    basis_columns = list(basis_columns)
    basis_values = [train_data[col].unique().tolist() for col in basis_columns]  # potential values for each basis

    # Every combination of basis values, e.g. ('2020', 'Electric') for (Model Year, Fuel Type)
    basis_combinations = [tuple(values) for values in product(*basis_values)]

    # Fail fast, before any model is trained, if hyperparameters are missing
    missing_keys = [key for key in basis_combinations if key not in params_dict]
    if missing_keys:
        raise KeyError(f"params_dict has no hyperparameters for basis combinations: {missing_keys}")

    # Object-typed features are declared categorical; read the dtype from the frame actually trained on
    cat_features = [col for col in prediction_columns if model_data[col].dtype == 'object']

    # Dictionary to store trained models, using the basis-value tuple as the key
    trained_models = {}
    for basis_key in basis_combinations:
        optimal_params = params_dict[basis_key]
        basis_dict = dict(zip(basis_columns, basis_key))

        # Rows of the training data that match this combination of basis values
        matches = train_data[basis_columns].eq(pd.Series(basis_dict)).all(axis=1)
        basis_indices = train_data.index[matches]

        X = model_data.loc[basis_indices, prediction_columns]
        y = model_data.loc[basis_indices, target_col]

        model = CatBoostRegressor(**optimal_params, cat_features=cat_features)
        if len(X) > 10:  # more than 10 rows: train on this combination only
            model.fit(X, y)
            safe = True
        else:
            # Too few rows: train on all training rows with this key's hyperparameters
            model.fit(model_data[prediction_columns], model_data[target_col])
            safe = False

        # Store trained model using the basis-value tuple as the key
        trained_models[basis_key] = (model, safe)

    return trained_models



In [9]:
def calculate_test_rmse(car_data, encoded_car_data, test_indices, trained_models, basis_columns, prediction_columns, target_col, encode):
    """Predict every test row with the model matching its basis values and score the result.

    Returns a ``(rmse, prediction_df)`` pair, where ``prediction_df`` has the
    columns 'index', 'prediction' (rounded to 0 decimals), 'actual' and 'safe'.
    """
    held_out = car_data.loc[test_indices]
    # Features come from the encoded frame when `encode` is set, else from the raw frame
    feature_source = encoded_car_data if encode else car_data

    per_row_frames = []
    for test_idx in test_indices:
        raw_row = car_data.loc[test_idx]

        # The row's basis values (e.g. 2020 for Model Year) select which model to use
        basis_lookup = {col: raw_row[col] for col in basis_columns}
        model, safe = trained_models[tuple(basis_lookup.values())]

        # Present the single row as a one-row frame for predict()
        features = feature_source.loc[test_idx][prediction_columns]
        predicted = model.predict(pd.DataFrame(features).T)[0]

        per_row_frames.append(pd.DataFrame({
            'index': [test_idx],
            'prediction': [round(predicted, 0)],
            'actual': [held_out.loc[test_idx, target_col]],
            'safe': [safe]
        }))

    prediction_df = pd.concat(per_row_frames)
    squared_error = mean_squared_error(prediction_df['actual'], prediction_df['prediction'])

    return np.sqrt(squared_error), prediction_df

### model testing

##### <i>Get the combinations of basis columns and prediction columns</i>

In [10]:
# Candidate columns, kept in the frame's own column order. Building this through set()
# made the order depend on per-process string hashing, so random.seed(0) alone did not
# make the sampled combinations reproducible between kernel restarts.
column_options = list(dict.fromkeys(
    col for col in car_data.columns if col not in {'Date', 'Vehicle Population'}
))

basis_prediction_combos = []
for _ in range(250):
    # Select 1-2 random basis columns; 'CPI' is never used as a basis
    basis_columns = random.sample(column_options, random.randint(1, 2))
    if 'CPI' in basis_columns:
        basis_columns.remove('CPI')
    if len(basis_columns) == 0:
        continue

    # Select at least 3 prediction columns that are **not in basis_columns**,
    # capped so that basis + prediction columns total at most 8
    remaining_columns = [col for col in column_options if col not in basis_columns]
    prediction_columns = random.sample(remaining_columns, random.randint(3, 8-len(basis_columns)))

    # Store as a tuple
    basis_prediction_combos.append((basis_columns, prediction_columns))


##### <i>Run tests to find best basis and prediction columns</i>

In [11]:
# Master switch: the parallel column search at the end of this cell only runs when True
runOptimization = False

# Model-fitting function and input representation read by process_combo
model_func = fit_linear_regression
encode = True

# Define file path for saving results (one CSV per model function)
results_file = f'../../results/{model_func.__name__}_optim.csv'

# Load existing results if the file exists
if os.path.exists(results_file):
    model_results = pd.read_csv(results_file)
else:
    model_results = pd.DataFrame(columns=['basis_columns', 'prediction_columns', 'rmse'])

# Convert existing basis-prediction combinations to a set for quick lookup
# (compared as strings, matching the str() keys built in process_combo)
existing_combos = set(zip(model_results['basis_columns'].astype(str), model_results['prediction_columns'].astype(str)))

target_col = 'Vehicle Population'

def process_combo(basis_columns, prediction_columns, save=True):
    """Train and score models for one (basis, prediction) column combination.

    Reads the notebook-level ``encode``, ``model_func``, ``target_col``,
    ``existing_combos``, ``results_file`` and the train/test frames.

    Parameters
    ----------
    basis_columns : list of str
        Columns that partition the data into separate models.
    prediction_columns : list of str
        Feature columns (used as-is only when ``encode`` is False).
    save : bool, default True
        Append the result row to ``results_file`` when True.

    Returns
    -------
    pd.DataFrame or None
        One-row frame with 'basis_columns', 'prediction_columns' and 'rmse';
        None when the combination was already processed or an error occurred
        (the error is printed, not raised).
    """
    try:
        # Convert combo to string format for checking existence
        combo_key = (str(basis_columns), str(prediction_columns))
        if combo_key in existing_combos:
            print(f"Skipping already processed: {basis_columns}, {prediction_columns}")
            return None

        # Ensure prediction columns do not include basis columns.
        # Encoded runs use every encoded column whose prefix (text before the first '_')
        # is not a basis column; the list keeps the frame's column order so the
        # feature order is deterministic.
        if encode:
            clean_prediction_columns = [
                col for col in train_encoded.columns
                if col.split("_")[0] not in basis_columns and col not in (target_col, 'Date')
            ]
        else:
            clean_prediction_columns = prediction_columns

        # Train models and calculate RMSE
        trained_models = get_models_by_basis(car_data, encoded_car_data, train_indices, basis_columns, clean_prediction_columns, target_col, model_func, encode)
        rmse = calculate_test_rmse(car_data, encoded_car_data, test_indices, trained_models, basis_columns, clean_prediction_columns, target_col, encode)[0]

        # Create DataFrame for this iteration
        result_df = pd.DataFrame({
            'basis_columns': [basis_columns],
            'prediction_columns': [prediction_columns],
            'rmse': [rmse]
        })

        if save:
            # Append results to CSV immediately; write the header only when creating the file
            result_df.to_csv(results_file, index=False, mode='a', header=not os.path.exists(results_file))

        # Best-effort removal of CatBoost's working directory; a missing directory is not an error
        shutil.rmtree('catboost_info', ignore_errors=True)

        return result_df
    except Exception as e:
        print(f"Exception: {e}")
        return None

if runOptimization:
    # Initialize tqdm progress bar and wrap the joblib run in it
    with tqdm_joblib(tqdm(desc="Processing Models", total=len(basis_prediction_combos))) as progress_bar:
        # Evaluate every (basis, prediction) combination in parallel (n_jobs=-1)
        results = Parallel(n_jobs=-1)(
            delayed(process_combo)(basis, pred) for basis, pred in basis_prediction_combos
        )

##### <i>Bayesian Optimization</i>

###### Optimization Functions

In [12]:
def objective(trial, X, y, cat_features):
    """Optuna objective: validation RMSE of a CatBoost model with trial-sampled hyperparameters.

    The first 80% of the rows (in their given order) are used for training and
    the remaining rows for validation and early stopping.
    """
    # Sample each hyperparameter from the trial, in a fixed order
    iterations = trial.suggest_int('iterations', 100, 1000)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    depth = trial.suggest_int('depth', 4, 10)
    l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1, 10)
    bagging_temperature = trial.suggest_float('bagging_temperature', 0, 1)
    border_count = trial.suggest_int('border_count', 32, 255)
    random_strength = trial.suggest_float('random_strength', 0, 1)
    grow_policy = trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide'])
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 1, 20)

    params = {
        'iterations': iterations,
        'learning_rate': learning_rate,
        'depth': depth,
        'l2_leaf_reg': l2_leaf_reg,
        'bagging_temperature': bagging_temperature,
        'border_count': border_count,
        'random_strength': random_strength,
        'grow_policy': grow_policy,
        'min_data_in_leaf': min_data_in_leaf,
        'verbose': 0
    }

    # Ordered 80/20 split (no shuffling)
    cutoff = int(0.8 * len(X))
    X_train, y_train = X[:cutoff], y[:cutoff]
    X_val, y_val = X[cutoff:], y[cutoff:]

    model = CatBoostRegressor(cat_features=cat_features, **params)
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)

    validation_mse = mean_squared_error(y_val, model.predict(X_val))
    return np.sqrt(validation_mse)

def optimize_catboost(X, y, cat_features, n_trials=20, timeout=30, seed=0):
    """Tune CatBoost hyperparameters with Optuna, then refit on all of X/y.

    Parameters
    ----------
    X, y
        Features and target used both for tuning (via ``objective``) and the final fit.
    cat_features : list
        Categorical feature names passed to CatBoostRegressor.
    n_trials : int, default 20
        Maximum number of Optuna trials.
    timeout : float, default 30
        Time budget in seconds passed to ``study.optimize``.
    seed : int, default 0
        Seed for the Optuna sampler so repeated runs propose the same trials.
        The timeout can still cut a run short, so the number of completed
        trials may vary between machines.

    Returns
    -------
    tuple
        ``(model, best_params)``: the regressor refitted on all rows with the
        best parameters found, and those parameters.
    """
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    # Seed the sampler explicitly: the notebook's random/numpy seeds do not reach Optuna's own RNG
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(lambda trial: objective(trial, X, y, cat_features), n_trials=n_trials, timeout=timeout)
    best_params = study.best_params

    # Refit on the full data with the winning hyperparameters
    model = CatBoostRegressor(**best_params, cat_features=cat_features)
    model.fit(X, y, verbose=0)
    return model, best_params

In [13]:
def get_models_by_basis_optimization(train_data, train_encoded, basis_columns, prediction_columns, target_col, model_func, encode):
    """Train one hyperparameter-tuned model per combination of basis-column values.

    Parameters
    ----------
    train_data, train_encoded : pd.DataFrame
        Raw and one-hot encoded training frames sharing the same index.
    basis_columns : list of str
        Columns whose value combinations each get their own model.
    prediction_columns : list of str
        Feature columns used for training.
    target_col : str
        Target column name.
    model_func : callable
        ``model_func(X, y)`` used for the fallback model on sparse combinations.
    encode : bool
        If True, features and target are taken from the encoded frame.

    Returns
    -------
    dict
        Maps each basis-value tuple to ``(model, safe, params)``. Combinations
        with more than 10 rows get a tuned model (``safe=True``, ``params`` the
        tuned hyperparameters); the rest share a fallback model trained on all
        rows (``safe=False``, ``params=None``).
    """
    basis_columns = list(basis_columns)
    basis_values = [train_data[col].unique().tolist() for col in basis_columns]
    basis_combinations = [tuple(values) for values in product(*basis_values)]

    # The training frame does not depend on the basis key, so pick it once
    model_data = (train_encoded if encode else train_data).copy()

    # Object-typed features are declared categorical; read the dtype from the frame actually trained on
    cat_features = [col for col in prediction_columns if model_data[col].dtype == 'object']

    # The fallback is trained on identical data for every sparse combination,
    # so it is fitted lazily once and shared.
    fallback_model = None

    trained_models = {}
    for basis_key in basis_combinations:
        basis_dict = dict(zip(basis_columns, basis_key))

        # Rows of the training data that match this combination of basis values
        matches = train_data[basis_columns].eq(pd.Series(basis_dict)).all(axis=1)
        basis_indices = train_data.index[matches]

        X = model_data.loc[basis_indices, prediction_columns]
        y = model_data.loc[basis_indices, target_col]

        if len(X) > 10:  # Sufficient data
            model, params = optimize_catboost(X, y, cat_features)
            safe = True
        else:  # Fallback model for small datasets
            if fallback_model is None:
                fallback_model = model_func(model_data[prediction_columns], model_data[target_col])
            model = fallback_model
            params = None
            safe = False

        trained_models[basis_key] = (model, safe, params)

    return trained_models


def calculate_rmse_optimization(test_data, encoded_test_data, trained_models, basis_columns, prediction_columns, target_col, encode):
    """Score the per-basis models on the test rows.

    Rows whose basis-value tuple has no entry in ``trained_models`` are skipped,
    so the RMSE covers only the rows that could be predicted.

    Parameters
    ----------
    test_data, encoded_test_data : pd.DataFrame
        Raw and one-hot encoded test frames sharing the same index.
    trained_models : dict
        Maps basis-value tuples to ``(model, safe, params)``.
    basis_columns, prediction_columns : list of str
        Columns selecting the model and the features, respectively.
    target_col : str
        Target column name.
    encode : bool
        If True, features are read from the encoded frame.

    Returns
    -------
    float or None
        RMSE between actual values and rounded predictions, or None when no
        row could be predicted.
    """
    # Features come from the encoded frame when `encode` is set, else from the raw frame
    feature_source = encoded_test_data if encode else test_data
    all_predictions = []

    for idx, row in test_data.iterrows():
        model_basis_key = tuple(row[basis_columns])
        model, safe, params = trained_models.get(model_basis_key, (None, False, None))

        # Explicit None check: a model object's truthiness is not a reliable "exists" test
        if model is None:
            continue

        formatted_row = feature_source.loc[idx, prediction_columns]
        prediction = model.predict(pd.DataFrame(formatted_row).T)[0]

        all_predictions.append({
            'index': idx,
            'prediction': round(prediction, 0),
            'actual': row[target_col],
            'basis_key': model_basis_key,
            'safe': safe
        })

    if not all_predictions:
        return None  # No predictions made

    pred_df = pd.DataFrame(all_predictions)
    return np.sqrt(mean_squared_error(pred_df['actual'], pred_df['prediction']))

In [14]:
# Workbook that the Bayesian-optimisation results are appended to.
# NOTE(review): this path is relative to the notebook's working directory, whereas the other
# data/result paths in this notebook start with '../../' -- confirm this is intended.
output_path = 'results/catboost_bayesian_optimization.xlsx'

def append_to_excel(df, file_path):
    """Append the rows of ``df`` to an Excel workbook, creating it if needed.

    The first call writes ``df`` with its header. Later calls write the rows
    (without header) below the last used row of the workbook's active sheet.

    Parameters
    ----------
    df : pd.DataFrame
        Rows to write.
    file_path : str
        Path of the .xlsx file; its parent directory is created when missing.
    """
    # Make sure the target directory exists so the first write cannot fail on a missing folder
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if not os.path.exists(file_path):
        df.to_excel(file_path, index=False)
        return

    with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
        # In append mode the writer already holds the loaded workbook, so read the
        # row count from it instead of loading the file a second time.
        startrow = writer.book.active.max_row

        df.to_excel(writer, index=False, header=False, startrow=startrow)

###### run optimization

In [15]:
# Master switch: the Bayesian-optimisation run at the end of this cell only executes when True
runOptimization = False

# Fallback model function and input representation read by process_basis_prediction_combo
model_func = fit_catboost
# NOTE(review): column_options is reassigned here but not read again in the visible cells;
# basis_prediction_combos was already built from the earlier value -- confirm this is intended.
column_options = list(set(train.columns) - {'Date', 'Vehicle Population'})
encode = False

def process_basis_prediction_combo(basis_columns, prediction_columns):
    """Tune, train and score models for one (basis, prediction) column combination.

    Prints the one-row result frame, appends it to ``output_path`` and returns
    it. On any error the error is printed and an empty DataFrame is returned.
    """
    try:
        target_col = 'Vehicle Population'

        if encode:
            # Encoded run: keep every encoded column whose prefix (text before the
            # first '_') is not a basis column, excluding the target and 'Date'
            excluded = [target_col, 'Date']
            clean_prediction_columns = [
                col for col in train_encoded.columns
                if col.split("_")[0] not in basis_columns and col not in excluded
            ]
        else:
            clean_prediction_columns = prediction_columns

        trained_models = get_models_by_basis_optimization(
            train, train_encoded, basis_columns, clean_prediction_columns, target_col, model_func, encode
        )

        rmse = calculate_rmse_optimization(
            test, test_encoded, trained_models, basis_columns, clean_prediction_columns, target_col, encode
        )

        # Tuned hyperparameters per basis key (third element of each stored tuple)
        basis_keys_params = {}
        for basis_key, (_model, _safe, tuned_params) in trained_models.items():
            basis_keys_params[basis_key] = tuned_params

        summary = {
            'basis_columns': [basis_columns],
            'basis_keys_used': [list(trained_models.keys())],
            'basis_keys_params': [basis_keys_params],
            'prediction_columns': [prediction_columns],
            'rmse': [rmse]
        }
        results = pd.DataFrame(summary)

        print(results)

        append_to_excel(results, output_path)

        return results

    except Exception as e:
        print(f"Error processing combo {basis_columns}, {prediction_columns}: {e}")
        return pd.DataFrame()

model_results_list = []
if runOptimization:
    # Wrap the joblib run with tqdm_joblib, as the earlier column-search cell does.
    # Previously the bar was only updated once, after every job had already finished.
    with tqdm_joblib(tqdm(desc="Processing Models", total=len(basis_prediction_combos))):
        model_results_list = Parallel(n_jobs=-1)(
            delayed(process_basis_prediction_combo)(basis, pred) for basis, pred in basis_prediction_combos
        )


### run model on new data

In [16]:
# Encoded rows for the scoring portion of the combined frame
encoded_scoring =  encoded_car_data.loc[test_indices]

# initialize inputs
model_func = fit_catboost
encode = False
# One model is trained per distinct value of the basis column(s)
basis_columns = ['Vehicle Category']
prediction_columns = ['Fuel Technology', 'CPI', 'Number of Vehicles Registered at the Same Address', 'GVWR Class', 'Electric Mile Range', 'Fuel Type', 'Model Year']
target_col = 'Vehicle Population'

# clean prediction columns if necessary
if encode:
    # Encoded run: use every encoded column whose prefix (text before the first '_')
    # is not a basis column, excluding the target and 'Date'
    clean_prediction_columns = list(set(col for col in encoded_scoring.columns if col.split("_")[0] not in basis_columns) - {target_col, 'Date'})
else:
    clean_prediction_columns = prediction_columns

# NOTE(review): basis_keys is not referenced again in the visible cells; it lists the same
# keys as params_dict below -- confirm whether it is still needed.
basis_keys = [('P',), ('T2',), ('T3',), ('T1',), ('MC',), ('T4',), ('T7',), ('T6',), ('T5',), ('MH',), ('BS',), ('B',), ('BT',)]

# CatBoostRegressor keyword arguments per basis key (1-tuples, since there is a single basis column).
# Every basis combination found in the training data must have an entry here.
# The parameter names match the search space in `objective`; presumably these values are the
# best_params found by the Bayesian optimisation run -- TODO confirm.
params_dict = {
    ('P',): {'iterations': 853, 'learning_rate': 0.2429781556456842, 'depth': 6, 'l2_leaf_reg': 2.282136581605469, 'bagging_temperature': 0.21448680294505562, 'border_count': 202, 'random_strength': 0.5230550908979421, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 8, 'verbose':0},
    ('T2',): {'iterations': 335, 'learning_rate': 0.26931213330418335, 'depth': 10, 'l2_leaf_reg': 9.640870180295874, 'bagging_temperature': 0.30115087324036427, 'border_count': 178, 'random_strength': 0.39990182837262866, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 14, 'verbose':0},
    ('T3',): {'iterations': 865, 'learning_rate': 0.1010367015771094, 'depth': 9, 'l2_leaf_reg': 7.42764752331621, 'bagging_temperature': 0.6459945532391831, 'border_count': 136, 'random_strength': 0.3485326296696091, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 19, 'verbose':0},
    ('T1',): {'iterations': 171, 'learning_rate': 0.10839852850884317, 'depth': 7, 'l2_leaf_reg': 6.465152613240907, 'bagging_temperature': 0.05956399621682334, 'border_count': 32, 'random_strength': 0.9969824754272121, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 1, 'verbose':0},
    ('MC',): {'iterations': 493, 'learning_rate': 0.017972336996238937, 'depth': 5, 'l2_leaf_reg': 9.904395136404345, 'bagging_temperature': 0.003721695868538509, 'border_count': 119, 'random_strength': 0.9731604471489881, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 15, 'verbose':0},
    ('T4',): {'iterations': 293, 'learning_rate': 0.16845525328753766, 'depth': 9, 'l2_leaf_reg': 7.3028486840112725, 'bagging_temperature': 0.1990811649266926, 'border_count': 84, 'random_strength': 0.8609406400707353, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 6, 'verbose':0},
    ('T7',): {'iterations': 801, 'learning_rate': 0.08299420642146778, 'depth': 9, 'l2_leaf_reg': 1.9216760380872735, 'bagging_temperature': 0.0009397731362831428, 'border_count': 213, 'random_strength': 0.19396619269250526, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 16, 'verbose':0},
    ('T6',): {'iterations': 610, 'learning_rate': 0.12293705953473834, 'depth': 8, 'l2_leaf_reg': 4.96309503860056, 'bagging_temperature': 0.873646019286267, 'border_count': 191, 'random_strength': 0.8196419736475136, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 9, 'verbose':0},
    ('T5',): {'iterations': 563, 'learning_rate': 0.21256793340510682, 'depth': 7, 'l2_leaf_reg': 6.892637478520299, 'bagging_temperature': 0.9724910160820344, 'border_count': 191, 'random_strength': 0.4347428589259234, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 9, 'verbose':0},
    ('MH',): {'iterations': 885, 'learning_rate': 0.26170267256322144, 'depth': 8, 'l2_leaf_reg': 6.994752208188576, 'bagging_temperature': 0.9067775235329057, 'border_count': 149, 'random_strength': 0.29522679258872647, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 13, 'verbose': 0},
    ('BS',): {'iterations': 765, 'learning_rate': 0.0863404017910146, 'depth': 4, 'l2_leaf_reg': 6.859286521399906, 'bagging_temperature': 0.7652808535846216, 'border_count': 141, 'random_strength': 0.6183738585814146, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 6, 'verbose': 0},
    ('B',): {'iterations': 272, 'learning_rate': 0.23433232647434127, 'depth': 5, 'l2_leaf_reg': 3.0329940707082588, 'bagging_temperature': 0.13616676011461504, 'border_count': 238, 'random_strength': 0.7781224191172322, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 18, 'verbose': 0},
    ('BT',): {'iterations': 768, 'learning_rate': 0.2575430567722977, 'depth': 4, 'l2_leaf_reg': 1.6541315061037025, 'bagging_temperature': 0.628596056037448, 'border_count': 153, 'random_strength': 0.6865879212812991, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 6, 'verbose': 0}
}

# Train one CatBoost model per basis value with its stored hyperparameters,
# then predict every row in test_indices and compute the RMSE against its target value
trained_models = get_models_by_basis_with_hyperparams(car_data, encoded_car_data, basis_columns, clean_prediction_columns, target_col, model_func, encode, params_dict)
rmse, pred_df = calculate_test_rmse(car_data, encoded_car_data, test_indices, trained_models, basis_columns, clean_prediction_columns, target_col, encode)

# Keep only the predictions, renamed to the target column, with a fresh 0..n-1 index
submission = pred_df[['prediction']].rename(columns={'prediction': 'Vehicle Population'}).reset_index(drop=True)

# Last expression of the cell: display the RMSE
rmse

7374.301964135726

In [17]:
submission

Unnamed: 0,Vehicle Population
0,287075.0
1,266228.0
2,297439.0
3,94319.0
4,233580.0
...,...
7541,6.0
7542,2.0
7543,2.0
7544,5.0
