In [1]:
import pandas as pd
import numpy as np

import random
from itertools import product

from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error

from tqdm import tqdm
from joblib import Parallel, delayed
from tqdm_joblib import tqdm_joblib

import os


  from tqdm.autonotebook import tqdm


### Load data

In [2]:
# Locations of the project's data files, relative to this notebook
data_dict_path = '../../data/data_dictionary.xlsx'
scoring_path = '../../data/scoring.xlsx'
submission_format_path = '../../data/submission_format.csv'
training_path = '../../data/training.xlsx'

# Data dictionary workbook
data_dictionary = pd.read_excel(data_dict_path)

# Scoring file -- what the input data will be for the final model
scoring = pd.read_excel(scoring_path)

# Submission format -- the layout the model's output should follow
submission_format = pd.read_csv(submission_format_path)

# Training data -- rows containing any missing value are dropped on load
training = pd.read_excel(training_path).dropna()

### model functions

In [3]:
def fit_linear_regression(X, y):
    """Fit a ``LinearRegression`` on features ``X`` and target ``y`` and return it."""
    # ``fit`` hands back the estimator itself, so build-and-fit is one expression.
    return LinearRegression().fit(X, y)

class ConstantModel:
    """Fallback model whose every prediction is the same stored value."""

    def __init__(self, value):
        # The constant handed back by ``predict``.
        self.value = value

    def predict(self, X):
        """Return the constant once per element of ``X``.

        When ``X`` exposes ``__len__`` the result is a list of ``len(X)``
        copies of the constant; otherwise the bare constant is returned.
        """
        if hasattr(X, '__len__'):
            return [self.value for _ in range(len(X))]
        # X is not sized, so there is nothing to repeat over.
        return self.value

def fit_catboost(X, y):
    """Fit a ``CatBoostRegressor`` on ``X``/``y``, falling back to a constant model.

    Every column of ``X`` is declared as a categorical feature.

    Parameters
    ----------
    X : DataFrame-like exposing ``.columns``
        Feature columns.
    y : Series or sequence
        Target values; only the first one is used by the fallback.

    Returns
    -------
    The fitted ``CatBoostRegressor``, or a ``ConstantModel`` predicting the
    first target value when constructing or fitting the regressor raises.
    """
    try:
        model = CatBoostRegressor(
            iterations=100,
            learning_rate=0.05,
            depth=6,
            cat_features=list(X.columns),
            verbose=0
        )

        model.fit(X, y)

        return model
    except Exception:
        # Catch ``Exception`` rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate and a long run can
        # be stopped.
        # NOTE(review): any fitting error ends up here, not only a constant
        # target -- confirm that silently predicting y[0] is intended for all
        # of them.
        singular_value = y.iloc[0] if hasattr(y, 'iloc') else y[0]  # first target value

        return ConstantModel(singular_value)


### Build train and test splits

In [4]:
# Work on a copy so `training` stays untouched; Model Year is cast to string
car_data = training.copy()
car_data['Model Year'] = car_data['Model Year'].astype(str)

# One-hot encode every object/category column
categorical_cols = list(car_data.select_dtypes(include=['object', 'category']).columns)
encoded_car_data = pd.get_dummies(car_data, columns=categorical_cols)

# 80/20 split by index label, with a fixed seed
train_indices = car_data.sample(frac=0.8, random_state=0).index
test_indices = car_data.drop(train_indices).index

# Raw and encoded views of the same two row sets
train, test = (car_data.loc[idx].copy() for idx in (train_indices, test_indices))
train_encoded, test_encoded = (encoded_car_data.loc[idx].copy() for idx in (train_indices, test_indices))

### model testing functions

In [5]:
def get_models_by_basis(train_data, train_encoded, basis_columns, prediction_columns, target_col, model_func, encode, min_samples=10):
    """Train one model per combination of basis-column values.

    Parameters
    ----------
    train_data : DataFrame
        Un-encoded training rows; always used to find each combination's rows.
    train_encoded : DataFrame
        Encoded training rows, indexed like ``train_data``; used for the
        features and target when ``encode`` is true.
    basis_columns : list of str
        Columns whose value combinations each get their own model.
    prediction_columns : list of str
        Feature columns handed to ``model_func``.
    target_col : str
        Target column name.
    model_func : callable
        ``model_func(X, y)`` returning a fitted model.
    encode : bool
        Select ``train_encoded`` (true) or ``train_data`` (false) as model input.
    min_samples : int, default 10
        A combination gets a dedicated model only when it has strictly more
        than this many training rows.

    Returns
    -------
    dict
        Maps each tuple of basis values (every element of the cartesian
        product of the columns' unique values) to ``(model, safe)``.
        ``safe`` is True for a dedicated model and False when the shared
        model fitted on all rows is used instead.
    """
    basis_columns = list(basis_columns)

    # Unique values seen in each basis column
    basis_values = [train_data[col].unique().tolist() for col in basis_columns]

    # The frame the models are fitted on does not change between combinations,
    # so pick it once instead of copying it on every iteration.
    model_data = train_encoded if encode else train_data

    # The fallback is identical for every sparse combination: fit it lazily,
    # at most once, and share the fitted object.
    fallback_model = None

    trained_models = {}
    for basis_key in product(*basis_values):
        # Rows whose basis columns all match this combination
        mask = np.ones(len(train_data), dtype=bool)
        for col, value in zip(basis_columns, basis_key):
            mask &= train_data[col].eq(value).to_numpy(dtype=bool, na_value=False)
        basis_indices = train_data.index[mask]

        X = model_data.loc[basis_indices, prediction_columns]
        y = model_data.loc[basis_indices, target_col]

        if len(X) > min_samples:
            trained_models[basis_key] = (model_func(X, y), True)
        else:
            if fallback_model is None:
                fallback_model = model_func(model_data[prediction_columns], model_data[target_col])
            trained_models[basis_key] = (fallback_model, False)

    return trained_models

In [6]:
def calculate_rmse(car_data, encoded_car_data, trained_models, basis_columns, prediction_columns, target_col, encode, eval_indices=None):
    """Score the per-basis models on a set of rows and return the RMSE.

    Parameters
    ----------
    car_data : DataFrame
        Un-encoded rows; supplies the basis values and the actual targets.
    encoded_car_data : DataFrame
        Encoded rows, indexed like ``car_data``; supplies the features when
        ``encode`` is true.
    trained_models : dict
        Maps a tuple of basis values to ``(model, safe)``.
    basis_columns : list of str
        Columns whose values select the model for each row.
    prediction_columns : list of str
        Feature columns passed to ``model.predict``.
    target_col : str
        Target column name.
    encode : bool
        Take features from ``encoded_car_data`` (true) or ``car_data`` (false).
    eval_indices : index labels, optional
        Rows to score. Defaults to the notebook-level ``test_indices``.

    Returns
    -------
    float
        Root mean squared error between the actual targets and the
        predictions rounded to whole numbers.

    Raises
    ------
    KeyError
        If a row's basis values have no entry in ``trained_models``.
    """
    if eval_indices is None:
        eval_indices = test_indices  # notebook-level hold-out split

    eval_rows = car_data.loc[eval_indices]
    feature_source = encoded_car_data if encode else car_data
    features = feature_source.loc[eval_indices, prediction_columns]

    # Basis key of every evaluated row, in row order
    key_columns = [eval_rows[col].tolist() for col in basis_columns]
    keys = list(zip(*key_columns)) if key_columns else [()] * len(eval_rows)

    # Group row positions by key so each model predicts its rows in one call
    positions_by_key = {}
    for position, key in enumerate(keys):
        positions_by_key.setdefault(key, []).append(position)

    # NaN-initialised so an unfilled slot cannot pass as a prediction
    predictions = np.full(len(eval_rows), np.nan)
    for key, positions in positions_by_key.items():
        model, _safe = trained_models[key]
        raw = np.ravel(np.asarray(model.predict(features.iloc[positions]), dtype=float))
        predictions[positions] = np.round(raw, 0)

    # Actuals come from the frame that was passed in, not from a global
    actual = eval_rows[target_col].to_numpy()

    return np.sqrt(mean_squared_error(actual, predictions))

### model testing

###### Get the combinations of basis columns and prediction columns

In [7]:
# Candidate columns: everything except the date and the target.
# Sorted because set iteration order for strings can differ between Python
# sessions, which would make the sampling below irreproducible even when seeded.
column_options = sorted(set(train.columns) - {'Date', 'Vehicle Population'})

n_combos = 1000
max_total_columns = 8  # upper bound on basis + prediction columns per combination

# Dedicated, seeded generator so the same combinations are drawn on every run
# (which is what lets already-processed combinations be skipped on a re-run).
combo_rng = random.Random(0)

basis_prediction_combos = []
for _ in range(n_combos):
    # Select 1-3 random basis columns
    basis_columns = combo_rng.sample(column_options, combo_rng.randint(1, 3))

    # Select prediction columns that are **not in basis_columns**: at least 3
    # and at most `max_total_columns - len(basis_columns)`, never more than exist
    remaining_columns = [col for col in column_options if col not in basis_columns]
    max_predictors = min(max_total_columns - len(basis_columns), len(remaining_columns))
    n_predictors = combo_rng.randint(min(3, max_predictors), max_predictors)
    prediction_columns = combo_rng.sample(remaining_columns, n_predictors)

    # Store as a tuple
    basis_prediction_combos.append((basis_columns, prediction_columns))


###### Run the tests

In [None]:
# Model family under test, and whether it consumes the one-hot encoded frames
encode = False
model_func = fit_catboost

# CSV that accumulates one row per evaluated combination
results_file = '../../results/catboost_optim.csv'

# Resume from an earlier run when the file is present, otherwise start empty
model_results = (
    pd.read_csv(results_file)
    if os.path.exists(results_file)
    else pd.DataFrame(columns=['basis_columns', 'prediction_columns', 'rmse'])
)

# String forms of the combinations already evaluated, for quick membership tests
existing_combos = {
    (basis, prediction)
    for basis, prediction in zip(
        model_results['basis_columns'].astype(str),
        model_results['prediction_columns'].astype(str),
    )
}

def process_basis_prediction_combo(basis_columns, prediction_columns):
    """Evaluate one (basis, prediction) column combination.

    Reads the notebook-level settings (`existing_combos`, `encode`,
    `model_func`, `results_file`) and the train/test frames.

    Returns a one-row DataFrame with the combination and its RMSE, after
    appending that row to `results_file`; returns None without doing any
    work when the combination is already in `existing_combos`.
    """
    target_col = 'Vehicle Population'

    # Combinations are tracked by their string form
    if (str(basis_columns), str(prediction_columns)) in existing_combos:
        print(f"Skipping already processed: {basis_columns}, {prediction_columns}")
        return None

    if not encode:
        clean_prediction_columns = prediction_columns
    else:
        # Encoded mode: keep every encoded column whose name prefix (text
        # before the first "_") is not a basis column, minus target and date
        excluded = {target_col, 'Date'}
        candidate_columns = {
            col for col in train_encoded.columns
            if col.split("_")[0] not in basis_columns
        }
        clean_prediction_columns = list(candidate_columns - excluded)

    # Fit the per-basis models, then score them
    trained_models = get_models_by_basis(
        train, train_encoded, basis_columns, clean_prediction_columns,
        target_col, model_func, encode,
    )
    rmse = calculate_rmse(
        car_data, encoded_car_data, trained_models, basis_columns,
        clean_prediction_columns, target_col, encode,
    )

    result_df = pd.DataFrame({
        'basis_columns': [basis_columns],
        'prediction_columns': [prediction_columns],
        'rmse': [rmse]
    })

    # Append right away so finished work is on disk; the header is written
    # only when the file does not exist yet
    write_header = not os.path.exists(results_file)
    result_df.to_csv(results_file, index=False, mode='a', header=write_header)

    return result_df


# Run in parallel using tqdm_joblib.
# The progress-bar options are passed straight to tqdm_joblib: wrapping an
# already-built tqdm bar made it display a second, unlabeled bar.
with tqdm_joblib(desc="Processing Models", total=len(basis_prediction_combos)):
    model_results_list = Parallel(n_jobs=-1)(
        delayed(process_basis_prediction_combo)(basis, pred) for basis, pred in basis_prediction_combos
    )

# Remove `None` values from the list (skipped iterations)
model_results_list = [df for df in model_results_list if df is not None]

# Rewrite the file with the previously loaded results plus the new ones.
# Writing only the new rows would discard every earlier result and cause
# those combinations to be recomputed on the next run.
if model_results_list:
    previous_results = [] if model_results.empty else [model_results]
    model_results = pd.concat(previous_results + model_results_list, ignore_index=True)
    model_results.to_csv(results_file, index=False)


Processing Models:   0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Skipping already processed: ['Number of Vehicles Registered at the Same Address'], ['Fuel Type', 'Region', 'Model Year', 'Vehicle Category', 'Electric Mile Range']
Skipping already processed: ['Fuel Type', 'Fuel Technology'], ['GVWR Class', 'Electric Mile Range', 'Region', 'Vehicle Category', 'Model Year', 'Number of Vehicles Registered at the Same Address']
Skipping already processed: ['Fuel Technology', 'Vehicle Category', 'Region'], ['Model Year', 'Electric Mile Range', 'Fuel Type']
Skipping already processed: ['Model Year'], ['GVWR Class', 'Fuel Type', 'Fuel Technology', 'Electric Mile Range', 'Number of Vehicles Registered at the Same Address', 'Region']
Skipping already processed: ['Vehicle Category', 'Number of Vehicles Registered at the Same Address'], ['GVWR Class', 'Fuel Type', 'Region']
Skipping already processed: ['GVWR Class', 'Number of Vehicles Registered at the Same Address', 'Fuel Type'], ['Electric Mile Range', 'Vehicle Category', 'Region']
Skipping already processed:

