In [None]:
import pandas as pd
import numpy as np
import random
from catboost import CatBoostRegressor
from joblib import Parallel, delayed
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from itertools import product
from sklearn.preprocessing import PolynomialFeatures
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
import optuna
from tqdm_joblib import tqdm_joblib
from sklearn.linear_model import LinearRegression
import os
from openpyxl import load_workbook

# Seed both the stdlib and NumPy global generators with the same value so the
# random column sampling further down starts from a fixed state.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Load data and create train + test sets

In [None]:
# Folder holding the Datathon input files. Set the DATATHON_DATA_DIR environment
# variable to run the notebook on another machine; the default keeps the
# original location.
DATA_DIR = os.environ.get('DATATHON_DATA_DIR', '/Users/teymour/Desktop/Datathon/data')

data_dict_path = os.path.join(DATA_DIR, 'data_dictionary.xlsx')
data_dictionary = pd.read_excel(data_dict_path)

scoring_path = os.path.join(DATA_DIR, 'scoring.xlsx')
scoring = pd.read_excel(scoring_path)

submission_format_path = os.path.join(DATA_DIR, 'submission_format.csv')
submission_format = pd.read_csv(submission_format_path)

# Training rows containing any missing value are discarded up front.
training_path = os.path.join(DATA_DIR, 'training.xlsx')
training = pd.read_excel(training_path).dropna()

# new data from Fred: https://fred.stlouisfed.org/series/CUSR0000SETA02
cpi_used_cars_path = os.path.join(DATA_DIR, 'CPI_UsedCars_US.xlsx')
cpi_used_cars = pd.read_excel(cpi_used_cars_path, sheet_name='Monthly')
cpi_used_cars.columns = ['Date', 'CPI']
cpi_used_cars['Year'] = cpi_used_cars['Date'].dt.year

# Collapse the monthly series to one mean CPI per calendar year and name the key
# 'Model Year' so it can be merged straight onto the vehicle tables.
average_cpi_by_year = (
    cpi_used_cars.groupby('Year')['CPI']
    .mean()
    .reset_index()
    .rename(columns={'Year': 'Model Year'})
)

def forecast_cpi_polynomial(average_cpi_by_year, forecast_periods=5, degree=2):
    """Extend a yearly CPI table with polynomial-trend forecasts.

    Parameters
    ----------
    average_cpi_by_year : pandas.DataFrame
        Table with a 'Model Year' column and a 'CPI' column.
    forecast_periods : int, default 5
        Number of years to forecast after the last year in the table.
    degree : int, default 2
        Degree of the polynomial in 'Model Year' fitted by least squares.

    Returns
    -------
    pandas.DataFrame
        The input rows sorted by 'Model Year', followed by one forecast row per
        future year, with a fresh 0..n-1 index.
    """
    history = average_cpi_by_year.sort_values(by='Model Year')

    # Nothing to forecast: return the sorted history instead of handing an
    # empty array to scikit-learn.
    if forecast_periods <= 0:
        return history.reset_index(drop=True)

    X = history[['Model Year']]
    y = history['CPI']

    # NOTE(review): the polynomial is fitted on raw year values; with
    # presumably four-digit years and a high degree the features become very
    # large, so centring the years first may be worth considering.
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    model = LinearRegression()
    model.fit(poly.fit_transform(X), y)

    last_year = X['Model Year'].max()
    # Keep the future years in a DataFrame with the same column name the
    # transformer was fitted on, so transform() sees matching feature names.
    future_X = pd.DataFrame({
        'Model Year': np.arange(last_year + 1, last_year + 1 + forecast_periods)
    })
    future_cpi = model.predict(poly.transform(future_X))

    future_df = future_X.assign(CPI=future_cpi)

    return pd.concat([history, future_df], ignore_index=True)

# Add a 2-year, degree-4 polynomial forecast to the yearly CPI table.
average_cpi_by_year = forecast_cpi_polynomial(average_cpi_by_year, forecast_periods=2, degree=4)

In [3]:
def clean_input_data(data, cpi_by_year=None):
    """Attach yearly CPI to a vehicle table and normalise 'Model Year' to text.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing a 'Model Year' column.
    cpi_by_year : pandas.DataFrame, optional
        Table with 'Model Year' and 'CPI' columns. When omitted, the
        notebook-level ``average_cpi_by_year`` table is used.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with a 'CPI' column added, 'Model Year' converted to
        strings (missing years become "Missing"), every literal "nan" string
        turned into a real NaN, and missing CPI values filled with the mean
        CPI of this table.
    """
    if cpi_by_year is None:
        cpi_by_year = average_cpi_by_year

    # add the CPI of Used Cars by City Average
    clean_data = data.merge(cpi_by_year, on='Model Year', how='left')

    def _year_label(value):
        # A column that ever held a NaN is float-typed, so 2019 would print as
        # "2019.0" and no longer match the "2019" label of an int-typed frame.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    clean_data['Model Year'] = clean_data['Model Year'].map(_year_label)

    # handle nulls: stringified NaNs ("nan") go back to real NaNs everywhere,
    # then missing model years get an explicit category label
    clean_data = clean_data.replace({"nan": np.nan})
    clean_data['Model Year'] = clean_data['Model Year'].fillna("Missing")

    # fill missing CPI's with average CPI
    clean_data['CPI'] = clean_data['CPI'].fillna(clean_data['CPI'].mean())

    return clean_data

In [4]:
clean_training = clean_input_data(training)
clean_scoring = clean_input_data(scoring)

# Stack training on top of scoring under one 0..n-1 index so both halves are
# one-hot encoded against the same set of categories.
car_data = pd.concat([clean_training, clean_scoring], ignore_index=True)
car_data['Model Year'] = car_data['Model Year'].astype(str)

categorical_cols = list(car_data.select_dtypes(include=['object', 'category']).columns)
encoded_car_data = pd.get_dummies(car_data, columns=categorical_cols)

# The first len(clean_training) rows are the training rows; the rest are the
# scoring rows.
n_train_rows = len(clean_training)
train_indices = range(n_train_rows)
test_indices = range(n_train_rows, len(car_data))

train, test = (car_data.loc[rows].copy() for rows in (train_indices, test_indices))
train_encoded, test_encoded = (
    encoded_car_data.loc[rows].copy() for rows in (train_indices, test_indices)
)

# Model

In [5]:
def fit_linear_regression(X, y):
    """Fit an ordinary least-squares LinearRegression on X, y and return it."""
    # fit() returns the estimator itself, so the fitted model is handed back.
    return LinearRegression().fit(X, y)

class ConstantModel:
    """Fallback model whose prediction is always the same stored value."""

    def __init__(self, value):
        # The constant that every prediction will echo back.
        self.value = value

    def predict(self, X):
        """Return one copy of the constant per row of X.

        When X has no length (for example a bare scalar) the constant itself
        is returned instead of a list.
        """
        if hasattr(X, '__len__'):
            return [self.value for _ in range(len(X))]
        return self.value

def fit_catboost(X, y):
    """Fit a small fixed-parameter CatBoostRegressor, with a constant fallback.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Non-numeric columns are passed to CatBoost as
        categorical features; numeric columns are used as numeric features.
    y : pandas.Series or sequence
        Target values.

    Returns
    -------
    CatBoostRegressor or ConstantModel
        The fitted regressor, or a ConstantModel predicting the first target
        value when CatBoost cannot be fitted.
    """
    # Only non-numeric columns are declared categorical. Declaring every column
    # (the previous behaviour) makes CatBoost reject float columns, and that
    # failure was then hidden by the fallback below.
    categorical_columns = [
        col for col in X.columns if not pd.api.types.is_numeric_dtype(X[col])
    ]

    try:
        model = CatBoostRegressor(
            iterations=100,
            learning_rate=0.05,
            depth=6,
            cat_features=categorical_columns,
            verbose=0
        )

        model.fit(X, y)

        return model
    except Exception:
        # Catch Exception rather than everything, so Ctrl-C / SystemExit still
        # interrupt a long run. Best-effort fallback: predict the first target.
        singular_value = y.iloc[0] if hasattr(y, 'iloc') else y[0]

        return ConstantModel(singular_value)


In [13]:
# Candidate columns, in a fixed (sorted) order. Iterating a set of strings
# gives a different order in every interpreter session (hash randomisation),
# which would make the sampling below irreproducible despite random.seed().
column_options = sorted(
    set(train.columns) - {'Date', 'Vehicle Population', 'Vehicle Category'},
    key=str,
)

basis_prediction_combos = []

for _ in range(1000):
    # Always include 'Vehicle Category', plus 0-2 randomly chosen extra columns.
    additional_basis_columns = random.sample(column_options, random.randint(0, 2))
    basis_columns = ['Vehicle Category'] + additional_basis_columns

    # Prediction columns come from what is left, keeping the deterministic
    # order; between 3 and (8 - number of basis columns) of them are drawn.
    remaining_columns = [col for col in column_options if col not in additional_basis_columns]
    prediction_columns = random.sample(remaining_columns, random.randint(3, 8 - len(basis_columns)))

    basis_prediction_combos.append((basis_columns, prediction_columns))

### Bayesian hyperparameter optimization framework

In [16]:
def objective(trial, X, y, cat_features):
    """Optuna objective: validation RMSE of a CatBoostRegressor for one trial.

    The first 80% of the rows are used for fitting and the remaining rows for
    validation; the same validation rows drive early stopping and the
    returned score.
    """
    # Search space. The suggest_* calls are kept in this order.
    params = dict(
        iterations=trial.suggest_int('iterations', 100, 1000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        depth=trial.suggest_int('depth', 4, 10),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1, 10),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0, 1),
        border_count=trial.suggest_int('border_count', 32, 255),
        random_strength=trial.suggest_float('random_strength', 0, 1),
        grow_policy=trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        min_data_in_leaf=trial.suggest_int('min_data_in_leaf', 1, 20),
        verbose=0,
    )

    # Positional 80/20 hold-out split (no shuffling).
    cut = int(0.8 * len(X))
    X_train, y_train = X[:cut], y[:cut]
    X_val, y_val = X[cut:], y[cut:]

    regressor = CatBoostRegressor(cat_features=cat_features, **params)
    regressor.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)

    validation_mse = mean_squared_error(y_val, regressor.predict(X_val))
    return np.sqrt(validation_mse)

def optimize_catboost(X, y, cat_features, n_trials=20, timeout=30, seed=0):
    """Tune a CatBoostRegressor with Optuna and refit it on all of X, y.

    Parameters
    ----------
    X, y :
        Features and target passed through to ``objective`` and to the final fit.
    cat_features : list
        Names of the categorical feature columns.
    n_trials : int, default 20
        Maximum number of Optuna trials.
    timeout : float, default 30
        Time budget in seconds handed to ``study.optimize``.
    seed : int, default 0
        Seed for the TPE sampler, so the sequence of suggested parameters is
        repeatable. The number of trials that fit inside ``timeout`` can still
        differ between runs.

    Returns
    -------
    tuple
        ``(model, best_params)``: the regressor refitted on the full data with
        the best parameters found, and those parameters.
    """
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    # Seeded sampler: without it every run explores a different random
    # sequence of hyperparameters.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(
        lambda trial: objective(trial, X, y, cat_features),
        n_trials=n_trials,
        timeout=timeout,
    )
    best_params = study.best_params

    model = CatBoostRegressor(**best_params, cat_features=cat_features)
    model.fit(X, y, verbose=0)
    return model, best_params

### Basis models workflow

In [17]:
def get_models_by_basis(train_data, train_encoded, basis_columns, prediction_columns, target_col, model_func, encode):
    """Train one model per combination of basis-column values.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Un-encoded training table; always used to select the rows of a basis key.
    train_encoded : pandas.DataFrame
        One-hot encoded training table sharing ``train_data``'s index; supplies
        features and target when ``encode`` is true.
    basis_columns : list
        Columns whose value combinations define the segments.
    prediction_columns : list
        Feature columns of the table selected by ``encode``.
    target_col : str
        Target column name.
    model_func : callable
        ``model_func(X, y)`` returning a fitted model; used as the fallback for
        segments with 10 rows or fewer, trained on the whole table.
    encode : bool
        Use ``train_encoded`` (True) or ``train_data`` (False) for features.

    Returns
    -------
    dict
        Maps each basis key (tuple of values, in ``basis_columns`` order) to
        ``(model, safe, params)``. ``safe`` is True and ``params`` holds the
        tuned parameters for segments fitted with ``optimize_catboost``;
        fallback segments get ``safe=False`` and ``params=None``.
    """
    basis_columns = list(basis_columns)
    # Selected once; only read below, so no per-iteration copy is needed.
    model_data = train_encoded if encode else train_data

    # Every combination of observed values is enumerated, including
    # combinations that never occur together in the data.
    basis_values = [train_data[col].unique().tolist() for col in basis_columns]

    # dtypes come from the table that provides the features, so encoded column
    # names (absent from train_data) no longer raise a KeyError.
    cat_features = [col for col in prediction_columns if model_data[col].dtype == 'object']

    fallback_model = None  # trained lazily, then shared by all sparse keys
    trained_models = {}
    for basis_key in product(*basis_values):
        basis_dict = dict(zip(basis_columns, basis_key))

        row_mask = train_data[basis_columns].eq(pd.Series(basis_dict)).all(axis=1)
        basis_indices = row_mask.index[row_mask.to_numpy()]

        if len(basis_indices) > 10:  # Sufficient data
            X = model_data.loc[basis_indices, prediction_columns]
            y = model_data.loc[basis_indices, target_col]
            model, params = optimize_catboost(X, y, cat_features)
            safe = True
        else:  # Fallback model for small datasets
            # The fallback always sees the same full table, so it is fitted
            # once instead of once per sparse or empty segment.
            if fallback_model is None:
                fallback_model = model_func(model_data[prediction_columns], model_data[target_col])
            model = fallback_model
            params = None
            safe = False

        trained_models[basis_key] = (model, safe, params)

    return trained_models


def calculate_rmse(test_data, encoded_test_data, trained_models, basis_columns, prediction_columns, target_col, encode):
    """Score per-basis models on a test table and return the overall RMSE.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Un-encoded test table; supplies the basis values and the actual target.
    encoded_test_data : pandas.DataFrame
        One-hot encoded test table sharing ``test_data``'s index; supplies the
        features when ``encode`` is true.
    trained_models : dict
        Maps basis-key tuples to ``(model, safe, params)``.
    basis_columns : list
        Columns whose values form the basis key of each row.
    prediction_columns : list
        Feature columns handed to ``model.predict``.
    target_col : str
        Name of the actual-value column in ``test_data``.
    encode : bool
        Take features from ``encoded_test_data`` (True) or ``test_data`` (False).

    Returns
    -------
    float or None
        RMSE between the actual values and the predictions rounded to whole
        numbers, over the rows whose basis key has a model; None when no row
        could be predicted.
    """
    feature_frame = encoded_test_data if encode else test_data
    basis_columns = list(basis_columns)

    # Predict one batch per basis key. A slice of the original frame keeps each
    # column's dtype, unlike a single row transposed into an all-object frame.
    if basis_columns:
        grouped_rows = test_data.groupby(basis_columns, sort=False)
    else:
        grouped_rows = [((), test_data)]  # no basis columns: one group, key ()

    actual_chunks = []
    predicted_chunks = []
    for basis_key, group in grouped_rows:
        # Single-column groupers may yield scalar keys; model keys are tuples.
        if not isinstance(basis_key, tuple):
            basis_key = (basis_key,)

        model, _safe, _params = trained_models.get(basis_key, (None, False, None))
        # Explicit None test: a fitted model must not be skipped because it
        # happens to evaluate as falsy.
        if model is None:
            continue

        features = feature_frame.loc[group.index, prediction_columns]
        predictions = np.asarray(model.predict(features), dtype=float)

        predicted_chunks.append(np.round(predictions, 0))
        actual_chunks.append(group[target_col].to_numpy())

    if not predicted_chunks:
        return None  # No predictions made

    return np.sqrt(mean_squared_error(
        np.concatenate(actual_chunks),
        np.concatenate(predicted_chunks),
    ))

### Test and optimize basis subsets

In [20]:
# Results workbook. Set DATATHON_RESULTS_DIR to redirect it; the default keeps
# the original location.
output_path = os.path.join(
    os.environ.get('DATATHON_RESULTS_DIR', '/Users/teymour/Desktop/Datathon/results'),
    'catboost_optimization.xlsx',
)

def append_to_excel(df, file_path):
    """
    Appends DataFrame `df` to the Excel file at `file_path`.
    Creates the file (and its parent folder) if it doesn't exist.

    The first write includes the header row; later writes add only data rows
    below the last used row of the workbook's active sheet. An empty `df` is
    ignored.

    NOTE: nothing here locks the file, so simultaneous calls from several
    processes can overwrite each other's rows or corrupt the workbook.
    """
    if df.empty:
        return

    if not os.path.exists(file_path):
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        df.to_excel(file_path, index=False)
        return

    with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
        # The writer has already loaded the workbook in append mode; reuse it
        # rather than parsing the file a second time.
        startrow = writer.book.active.max_row

        df.to_excel(writer, index=False, header=False, startrow=startrow)

In [None]:
model_func = fit_catboost
column_options = list(set(train.columns) - {'Date', 'Vehicle Population'})
encode = False

# Number of combos sent to the worker pool at a time. After each batch the
# parent process saves the results and advances the progress bar.
RESULTS_BATCH_SIZE = 32

def process_basis_prediction_combo(basis_columns, prediction_columns):
    """Train and score the per-basis models for one column combination.

    Parameters
    ----------
    basis_columns : list
        Columns that define the segments (one model per value combination).
    prediction_columns : list
        Feature columns to use when ``encode`` is False.

    Returns
    -------
    pandas.DataFrame
        One row with the columns used, the basis keys, the tuned parameters per
        key and the RMSE; an empty DataFrame if anything fails (the error is
        printed). The function does not write to disk, so it is safe to run in
        parallel workers.
    """
    try:
        target_col = 'Vehicle Population'

        if encode:
            # Use every encoded column except the target, 'Date', and columns
            # derived from a basis column (the column itself or "<basis>_<value>").
            clean_prediction_columns = [
                col for col in train_encoded.columns
                if col not in [target_col, 'Date']
                and not any(col == basis or col.startswith(f"{basis}_") for basis in basis_columns)
            ]
        else:
            clean_prediction_columns = prediction_columns

        trained_models = get_models_by_basis(train, train_encoded, basis_columns, clean_prediction_columns, target_col, model_func, encode)

        rmse = calculate_rmse(test, test_encoded, trained_models, basis_columns, clean_prediction_columns, target_col, encode)

        basis_keys_params = {
            basis_key: params for basis_key, (_, _, params) in trained_models.items()
        }

        return pd.DataFrame({
            'basis_columns': [basis_columns],
            'basis_keys_used': [list(trained_models.keys())],
            'basis_keys_params': [basis_keys_params],
            'prediction_columns': [prediction_columns],
            'rmse': [rmse]
        })

    except Exception as e:
        # Best effort: one failing combo must not abort the whole sweep.
        print(f"Error processing combo {basis_columns}, {prediction_columns}: {e}")
        return pd.DataFrame()

model_results_list = []
with tqdm(desc="Processing Models", total=len(basis_prediction_combos)) as pbar:
    for batch_start in range(0, len(basis_prediction_combos), RESULTS_BATCH_SIZE):
        batch = basis_prediction_combos[batch_start:batch_start + RESULTS_BATCH_SIZE]

        batch_results = Parallel(n_jobs=-1)(
            delayed(process_basis_prediction_combo)(basis, pred) for basis, pred in batch
        )
        model_results_list.extend(batch_results)

        # Only this process touches the workbook, so parallel workers can never
        # write to it at the same time; saving per batch keeps partial results
        # if the run is interrupted.
        completed = [result for result in batch_results if not result.empty]
        if completed:
            append_to_excel(pd.concat(completed, ignore_index=True), output_path)

        pbar.update(len(batch))
