In [None]:
from lightgbm.sklearn import LGBMRegressor
from preprocessing.preprocessing import load_data
from sklearn.model_selection import KFold
from model.meta_learner import r2_score_oos, convert_to_onnx
import numpy as np
from sklearn import metrics

In [None]:
def train_in_cv(df, target_name, input_list):
    """
    Train a LightGBM regression model with drive-wise 5-fold cross-validation and collect error metrics.

    The unique values of the 'driveID' column are split into five shuffled folds, so all rows of
    one drive end up either in the training or in the test partition of a fold, never in both.

    Parameters:
    - df (pd.DataFrame): The input DataFrame. It must contain the columns 'driveID', 'gps_speed',
      'target_forward', 'timestamp_utc', 'groupID', 'osmID' and the column named by target_name.
    - target_name (str): The name of the target variable to predict.
    - input_list (list): Nine lists that are appended to IN PLACE (one value per fold), in the order
      [mse_list, r2_list, mae_list, mse2_list, mse3_list, mse10_list, mae2_list, mae3_list, mae10_list].

    Returns:
    A new list holding the same nine (now extended) lists, in the same order:
    [mse_list, r2_list, mae_list, mse2_list, mse3_list, mse10_list, mae2_list, mae3_list, mae10_list]

    Notes:
    - The function uses LightGBM's 'gbdt' boosting type for regression.
    - All columns whose name contains 'target_', plus the metadata columns listed above, are
      excluded from the features.
    - The '2' and '3' metrics are computed on test rows with |target| > 2 and |target| > 3; the
      '10' metrics are computed on test rows whose 'gps_speed' exceeds 10 / 3.6.
    - convert_to_onnx is called once per fold with the model of that fold.
    """
    # Unpack the nine accumulator lists (index access keeps the original IndexError on short input).
    (mse_list, r2_list, mae_list,
     mse2_list, mse3_list, mse10_list,
     mae2_list, mae3_list, mae10_list) = (input_list[i] for i in range(9))

    # Columns that must not be used as features. The list does not depend on the fold, so it is
    # built once; dict.fromkeys removes the duplicates produced by the 'target_' pattern match.
    irrelevant_feature_list = [target_name, "driveID", "target_forward", "timestamp_utc", "groupID", "osmID"]
    irrelevant_feature_list.extend(df.columns[df.columns.str.contains('target_')])
    irrelevant_feature_list = list(dict.fromkeys(irrelevant_feature_list))

    # Speed threshold for the "s > 10" metrics.
    # NOTE(review): dividing by 3.6 assumes 'gps_speed' is stored in m/s - confirm.
    threshold_kmh = 10
    speed_threshold = threshold_kmh / 3.6

    unique_ids = df['driveID'].unique()
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for train_positions, test_positions in kfold.split(unique_ids):
        # KFold.split yields POSITIONS into unique_ids, not the IDs themselves; map them back to
        # real driveIDs before filtering the DataFrame.
        train_ids = unique_ids[train_positions]
        test_ids = unique_ids[test_positions]

        # split data by drive; the training rows are shuffled reproducibly
        df_train = df[df['driveID'].isin(train_ids)].sample(frac=1, random_state=42)
        df_test = df[df['driveID'].isin(test_ids)]
        y_train = df_train[target_name]
        y_test = df_test[target_name]

        # drop irrelevant data
        X_train = df_train.drop(irrelevant_feature_list, axis=1)
        X_test = df_test.drop(irrelevant_feature_list, axis=1)

        # train model
        model = LGBMRegressor(boosting_type='gbdt', objective='regression')
        model.fit(X_train, y_train)
        # NOTE(review): called on every fold with identical arguments - presumably each export
        # replaces the previous one; confirm in convert_to_onnx.
        convert_to_onnx(model, "best_case", target_name)

        # Positional boolean masks for the acceleration and speed subsets. Using masks instead of
        # label-based .loc lookups keeps rows and predictions aligned even if the index has duplicates.
        abs_target = y_test.abs().to_numpy()
        mask_2 = abs_target > 2
        mask_3 = abs_target > 3
        mask_speed = (df_test['gps_speed'] > speed_threshold).to_numpy()

        # Predict once on the full test partition and slice the predictions per subset.
        y_pred = model.predict(X_test)
        subsets = [
            (y_test[mask_2], y_pred[mask_2], mse2_list, mae2_list),
            (y_test[mask_3], y_pred[mask_3], mse3_list, mae3_list),
            (y_test[mask_speed], y_pred[mask_speed], mse10_list, mae10_list),
        ]

        # store results
        mse_list.append(metrics.mean_squared_error(y_test, y_pred))
        r2_list.append(r2_score_oos(y_test, y_pred, y_train.mean()))
        mae_list.append(metrics.mean_absolute_error(y_test, y_pred))
        for y_true_sub, y_pred_sub, _, _ in subsets:
            # Touch nothing here; the loop below appends in the original MSE-then-MAE order.
            pass
        for y_true_sub, y_pred_sub, mse_target, _ in subsets:
            mse_target.append(metrics.mean_squared_error(y_true_sub, y_pred_sub))
        for y_true_sub, y_pred_sub, _, mae_target in subsets:
            mae_target.append(metrics.mean_absolute_error(y_true_sub, y_pred_sub))

    return [mse_list, r2_list, mae_list, mse2_list, mse3_list, mse10_list, mae2_list, mae3_list, mae10_list]

In [None]:
def train_with_train_test_split(df):
    """
    Fit a LightGBM regressor for "target_forward" on every row of the given DataFrame.

    Despite its name, this function does not split the data and does not evaluate the model:
    the regressor is fitted on the complete DataFrame and then discarded when the function
    returns. The ONNX export that used to follow the fit is currently disabled.

    Parameters:
    - df (pd.DataFrame): The input DataFrame. It must contain "target_forward" as well as the
      metadata columns "driveID", "timestamp_utc", "groupID" and "osmID".

    Returns:
    None

    Notes:
    - Every column whose name contains 'target_' and the metadata columns above are removed
      from the feature matrix.
    - The remaining features are cast to float32 (originally done for the ONNX conversion).
    - The model uses LightGBM's 'gbdt' boosting type with the 'regression' objective.
    """
    target_name = "target_forward"

    # labels
    labels = df[target_name]

    # features: everything except the target columns and the metadata columns
    target_like_columns = df.columns[df.columns.str.contains('target_')]
    columns_to_remove = [target_name, "driveID", "timestamp_utc", "groupID", "osmID", *target_like_columns]
    features = df.drop(columns_to_remove, axis=1).astype("float32")

    # fit the regressor on the full data
    regressor = LGBMRegressor(boosting_type='gbdt', objective='regression')
    regressor.fit(features, labels)

    # ONNX export is intentionally switched off at the moment:
    #convert_to_onnx(model, "best_case", target_name, len(X.columns))

## Load in Data

In [None]:
# Load the V4 dataset. NaN handling is switched off in the loader (dropna=False) and done
# explicitly right after loading; the three drop_*_invalid flags are passed as True.
# NOTE(review): presumably those flags make the loader discard rows with invalid
# accelerometer / gyroscope / GPS readings - confirm in preprocessing.load_data.
df = load_data(
    save_dir='/home/stud03/data_science_challenge/data/V4',
    dropna=False,
    drop_accel_invalid=True,
    drop_gyro_invalid=True,
    drop_gps_invalid=True,
).dropna()
# The next line removes the GPS position columns; comment it out if positions should be
# available as features during training (Step 2)
df = df.drop(columns=['gps_lat', 'gps_long', 'gps_lat_m', 'gps_long_m'])

In [None]:
print(df.shape)
# Minimum number of rows a drive needs in order to be kept
min_rows_per_drive = 10

# Number of rows per driveID
drive_counts = df['driveID'].value_counts()

# driveIDs with at least `min_rows_per_drive` rows
enough_rows = (drive_counts >= min_rows_per_drive).to_numpy()
valid_drive_ids = drive_counts.index[enough_rows]

# Keep only the rows that belong to one of those drives
keep_row = df['driveID'].isin(valid_drive_ids)
df = df[keep_row]
print(df.shape)

## Train 1 LightGBM Model with all features

In [None]:
# Cross-validate one model for 'target_left' on the complete DataFrame.
target_name = 'target_left'

# One empty accumulator per metric; train_in_cv appends one value per fold to each of them.
(mse_list, r2_list, mae_list,
 mse2_list, mse3_list, mse10_list,
 mae2_list, mae3_list, mae10_list) = ([] for _ in range(9))

input_list = [mse_list, r2_list, mae_list, mse2_list, mse3_list, mse10_list, mae2_list, mae3_list, mae10_list]

result_list = train_in_cv(df, target_name, input_list)

# Report the mean of every metric over the folds, in the order of result_list.
metric_labels = (
    "MSE: ",
    "R^2: ",
    "MAE: ",
    "MSE a > 2: ",
    "MSE a > 3: ",
    "MSE s > 10: ",
    "MAE a > 2: ",
    "MAE a > 3: ",
    "MAE s > 10: ",
)
for metric_label, fold_values in zip(metric_labels, result_list):
    print(metric_label, np.array(fold_values).mean())

## Train one LightGBM model per groupID with all features
## The reported result is the mean over all folds of all group models

In [None]:
# Cross-validate one 'target_forward' model per groupID and average the metrics over
# every fold of every group.
target_name = 'target_forward'

mse_list = []
r2_list = []
mae_list = []
mse2_list = []
mse3_list = []
mse10_list = []
mae2_list = []
mae3_list = []
mae10_list = []

result_list = [mse_list, r2_list, mae_list, mse2_list, mse3_list, mse10_list, mae2_list, mae3_list, mae10_list]

for group_id in range(1, 10):
    sub_df = df[df["groupID"] == group_id]
    if sub_df.empty:
        # Nothing to train on for this group; skip it instead of failing inside the CV split.
        print("Skipping groupID", group_id, "- no rows")
        continue
    # Train on the rows of this group only (the full df was passed here before, so the
    # per-group subset was never used). The metric lists keep accumulating across groups.
    result_list = train_in_cv(sub_df, target_name, result_list)

print("MSE: ", np.array(result_list[0]).mean())
print("R^2: ",np.array(result_list[1]).mean())
print("MAE: ",np.array(result_list[2]).mean())
print("MSE a > 2: ",np.array(result_list[3]).mean())
print("MSE a > 3: ",np.array(result_list[4]).mean())
print("MSE s > 10: ",np.array(result_list[5]).mean())
print("MAE a > 2: ",np.array(result_list[6]).mean())
print("MAE a > 3: ",np.array(result_list[7]).mean())
print("MAE s > 10: ",np.array(result_list[8]).mean())

### Train a single model on the full dataset (`train_with_train_test_split` currently performs no train/test split)

In [None]:
# Fit the "target_forward" regressor on the filtered DataFrame; the call returns None and
# the fitted model is not kept.
train_with_train_test_split(df)