# Feature Engineering

In [None]:
# Necessary libraries
import pandas as pd
import numpy as np
import xgboost as xgb
import itertools
import time
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import lightgbm as lgb

In [None]:
# Read the cleaned listings snapshot into a DataFrame
df_cars = pd.read_csv('cleaned_data_july_21st.csv')

# Candidate columns from which model feature subsets are drawn
possible_features = [
    'Year',
    'Model',
    'State',
    'Mileage',
    'Trim',
    'Make',
    'Body Style',
    'City',
]

print('Data loaded & processed')

Data loaded & processed


## Test All Feature Combinations

In [None]:
def evaluate_feature_combinations(df_cars, target, possible_features,
                                  categorical_candidates=None,
                                  numerical_candidates=None):
    """
    Train and evaluate a LightGBM model using different combinations of features.

    Every subset of ``possible_features`` with at least two columns is fitted
    with a default ``LGBMRegressor`` on an 80/20 split (``random_state=1``).
    For each subset size, the subset with the highest test-set R2 is kept.

    Args:
        df_cars: DataFrame holding the feature columns and the target column.
        target: Name of the column to predict.
        possible_features: Column names from which feature subsets are drawn.
        categorical_candidates: Columns converted to pandas ``category`` dtype
            whenever they appear in a subset. Defaults to
            ('Model', 'State', 'Trim', 'Make', 'Body Style', 'City').
        numerical_candidates: Columns standardized whenever they appear in a
            subset. Defaults to ('Year', 'Mileage').

    Returns:
        dict mapping subset size to a dict with keys 'features', 'MAE', 'MSE',
        'R2' and 'time' (seconds spent fitting, predicting and scoring the
        winning subset). Empty when fewer than two features are supplied.
    """
    if categorical_candidates is None:
        categorical_candidates = ('Model', 'State', 'Trim', 'Make', 'Body Style', 'City')
    if numerical_candidates is None:
        numerical_candidates = ('Year', 'Mileage')

    best_results = {}

    # The target does not depend on the feature subset, so select it once
    y = df_cars[target]

    # Iterate over all possible feature combos, grouped by subset size
    for r in range(2, len(possible_features) + 1):
        best_result = None

        for combo in itertools.combinations(possible_features, r):
            combo_list = list(combo)
            X = df_cars[combo_list].copy()

            # Convert categorical columns before splitting so the train and
            # test frames share the same category sets
            categorical_features = [col for col in combo_list if col in categorical_candidates]
            if categorical_features:
                X[categorical_features] = X[categorical_features].astype('category')

            # Split the data
            train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=1)

            # Fit the scaler on the training split only, then apply it to the
            # test split, so no test-set statistics leak into preprocessing
            numerical_cols = [col for col in numerical_candidates if col in combo_list]
            if numerical_cols:
                train_X = train_X.copy()
                test_X = test_X.copy()
                scaler = StandardScaler()
                train_X[numerical_cols] = scaler.fit_transform(train_X[numerical_cols])
                test_X[numerical_cols] = scaler.transform(test_X[numerical_cols])

            # Time fitting, prediction and scoring with a monotonic clock
            start_time = time.perf_counter()

            # Train LightGBM model on current feature combo
            lightgbm_model = lgb.LGBMRegressor(n_jobs=-1, verbose=-1)
            lightgbm_model.fit(train_X, train_y, categorical_feature=categorical_features)

            # Predict on test data and score
            pred_lightgbm = lightgbm_model.predict(test_X)
            mae = metrics.mean_absolute_error(test_y, pred_lightgbm)
            mse = metrics.mean_squared_error(test_y, pred_lightgbm)
            r2 = metrics.r2_score(test_y, pred_lightgbm)

            elapsed_time = time.perf_counter() - start_time

            # Keep this combo if it beats the best R2 seen for this subset size
            if best_result is None or r2 > best_result['R2']:
                best_result = {
                    'features': combo_list,
                    'MAE': mae,
                    'MSE': mse,
                    'R2': r2,
                    'time': elapsed_time
                }

        # Store the best result for this feature count
        if best_result is not None:
            best_results[r] = best_result

    return best_results

# Search every feature subset and keep the top scorer for each subset size
best_results = evaluate_feature_combinations(df_cars, 'Price', possible_features)

# Report the winning subset per size as one summary block each
for feature_count, result in best_results.items():
    report_lines = (
        f"\033[1mBest LightGBM Performance for {feature_count} Features:\033[0m",
        f"Features: {result['features']}",
        f'Mean Absolute Error (MAE): $ {result["MAE"]:.2f}',
        f'Mean Squared Error  (MSE): {int(result["MSE"]):,}',
        f'R2 Score             (R2): {result["R2"]:.4f}',
        f"Execution Time: {result['time']:.1f} seconds\n",
    )
    print("\n".join(report_lines))

[1mBest LightGBM Performance for 2 Features:[0m
Features: ['Year', 'Model']
Mean Absolute Error (MAE): $ 3195.93
Mean Squared Error  (MSE): 23,809,094
R2 Score             (R2): 0.8633
Execution Time: 0.2 seconds

[1mBest LightGBM Performance for 3 Features:[0m
Features: ['Year', 'Model', 'Trim']
Mean Absolute Error (MAE): $ 2382.08
Mean Squared Error  (MSE): 12,685,900
R2 Score             (R2): 0.9271
Execution Time: 0.2 seconds

[1mBest LightGBM Performance for 4 Features:[0m
Features: ['Year', 'Model', 'Mileage', 'Trim']
Mean Absolute Error (MAE): $ 1789.99
Mean Squared Error  (MSE): 8,485,671
R2 Score             (R2): 0.9513
Execution Time: 0.2 seconds

[1mBest LightGBM Performance for 5 Features:[0m
Features: ['Year', 'Model', 'Mileage', 'Trim', 'Body Style']
Mean Absolute Error (MAE): $ 1796.36
Mean Squared Error  (MSE): 8,257,580
R2 Score             (R2): 0.9526
Execution Time: 0.2 seconds

[1mBest LightGBM Performance for 6 Features:[0m
Features: ['Year', 'Model', 

In [None]:

The biggest jump in terms of MAE and MSE comes at 4 features. The returns begin to diminish after that.

## **Conclusion**: 

8 features has the best MAE, but 7 features has the best MSE. Removing City removes nearly 3,000 unique values and makes my model more efficient. I will apply the 7-feature model to the validation set and test the results to see which model I should optimize. Applying it to the validation set will help tell me which model generalizes better.

In [None]:
# Define features and target (the candidate feature list without 'City')
features = ['Year', 'Model', 'State', 'Mileage', 'Trim', 'Make', 'Body Style']
X = df_cars[features].copy()
y = df_cars['Price']

# Specify categorical features
categorical_features = ['Model', 'State', 'Trim', 'Make', 'Body Style']

# Converting categorical features (XGB & LightGBM use category, Catboost uses string)
X[categorical_features] = X[categorical_features].astype('category')

# Scale numerical features; the fitted scaler is reused on the validation data below
scaler = StandardScaler()
X[['Year', 'Mileage']] = scaler.fit_transform(X[['Year', 'Mileage']])

# Split the data with the same split settings as the feature-combination search.
# Only the training portion is used in this cell; test_X / test_y are not evaluated here.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=1)

# Train the 7-feature LightGBM model
lightgbm_model = lgb.LGBMRegressor(n_jobs=-1, verbose=-1)
lightgbm_model.fit(train_X, train_y, categorical_feature=categorical_features)

# Load validation data
df_validation = pd.read_csv('cleaned_data_aug_16th.csv')

# Drop irrelevant columns
df_validation.drop(columns=['Listing ID', 'Stock Type'], inplace=True)

# Ensure non-numerical columns are formatted as 'category'
df_validation[categorical_features] = df_validation[categorical_features].astype('category')

# Build the validation feature matrix and scale it with the scaler fitted above
X_validation = df_validation[features].copy()
X_validation[['Year', 'Mileage']] = scaler.transform(X_validation[['Year', 'Mileage']])
X_validation[['Year', 'Mileage']] = X_validation[['Year', 'Mileage']].astype('float64')

# Align each categorical column to the category set seen during training so the
# category codes mean the same thing at predict time; values the model never
# saw become missing instead of being mapped to an unrelated code
for col in categorical_features:
    X_validation[col] = X_validation[col].cat.set_categories(X[col].cat.categories)

# Predict validation data using the model trained above
pred_light = lightgbm_model.predict(X_validation)

# Define validation target variable
y_validation = df_validation['Price'].values

# Compute each validation metric once
validation_mae = metrics.mean_absolute_error(y_validation, pred_light)
validation_mse = metrics.mean_squared_error(y_validation, pred_light)
validation_r2 = metrics.r2_score(y_validation, pred_light)

# Print validation data performance
print('\n\033[1m(7 Features) LightGBM Regressor Performance on Validation Data from 8/15:\033[0m')
print(f'Mean Absolute Error (MAE): $ {round(validation_mae, 2):,}')
print(f'Mean Squared Error  (MSE): {int(round(validation_mse)):,}')
print(f'R2 Score             (R2): {round(validation_r2, 4)}')

# NOTE(review): the 8-feature figures below are hard-coded literals, presumably
# copied from a separate 8-feature run; they are not recomputed in this cell.
# The header now ends with the ANSI reset so bold does not bleed into later output.
print('\n\033[1m(8 Features) LightGBM Regressor Performance on Validation Data from 8/15:\033[0m')
print('Mean Absolute Error (MAE): $ 1,898.75')
print('Mean Squared Error  (MSE): 10,302,178')
print('R2 Score             (R2): 0.9397')


[1m(7 Features) LightGBM Regressor Performance on Validation Data from 8/15:[0m
Mean Absolute Error (MAE): $ 1,913.13
Mean Squared Error  (MSE): 9,884,403
R2 Score             (R2): 0.9421

[1m(8 Features) LightGBM Regressor Performance on Validation Data from 8/15:
Mean Absolute Error (MAE): $ 1,898.75
Mean Squared Error  (MSE): 10,302,178
R2 Score             (R2): 0.9397


## **Conclusion**:

7 features still appears to be the best combination. The difference in MAE is negligible in comparison to the target range, and the better MSE shows it is handling large errors well and is likely better at generalizing when introduced to new data. 7 features also reduces unique values for categorical columns by more than 3,000. 7 features captures the best range of everything.