# Decision Tree

## Import Statements

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import confusion_matrix, classification_report

## Data Import

In [2]:
# Load the preprocessed dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/preprocessed_data.csv')

# A bare `df.head()` in the middle of a cell is evaluated and then discarded,
# because only a cell's last expression is rendered. Display the preview explicitly.
display(df.head())

# Class balance: number of rows with Label == 1 for every row with Label == 0.
(df['Label'] == 1).sum() / (df['Label'] == 0).sum()


np.float64(5.987466197364467)

## Extract Features and Labels

In [3]:
# Regression target: the model is trained to predict AdjustedPrice.
y = df['AdjustedPrice']

# Feature matrix: every column except the target and the Label column.
X = df.drop(['AdjustedPrice', 'Label'], axis=1)

## Train Val Test Split

In [4]:
# Stage 1: hold out 20% of the rows; the remaining 80% becomes the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Stage 2: halve the held-out rows -> 10% validation and 10% test overall.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

## Encoding Data

In [5]:
# NOTE(review): these two lists are not referenced by any other visible cell.
train_features = ['VMakeModel', 'VYear']
eval_features = ['VMakeModel', 'VYear']

# Column groups routed to each preprocessing step.
categorical_cols = ['VMakeModel', 'VMake']  # one-hot encoded
numerical_cols = ['VYear', 'Distance', 'Months']  # standardised

# `OneHotEncoder` is imported twice in the imports cell (scikit-learn first, then
# category_encoders), so the name bound here is the category_encoders class.
# 'ignore' is a scikit-learn option and is not among the values category_encoders
# documents ('error', 'return_nan', 'value', 'indicator'); 'value' is the documented
# way to encode categories unseen during fit.
# NOTE(review): if the scikit-learn encoder was the intended one, drop the
# category_encoders import and restore handle_unknown='ignore' instead.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='value'), categorical_cols),
        ('scaler', StandardScaler(), numerical_cols),
    ],
    remainder='passthrough',  # columns not listed above are forwarded unchanged
)

## Creating and Training the Model

In [6]:
# Chain preprocessing and the regressor so both are fitted together on the
# training split and applied identically at prediction time.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

pipeline.fit(X_train, y_train)  # Fit encoder, scaler and tree on the training data only

y_train_pred = pipeline.predict(X_train)  # In-sample predictions used as the baseline

# Report the baseline next to the validation error: the gap between the two
# rows is the direct measure of how much the tree overfits.
for split_name, y_actual, y_hat in (
    ('Train', y_train, y_train_pred),
    ('Validation', y_val, pipeline.predict(X_val)),
):
    print(
        f"{split_name} MAE: {mean_absolute_error(y_actual, y_hat):.2f} | "
        f"R2: {r2_score(y_actual, y_hat):.3f}"
    )

## Evaluation

In [None]:
# Absolute price residuals on the validation split are used to calibrate the
# anomaly threshold, so the threshold is not tuned on the rows the tree was fitted to.
y_val_pred_prices = pipeline.predict(X_val)
residuals = np.abs(y_val - y_val_pred_prices)

# Threshold = 95th percentile of the validation residuals, i.e. roughly the
# largest 5% of validation errors lie above it.
# (Alternative noted by the author: mean residual + 2 * standard deviation.)
anomaly_threshold = np.percentile(residuals, 95)
print(f"Anomaly Threshold: {anomaly_threshold:.2f}")

# The test split is scored in the evaluation cell that follows; the copy of that
# code that used to sit here predicted the test set a second time for no benefit.


Anomaly Threshold: 452.17


In [8]:
# Score the held-out test split: a row is marked 1 when its absolute price
# error is above the threshold calibrated on the validation split, else 0.
y_test_pred_prices = pipeline.predict(X_test)
test_abs_error = np.abs(y_test - y_test_pred_prices)
y_pred_anomaly_labels = (test_abs_error > anomaly_threshold).astype(int)

# Ground-truth labels, looked up by the original row index of the test split.
# NOTE(review): the class-ratio output in the data-import cell shows Label == 1 is
# about six times as common as Label == 0, while the rule above marks only the
# above-threshold residuals as 1 - confirm which Label value denotes an anomaly.
y_true_labels = df['Label'].loc[y_test.index]

test_confusion = confusion_matrix(y_true_labels, y_pred_anomaly_labels)
test_report = classification_report(y_true_labels, y_pred_anomaly_labels)

print("Anomaly Detection Evaluation")
print("Confusion Matrix:")
print(test_confusion)

print("\nClassification Report:")
print(test_report)

Anomaly Detection Evaluation
Confusion Matrix:
[[ 2129   159]
 [13618   373]]

Classification Report:
              precision    recall  f1-score   support

           0       0.14      0.93      0.24      2288
           1       0.70      0.03      0.05     13991

    accuracy                           0.15     16279
   macro avg       0.42      0.48      0.14     16279
weighted avg       0.62      0.15      0.08     16279



Overall, the model performs poorly, as expected. The unconstrained decision tree overfits the training data a lot and does not pick up the more complex patterns it would need to generalise.