# Decision Tree

## Import Statements

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer 
import math

## Data Import

In [2]:
# Load the preprocessed dataset. The file is decoded as windows-1252 and the
# path is relative to the kernel's working directory, so the notebook must be
# started from a folder where '../Final Model/data/' resolves.
df = pd.read_csv(
    '../Final Model/data/preprocessed_data.csv',
    encoding='windows-1252',
)

# Disabled sanity check: ratio of rows with Label == 1 to rows with Label == 0.
# (df['Label'] == 1).sum() / (df['Label'] == 0).sum()

# Preview the first rows (rendered as the cell output).
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../Final Model/data/preprocessed_data.csv'

In [None]:
# Partition the tickets by type. 'BTicketType' is only needed to build the two
# subsets, so it is dropped from each of them afterwards.
repair_mask = df['BTicketType'] == 'Repair'
other_mask = df['BTicketType'].isin(['Log', 'Capped', 'Prescribed'])

# Repair tickets only; rows whose AdjustedPrice is exactly 0 are excluded.
df_repair = df.loc[repair_mask].drop(columns=['BTicketType'])
df_repair = df_repair.loc[df_repair['AdjustedPrice'] != 0].copy()

# Log, Capped and Prescribed tickets combined. Unlike the repair subset,
# zero-valued AdjustedPrice rows are not filtered out here.
df_other = df.loc[other_mask].drop(columns=['BTicketType'])


In [None]:
# Report the (rows, columns) size of each subset produced by the partition.
repair_shape, other_shape = df_repair.shape, df_other.shape
print(f"Shape of the 'Repair' DataFrame: {repair_shape}")
print(f"Shape of the 'Other' DataFrame: {other_shape}")

# Preview the repair subset (rendered as the cell output).
df_repair.head()

Shape of the 'Repair' DataFrame: (328444, 7)
Shape of the 'Other' DataFrame: (271682, 7)


Unnamed: 0,VMakeModel,VMake,VYear,Distance,Months,AdjustedPrice,Label
0,HYUNDAI I20,HYUNDAI,2012.0,,,347.0,1
3,NISSAN NAVARA,NISSAN,2018.0,,,135.0,1
4,MAZDA 3,MAZDA,2015.0,,,140.0,1
9,TOYOTA HIACE,TOYOTA,2015.0,,,41.0,1
10,TOYOTA HIACE,TOYOTA,2015.0,,,17.0,1


## Extract Features and Labels

In [None]:
# 'AdjustedPrice' is the regression target; 'Label' is removed from the
# features as well, so neither column is seen by the models as an input.
non_feature_cols = ['AdjustedPrice', 'Label']

# Repairs
y_repair = df_repair['AdjustedPrice']
X_repair = df_repair.drop(columns=non_feature_cols)

# Other (Log, Capped, Prescribed)
y_other = df_other['AdjustedPrice']
X_other = df_other.drop(columns=non_feature_cols)

## Train Val Test Split

In [None]:
# 80/10/10 split for each subset: 20% is held out first, then that hold-out is
# halved into validation and test. Both stages use random_state=42 so the
# partitions are reproducible.

# Repairs
(X_repair_train, X_repair_temp,
 y_repair_train, y_repair_temp) = train_test_split(
    X_repair, y_repair, test_size=0.2, random_state=42
)
(X_repair_val, X_repair_test,
 y_repair_val, y_repair_test) = train_test_split(
    X_repair_temp, y_repair_temp, test_size=0.5, random_state=42
)

# Other (Log, Capped, Prescribed)
(X_other_train, X_other_temp,
 y_other_train, y_other_temp) = train_test_split(
    X_other, y_other, test_size=0.2, random_state=42
)
(X_other_val, X_other_test,
 y_other_val, y_other_test) = train_test_split(
    X_other_temp, y_other_temp, test_size=0.5, random_state=42
)

## Encoding Data

In [None]:
# Columns handled by each branch of the preprocessing. Both subsets currently
# use the same columns, but the lists are kept separate per model.
categorical_cols_repair = ['VMakeModel', 'VMake']        # one-hot encoded
numerical_cols_repair = ['VYear', 'Distance', 'Months']  # imputed + standardised

categorical_cols_other = ['VMakeModel', 'VMake']
numerical_cols_other = ['VYear', 'Distance', 'Months']

# Numeric branch: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


def _build_preprocessor(numeric_cols, categorical_cols):
    """Combine the numeric and categorical branches into a ColumnTransformer.

    Columns not named in either list are passed through unchanged.
    """
    return ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols),
        ],
        remainder='passthrough',
    )


preprocessor_repair = _build_preprocessor(numerical_cols_repair, categorical_cols_repair)
preprocessor_other = _build_preprocessor(numerical_cols_other, categorical_cols_other)



In [None]:
# Model pipelines
dt_pipeline_repair = Pipeline(steps=[
    ('preprocessor', preprocessor_repair),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

dt_pipeline_other = Pipeline(steps=[
    ('preprocessor', preprocessor_other),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

# Defining gridsearch parameter
param_grid_dt = {
    'regressor__max_depth': [None, 5, 10, 20],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': ['sqrt', 'log2', None]
}

# Search for repair model
random_search_dt = RandomizedSearchCV(
    dt_pipeline_repair,
    param_distributions=param_grid_dt,
    n_iter=20,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# Search for the Other model
random_search_other = RandomizedSearchCV(
    dt_pipeline_other,
    param_distributions=param_grid_dt,
    n_iter=20,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)



## Creating and Training the Model

In [None]:
# Fitting parameters
random_search_dt.fit(X_repair_train, y_repair_train)
dt_pipeline_repair = random_search_dt.best_estimator_
random_search_other.fit(X_other_train, y_other_train)
dt_pipeline_other = random_search_other.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits




Fitting 5 folds for each of 20 candidates, totalling 100 fits


## Evaluation

In [None]:
# Predictions
y_test_pred_repair = dt_pipeline_repair.predict(X_repair_test)
y_test_pred_other = dt_pipeline_other.predict(X_other_test)

# Evaluation of Repair Model
test_r2_repair = r2_score(y_repair_test, y_test_pred_repair)
test_mape_repair = np.mean(np.abs((y_repair_test - y_test_pred_repair) / y_repair_test))
test_rmse_repair = math.sqrt(mean_squared_error(y_repair_test, y_test_pred_repair))

# Evaluation of Other Model
test_r2_other = r2_score(y_other_test, y_test_pred_other)
test_mape_other = np.mean(np.abs((y_other_test - y_test_pred_other) / y_other_test))
test_rmse_other = math.sqrt(mean_squared_error(y_other_test, y_test_pred_other))


print("Repairs")
print(f"Test R-squared: {test_r2_repair:.2f}")
print(f"Test MAPE: {test_mape_repair:.2f}")
print(f"Test RMSE: {test_rmse_repair:.2f}\n")

print("Log, Capped, Prescribed Tickets")
print(f"Test R-squared: {test_r2_other:.2f}")
print(f"Test MAPE: {test_mape_other:.2f}")
print(f"Test RMSE: {test_rmse_other:.2f}\n")


Repairs
Test R-squared: 0.02
Test MAPE: 3.07
Test RMSE: 910.39

Log, Capped, Prescribed Tickets
Test R-squared: 0.49
Test MAPE: 0.24
Test RMSE: 233.64



