# Preprocessing Pipeline

## Import Statements

In [1]:
import pandas as pd
# Display settings: None removes the column/row cap when a DataFrame is shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np

## Functions

In [2]:
def remove_outliers(df, group_by="TaskName", value_col="price"):
    """Drop rows whose value is an IQR outlier within its own group.

    For every group defined by ``group_by`` the first and third quartiles of
    ``value_col`` are computed, and only rows whose value lies inside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (bounds inclusive) are kept.

    Args:
        df: Input DataFrame; it is not modified.
        group_by: Column name (or list of column names) to group on.
        value_col: Numeric column the outlier test is applied to.

    Returns:
        A new DataFrame with every original column, the original index
        labels and the original row order. Rows with a missing value in
        ``value_col`` or a missing group key are dropped, because they
        cannot satisfy the bounds check.
    """
    grouped = df.groupby(group_by)[value_col]
    # Broadcast the per-group quartiles back onto the rows rather than calling
    # ``groupby(...).apply`` on the whole frame: that form is deprecated when
    # the applied function sees the grouping columns, and it may hand rows
    # back in group order instead of the caller's order.
    q1 = grouped.transform(lambda values: values.quantile(0.25))
    q3 = grouped.transform(lambda values: values.quantile(0.75))
    iqr = q3 - q1
    within_bounds = df[value_col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    return df[within_bounds].copy()

In [3]:
def drop_rare(df, column, threshold, df_name="DataFrame"):
    """Remove rows whose value in ``column`` occurs fewer than ``threshold`` times.

    Prints the number of distinct values in ``column`` before and after the
    filter, labelled with ``df_name``.

    Args:
        df: Input DataFrame; it is not modified.
        column: Name of the column whose infrequent values are removed.
        threshold: Minimum number of occurrences a value needs to be kept.
        df_name: Label used in the printed progress messages.

    Returns:
        A filtered copy of ``df``.
    """
    print(f"----Dropping Rare {column}s from {df_name}----")
    print(f"{column}s before: {len(df[column].unique())}")

    frequencies = df[column].value_counts()
    too_rare = frequencies.loc[frequencies < threshold].index
    is_rare = df[column].isin(too_rare)
    filtered = df.loc[~is_rare].copy()

    print(f"{column}s after: {len(filtered[column].unique())}")
    return filtered

## Read in Data

In [4]:
# CPI table, used further down to adjust ticket prices (joined on 'Quarter').
# NOTE(review): the path is relative to the working directory; the recorded run
# of this cell ended in FileNotFoundError, so confirm where the file lives.
CPI_CSV_PATH = 'data/cpi_data.csv'
cpi_index = pd.read_csv(CPI_CSV_PATH)
cpi_index.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/cpi_data.csv'

In [None]:
# Booking/Ticket data
# NOTE(review): the file is decoded as windows-1252, yet a later cell drops a
# column named 'ï»¿FCID', which looks like a UTF-8 byte-order mark read with the
# wrong codec. Confirm the real encoding of the export.
TICKET_CSV_PATH = 'data/ticket_data.csv'
df = pd.read_csv(TICKET_CSV_PATH, encoding='windows-1252')
df.head()

## Rename Columns

In [None]:
# Raw export header -> readable column name used by the rest of the notebook.
COLUMN_RENAMES = {
    'cVMake': 'Make',
    'cVMakeModel': 'Model',
    'cVYear': 'Year',
    'idFuel': 'FuelType',
    'idLitres': 'EngineSize',
    'idTransmission': 'Transmission',
    'idDrive': 'DriveType',
    'idIsHybrid': 'IsHybrid',
    'BOdoNum': 'Odometer',
}
df = df.rename(columns=COLUMN_RENAMES)

## Drop Columns that will not be used

In [None]:
# Columns that are not needed downstream. The first name is spelled with the
# three leading characters exactly as they show up in the loaded header.
columns_to_drop = [
    'ï»¿FCID',
    'BOdoText',
    'VMid',
]
df = df.drop(columns=columns_to_drop)
df.head()

## Remove Rows


In [None]:
# Remove custom services and repairs.
# The flag columns are compared numerically: when read_csv parses a 0/1 column
# as integers, a comparison against the string '1' is never equal and would
# leave every flagged row in place. Values that are not numeric become NaN and
# are kept, exactly as a failed string comparison would keep them.
is_custom_service = pd.to_numeric(df['IsCustomService'], errors='coerce') == 1
is_custom_repair = pd.to_numeric(df['IsCustomRepair'], errors='coerce') == 1

# 'Custom Repair' tasks are removed together with the flagged rows.
# Products and tyres are removed as they are unpredictable without quantity information.
excluded_tasks = ['Custom Repair', '((Products))', '((Tyres))', 'Tyre Replacement']
is_excluded_task = df['TaskName'].isin(excluded_tasks)

# Rare ticket types
rare_ticket_types = ['OtherTicket', 'Custom', 'Basic']
is_rare_ticket_type = df['BTicketType'].isin(rare_ticket_types)

# Tickets must have a strictly positive price
has_positive_price = df['PriceIncGSTRaw'] > 0

keep = (
    ~is_custom_service
    & ~is_custom_repair
    & ~is_excluded_task
    & ~is_rare_ticket_type
    & has_positive_price
)
# Copy so that columns added later are written to an independent frame rather
# than to a filtered view of the raw data.
df = df[keep].copy()

## Create Distance and Months Columns from TaskName (per Ticket Type)

In [None]:
def _split_once(values, sep):
    """Split each string at the first ``sep`` and always return columns 0 and 1.

    ``str.split(..., expand=True)`` only creates as many columns as the data
    needs, so column 1 (or both columns) can be missing when no value contains
    ``sep`` or when ``values`` is empty. Reindexing guarantees both exist; the
    cast to object keeps the ``.str`` accessor usable on an all-missing column.
    """
    parts = values.str.split(sep, n=1, expand=True)
    return parts.reindex(columns=[0, 1]).astype(object)


# Create Columns
df['Distance'] = None
df['Months'] = None

# Log tickets: the task name is split at the first ' - ', and the part after it
# is split again at the first ' / ' into distance text and months text.
is_log = df['BTicketType'] == 'Log'
name_parts = _split_once(df.loc[is_log, 'TaskName'], ' - ')
interval_parts = _split_once(name_parts[1].fillna(''), ' / ')
df.loc[is_log, 'TaskName'] = name_parts[0]
df.loc[is_log, 'Distance'] = interval_parts[0]
df.loc[is_log, 'Months'] = interval_parts[1]

# Capped and Prescribed tickets: everything after the first ' - ' is the
# distance text; no months are extracted for these types.
for ticket_type in ('Capped', 'Prescribed'):
    is_type = df['BTicketType'] == ticket_type
    name_parts = _split_once(df.loc[is_type, 'TaskName'], ' - ')
    df.loc[is_type, 'TaskName'] = name_parts[0]
    df.loc[is_type, 'Distance'] = name_parts[1].fillna('')

In [None]:
# Format as Numeric
# Both columns hold free text at this point; keep the first run of digits and
# convert it to a number (anything without digits becomes NaN).
first_digits = r'(\d+)'

months_text = df['Months'].astype(str)
df['Months'] = pd.to_numeric(months_text.str.extract(first_digits, expand=False), errors='coerce')

# Thousands separators are removed first so the digits form a single run.
distance_text = df['Distance'].astype(str).str.replace(',', '', regex=False)
df['Distance'] = pd.to_numeric(distance_text.str.extract(first_digits, expand=False), errors='coerce')

df.head()

## Create Adjusted Price based on CPI

In [None]:
# 1. Create date column
# Parse the creation timestamps (mixed formats, day-first when ambiguous) and
# keep the calendar date in its own column.
created_at = pd.to_datetime(df['BCreatedDateAEST'], format='mixed', dayfirst=True)
df['BCreatedDateAEST'] = created_at
df['Date'] = created_at.dt.date

In [None]:
# 2. Price Adjustment
# Take an independent copy of the filtered rows so the column assignments below
# do not write into a view of the previous frame (SettingWithCopyWarning).
df = df[df['PriceIncGSTRaw'] != 0].copy()
df['Date'] = pd.to_datetime(df['Date'])  # ensure 'Date' is in datetime format for the .dt accessor
df['Quarter'] = df['Date'].dt.to_period('Q').astype(str)

# Left join keeps tickets whose quarter has no CPI row; their 'CPI' is NaN.
df = df.merge(cpi_index, on='Quarter', how='left')

# NOTE(review): the last CPI row is used as the base period, which assumes
# cpi_index is ordered oldest-to-newest. Confirm against the CPI file.
base_cpi = cpi_index['CPI'].iloc[-1]
adjusted = (df['PriceIncGSTRaw'] * (base_cpi / df['CPI'])).round(2)

# Use the original price where CPI is missing (for the current quarter, which
# does not need adjustment).
# NOTE(review): this final round() has no digits argument, so the stored price
# is rounded to a whole number although the step above rounds to 2 decimals.
# Confirm which precision is intended.
df['AdjustedPrice'] = adjusted.fillna(df['PriceIncGSTRaw']).round()

In [None]:
# Preview the first rows of the frame after the price adjustment.
df.head()

## Remove Columns that will not be used

In [None]:
# Source columns of the derived features, plus shop and status fields that are
# not used from here on.
columns_to_drop = [
    'BCreatedDateAEST',
    'CPI',
    'Quarter',
    'PriceIncGSTRaw',
    'BShopID',
    'BShopRegionName',
    'BShopPostcode',
    'IsDeleted',
    'BStatusFromDateTimeAEST',
    'Date',
    'BStatusFinal',
]
df = df.drop(columns=columns_to_drop)
df.head()

## Remove Duplicate Rows

In [None]:
# Duplicate rows can skew the analysis, so every row should be unique. Two rows
# count as duplicates when they agree on every column except the ID columns.
print(f"size before: {df.shape}" )
columns_ignored = {'BookingID', 'BTicketID', 'BCreatedDateAEST', 'BStatusFromDateTimeAEST'}
columns_compared = [name for name in df.columns if name not in columns_ignored]
df = df.drop_duplicates(subset=columns_compared)
print(f"size after: {df.shape}" )

## Remove False Negatives 

In [None]:
# False negatives are rows with status '16. Requires Changes' that should be
# '33. Approved'. A booking holds several tickets; when one ticket requires
# changes the whole booking is marked 'Requires Changes', which mislabels the
# tickets that were fine. They are detected as rows that have a twin differing
# only in status, and the approved twin is the one that is kept.
print(f"size before: {df.shape}" )

# Priority 0 for approved rows, 1 for everything else, so approved sorts first.
df['StatusPriority'] = (df['BStatusAfterSubmitted'] != '33. Approved').astype(int)

# Columns that must match for two rows to be considered the same ticket.
not_compared = {'BStatusAfterSubmitted', 'StatusPriority', 'BCreatedDateAEST', 'BStatusFromDateTimeAEST', 'Date'}
dedup_cols = [name for name in df.columns if name not in not_compared]

df = (
    df.sort_values(by=dedup_cols + ['StatusPriority'])  # approved row leads each group of twins
      .drop_duplicates(subset=dedup_cols, keep='first')  # keep that leading row
      .drop(columns='StatusPriority')                    # helper column no longer needed
)

# Label column for model training: 1 = approved, 0 = requires changes / rejected.
status_to_label = {'33. Approved': 1, '16. Requires Changes': 0, '29. Rejected': 0}
df['Label'] = df['BStatusAfterSubmitted'].map(status_to_label)
df = df.drop(columns=['BStatusAfterSubmitted'])  # original status column is no longer needed
print(f"size after: {df.shape}" )

## Remove Rejected Cases

In [None]:
# Keep only approved tickets (Label == 1). The explicit copy makes the result an
# independent frame, so later column assignments do not write into a filtered view.
df = df[df['Label'] == 1].copy()

## Fill in missing distances with odometer

In [None]:
# Where no distance was parsed, fall back to the value in the Odometer column.
# NOTE(review): presumably both columns use the same unit; confirm.
df["Distance"] = df["Distance"].fillna(df["Odometer"])

## Final Column Removal

In [None]:

# Identifier, flag, shop, label and odometer columns are dropped before the
# frames are split and saved.
columns_to_drop = [
    'BookingID',
    'BTicketID',
    'IsCustomService',
    'IsCustomRepair',
    'BShopRegionClass',
    'BShopState',
    'IsHybrid',
    'Label',
    'Odometer',
]
df = df.drop(columns=columns_to_drop)
df.head()

## Split DataFrame by Ticket Type

In [None]:
# One frame per ticket type; the type column is constant within each frame and
# is therefore removed.
df_repair, df_log, df_capped, df_prescribed = (
    df.loc[df['BTicketType'] == ticket_type].drop(columns=['BTicketType'])
    for ticket_type in ('Repair', 'Log', 'Capped', 'Prescribed')
)

## Dropping Rare Cases

In [None]:
# Repairs are thinned by task name (at least 100 rows per task); the service
# frames are thinned by vehicle model (at least 20 rows per model).
df_repair = drop_rare(df_repair, column='TaskName', threshold=100, df_name="Repair DF")
df_log = drop_rare(df_log, column='Model', threshold=20, df_name="Log DF")
df_capped = drop_rare(df_capped, column='Model', threshold=20, df_name="Capped DF")
df_prescribed = drop_rare(df_prescribed, column='Model', threshold=20, df_name="Prescribed DF")

In [None]:
import os

# Output location for each preprocessed frame
paths = {
    "repair": "../backend/data/preprocessed_repair_data.csv",
    "log": "../backend/data/preprocessed_log_data.csv",
    "capped": "../backend/data/preprocessed_capped_data.csv",
    "prescribed": "../backend/data/preprocessed_prescribed_data.csv",
}

# Create every target directory up front (no error if it already exists)
for path in paths.values():
    os.makedirs(os.path.dirname(path), exist_ok=True)

# Write each frame to its CSV, without the index column
frames = {
    "repair": df_repair,
    "log": df_log,
    "capped": df_capped,
    "prescribed": df_prescribed,
}
for name, frame in frames.items():
    frame.to_csv(paths[name], index=False)

# Report the absolute location of every file written
print("All files saved successfully:")
for name, path in paths.items():
    print(f"{name}: {os.path.abspath(path)}")