# Preprocessing Pipeline
## Import Statements

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import sklearn
import category_encoders as ce
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Reading in the data

In [2]:
cpi_index = pd.read_csv('data/cpi.csv')
cpi_index.head()

Unnamed: 0,Quarter,CPI
0,2021Q1,117.9
1,2021Q2,118.8
2,2021Q3,119.7
3,2021Q4,121.3
4,2022Q1,123.9


In [3]:
df = pd.read_csv('data/data.csv', encoding = 'windows-1252')
df.head()

  df = pd.read_csv('data/data.csv', encoding = 'windows-1252')


Unnamed: 0,FCID,BookingID,BCreatedDateAEST,BStatusAfterSubmitted,BStatusFromDateTimeAEST,BStatusFinal,BTicketID,BTicketType,TaskName,IsCustomService,IsCustomRepair,IsDeleted,PriceIncGSTRaw,VYMM,VMakeModel,VMake,VYear,VMid,VRego,BShopID,BShopPostcode,BShopState,BShopRegionName,BShopRegionClass
0,2,1449822,2024-02-19,33. Approved,2024-02-19 16:01:15,64. Completed,2323471,Log,"Logbook Service - 75,000 km / 60 months",0,0,False,464.27,2019 AUDI A5 COUPE,AUDI A5 COUPE,AUDI,2019.0,AUD45780,YPG67B,17177,2606,ACT,Canberra,2.0
1,2,1565721,2024-03-19,33. Approved,2024-03-19 13:56:15,64. Completed,2537851,Capped,"Capped Price Service - 10,000 km",0,0,False,290.0,2022 TOYOTA HILUX,TOYOTA HILUX,TOYOTA,2022.0,TOY47062,1HSB467,1133,6017,WA,Perth,2.0
2,1,1320451,2023-11-13,33. Approved,2023-11-14 13:55:17,64. Completed,2053300,ProductTyreOp,((Products)),0,0,False,60.5,2001 NISSAN PATROL,NISSAN PATROL,NISSAN,2001.0,NIS29550,DE18YN,14790,2263,NSW,Central Coast,3.0
3,1,1320451,2023-11-13,33. Approved,2023-11-14 13:55:17,64. Completed,2053301,ProductTyre,((Products)),0,0,False,1484.0,2001 NISSAN PATROL,NISSAN PATROL,NISSAN,2001.0,NIS29550,DE18YN,14790,2263,NSW,Central Coast,3.0
4,2,1448252,2024-02-19,33. Approved,2024-02-19 11:30:01,64. Completed,2320054,Capped,"Capped Price Service - 105,000 km",0,0,False,358.0,2019 HYUNDAI SANTA FE,HYUNDAI SANTA FE,HYUNDAI,2019.0,HYU45824,1RE6XD,17962,3844,VIC,Traralgon,2.0


## Feature Engineering
1. Create Date Column
2. Create Adjusted Price Column

In [4]:
# 1. Create date column
df['BCreatedDateAEST'] = pd.to_datetime(df['BCreatedDateAEST'],  dayfirst=True);
df['Date'] = df['BCreatedDateAEST'].dt.date

  df['BCreatedDateAEST'] = pd.to_datetime(df['BCreatedDateAEST'],  dayfirst=True);


In [5]:
# 2. Price Adjustment
df = df[df['PriceIncGSTRaw'] != 0]
df['Date'] = pd.to_datetime(df['Date']) # ensure 'date' is in datetime format
df['Quarter'] = df['Date'].dt.to_period('Q').astype(str)
df = df.merge(cpi_index, on='Quarter', how='left')
base_cpi = cpi_index['CPI'].iloc[-1]
df['AdjustedPrice'] = round(df['PriceIncGSTRaw'] * (base_cpi / df['CPI']), 2) # round to 2 decimal places for consistency
df['AdjustedPrice'] = round(df['AdjustedPrice'].fillna(df['PriceIncGSTRaw'])) # use original price where CPI is missing (for current quarter which does not need adjustment)


In [6]:
print("Quarters in df:", df['Quarter'].unique())
print("Quarters in cpi_index:", cpi_index['Quarter'].unique())

Quarters in df: ['2024Q1' '2023Q4' '2023Q3' '2023Q2' '2023Q1' '2024Q2' '2020Q4' '2021Q2'
 '2021Q3' '2021Q1' '2021Q4' '2022Q1' '2022Q2' '2022Q3' '2022Q4' '2024Q3'
 '2024Q4' '2025Q1' '2025Q2']
Quarters in cpi_index: ['2021Q1' '2021Q2' '2021Q3' '2021Q4' '2022Q1' '2022Q2' '2022Q3' '2022Q4'
 '2023Q1' '2023Q2' '2023Q3' '2023Q4' '2024Q1' '2024Q2' '2024Q3' '2024Q4'
 '2025Q1']


## Data Cleaning
1. Remove Columns that will not be used 
2. Remove duplicate Rows
3. Remove false negatives

In [7]:
# Step 1: Drop unnecessary columns
columns_to_drop = ['FCID', 'BCreatedDateAEST', 'CPI', 'Quarter', 'PriceIncGSTRaw' , 'VRego', 'BShopID', 'BShopRegionClass', 'BShopRegionName', 'BShopState', 'BShopPostcode', 'IsDeleted', 'BStatusFromDateTimeAEST', 'Date', 'BStatusFinal']
df.drop(columns = columns_to_drop, inplace=True)
df.head()

Unnamed: 0,BookingID,BStatusAfterSubmitted,BTicketID,BTicketType,TaskName,IsCustomService,IsCustomRepair,VYMM,VMakeModel,VMake,VYear,VMid,AdjustedPrice
0,1449822,33. Approved,2323471,Log,"Logbook Service - 75,000 km / 60 months",0,0,2019 AUDI A5 COUPE,AUDI A5 COUPE,AUDI,2019.0,AUD45780,475.0
1,1565721,33. Approved,2537851,Capped,"Capped Price Service - 10,000 km",0,0,2022 TOYOTA HILUX,TOYOTA HILUX,TOYOTA,2022.0,TOY47062,297.0
2,1320451,33. Approved,2053300,ProductTyreOp,((Products)),0,0,2001 NISSAN PATROL,NISSAN PATROL,NISSAN,2001.0,NIS29550,63.0
3,1320451,33. Approved,2053301,ProductTyre,((Products)),0,0,2001 NISSAN PATROL,NISSAN PATROL,NISSAN,2001.0,NIS29550,1534.0
4,1448252,33. Approved,2320054,Capped,"Capped Price Service - 105,000 km",0,0,2019 HYUNDAI SANTA FE,HYUNDAI SANTA FE,HYUNDAI,2019.0,HYU45824,367.0


In [8]:
# Step 2: Remove Duplicate Rows
# Duplicates can skew analysis and lead to incorrect conclusions, thus it is important each row is unique. Duplicates in this dataset are considered as rows that contain the same values across all columns other than the ID columns 
print(f"size before: {df.shape}" )
df = df.drop_duplicates(subset=[col for col in df.columns if col not in ['BookingID', 'BTicketID']])
print(f"size after: {df.shape}" )

size before: (1284505, 13)
size after: (909609, 13)


In [None]:
# Step 3: Remove false negatives
# False negatives are rows which have a status of '16. Requires Changes' when they should have a status of '33. Approved'. This occurs in the dataset as bookings contain multiple tickets. If a single ticket in a booking requires changes, the entire booking is marked as 'Requires Changes', thus marking tickets that do not require changes incorrectly. These false negatives can be detected and removed by checking if a duplicate entry exists where only the status changes.
print(f"size before: {df.shape}" )
df['StatusPriority'] = df['BStatusAfterSubmitted'].apply(lambda x: 0 if x == '33. Approved' else 1) # Assign priority: approved gets highest priority (lowest number)
dedup_cols = [col for col in df.columns if col not in ['BStatusAfterSubmitted', 'StatusPriority']] # Define columns to check for duplicates
df = df.sort_values(by=dedup_cols + ['StatusPriority']) # Sort so approved status is first
df = df.drop_duplicates(subset=dedup_cols, keep='first') # Keep the first occurrence (which is the approved status)
df = df.drop(columns='StatusPriority') # drop the temporary column used for sorting
df['Label'] = df['BStatusAfterSubmitted'].map({'33. Approved': 1, '16. Requires Changes': 0, '29. Rejected': 0}) # create label column for model training
df = df.drop(columns=['BStatusAfterSubmitted'], axis=1) # drop the original status column as it is no longer needed
print(f"size after: {df.shape}" )

size before: (909609, 13)
size after: (808690, 13)


In [10]:
df.head()

Unnamed: 0,BookingID,BTicketID,BTicketType,TaskName,IsCustomService,IsCustomRepair,VYMM,VMakeModel,VMake,VYear,VMid,AdjustedPrice,Label
62582,369142,562377,Repair,Custom Repair,0,1,2010 NISSAN X-TRAIL,NISSAN X-TRAIL,NISSAN,2010.0,,300.0,0
70344,370159,563762,Repair,Custom Repair,0,1,2018 MAZDA CX-3,MAZDA CX-3,MAZDA,2018.0,,282.0,1
70345,370159,563763,Repair,Custom Repair,0,1,2018 MAZDA CX-3,MAZDA CX-3,MAZDA,2018.0,,110.0,1
60447,370161,563767,Repair,Front Driver Side Window Regulator Replacement,0,0,2012 HYUNDAI I20,HYUNDAI I20,HYUNDAI,2012.0,HYU32872,347.0,1
51861,370164,563785,OtherTicket,((Products)),0,0,2017 TOYOTA HILUX,TOYOTA HILUX,TOYOTA,2017.0,TOY39673,51.0,1
