#  Import Statements

In [13]:
import pandas as pd
import sklearn
import category_encoders as ce


In [3]:
df = pd.read_csv('data/data.csv')
cpi = pd.read_csv('data/cpi.csv')
df.head()

Unnamed: 0,FCID,BookingID,BCreatedDateAEST,BTicketID,BTicketType,TaskName,IsCustomService,IsCustomRepair,PriceIncGSTRaw,VYMM,VMakeModel,VMake,VYear,BShopID,BShopPostcode,BShopState,BShopRegionName,BShopRegionClass
0,1,463259,17/06/2021,708763,Capped,Capped Price - 30K,0,0,180.0,2019 TOYOTA COROLLA,TOYOTA COROLLA,TOYOTA,2019,17885,4102,QLD,Brisbane,1
1,2,1360052,11/01/2024,2122072,Capped,Capped Price - 50K,0,0,315.9,2021 MAZDA CX-5,MAZDA CX-5,MAZDA,2021,17739,2285,NSW,Lake Macquarie,2
2,1,1058706,19/10/2022,1633633,Repair,Replace Wiper Blades,0,0,120.0,2020 SKODA KAROQ,SKODA KAROQ,SKODA,2020,19138,3192,VIC,Melbourne,1
3,2,1078043,11/11/2022,1664447,Logbook,Logbook - 60K / 48m,0,0,462.1,2018 MITSUBISHI ASX,MITSUBISHI ASX,MITSUBISHI,2018,17667,2170,NSW,Sydney,1
4,2,1868175,30/07/2024,3101426,Capped,Capped Price - 30K,0,0,359.21,2020 MAZDA 3,MAZDA 3,MAZDA,2020,17410,3041,VIC,Melbourne,1


# Preprocessing
## What needs to be done?
- Price adjusted for inflation
- one hot encoding?
- train, val, test split
- Visualise data distributions
- Visualise interclass correlation

In [4]:
# Price adjustment for inflation
df['BCreatedDateAEST'] = pd.to_datetime(df['BCreatedDateAEST']);
df['Quarter'] = df['BCreatedDateAEST'].dt.to_period('Q');
cpi['Quarter'] = pd.to_datetime(cpi['Quarter']).dt.to_period('Q');
base_cpi = cpi['CPI'].iloc[-1]

  df['BCreatedDateAEST'] = pd.to_datetime(df['BCreatedDateAEST']);
  cpi['Quarter'] = pd.to_datetime(cpi['Quarter']).dt.to_period('Q');


In [5]:
merged_df = df.merge(cpi, on='Quarter');
#Round to two decimal places to remain consistent with the existing data
merged_df['adjusted_price'] = round(merged_df['PriceIncGSTRaw'] * (base_cpi / merged_df['CPI']), 2)

In [16]:

pd.set_option('display.max_columns', None)

merged_df.head()

pd.reset_option('display.max_columns')

## Encoding Categoricals

In [None]:
# First extract the number of unique categories (determine cardinality)
# With the cardinality, we can evaluate for which is the most suitable encoding method.

ticketTypeCategories = merged_df['BTicketType'].nunique()
print(f"BTicketType has {ticketTypeCategories} categories") #7

taskNameCategories = merged_df['TaskName'].nunique()
print(f"TaskName has {taskNameCategories} categories") #2721

VYMMCategories = merged_df['VYMM'].nunique()
print(f"VYMM has {VYMMCategories} categories") #5192

VMakeCategories = merged_df['VMake'].nunique()
print(f"VMake has {VMakeCategories} categories") # 360

BShopStateCategories = merged_df['BShopState'].nunique()
print(f"BShopState has {BShopStateCategories} categories") # 8

BShopRegionNameCategories = merged_df['BShopRegionName'].nunique()
print(f"BShopRegionName has {BShopRegionNameCategories} categories") # 119

# Results show that ticketType and BShop state have low cardinality, remaining predictors have high cardinality


ticketType has 7 categories
taskName has 2721 categories
VYMM has 5192 categories
VMake has 360 categories
BShopState has 8 categories
BShopRegionName has 119 categories


In [None]:
encoder = ce.BinaryEncoder(cols=['TaskName', 'VYMM', 'VMake', 'BShopRegionName'])
df_encoded = encoder.fit_transform(df)

df_encoded.head()

#When data is split, ensure that fit is ONLY applied to the train dataset, then proceed to transform both train and test.

In [17]:
encoder = ce.OneHotEncoder(cols=['BTicketType', 'BShopState'], use_cat_names=True)
df_encoded = encoder.fit_transform(df)
df_encoded.head()

Unnamed: 0,FCID,BookingID,BCreatedDateAEST,BTicketID,BTicketType_Capped,BTicketType_Repair,BTicketType_Logbook,BTicketType_Custom,BTicketType_Prescribed,BTicketType_Basic,...,BShopState_NSW,BShopState_VIC,BShopState_ACT,BShopState_WA,BShopState_NT,BShopState_TAS,BShopState_SA,BShopRegionName,BShopRegionClass,Quarter
0,1,463259,2021-06-17,708763,1,0,0,0,0,0,...,0,0,0,0,0,0,0,Brisbane,1,2021Q2
1,2,1360052,2024-01-11,2122072,1,0,0,0,0,0,...,1,0,0,0,0,0,0,Lake Macquarie,2,2024Q1
2,1,1058706,2022-10-19,1633633,0,1,0,0,0,0,...,0,1,0,0,0,0,0,Melbourne,1,2022Q4
3,2,1078043,2022-11-11,1664447,0,0,1,0,0,0,...,1,0,0,0,0,0,0,Sydney,1,2022Q4
4,2,1868175,2024-07-30,3101426,1,0,0,0,0,0,...,0,1,0,0,0,0,0,Melbourne,1,2024Q3
