In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # Added for quick visual checks

# Never truncate the column list when a DataFrame is displayed,
# so newly engineered columns are always visible in cell output.
pd.options.display.max_columns = None

In [None]:
# Load the two input tables.
# REPLACE these filenames with your actual file paths
df = pd.read_csv('FEG_Main_Dataset.csv')
df_abs = pd.read_csv('ABS_Industry_Wages.csv')

print(f"Main Dataset Shape: {df.shape}")
print(f"ABS Dataset Shape: {df_abs.shape}")

# Debug: check the dtypes of the columns the later cells depend on.
# If 'IP Weekly Wage' shows 'object', the values were read as text and must be
# cleaned before any arithmetic is done on them.
print("\n--- Data Types Check ---")
dtype_check_cols = ['IP Weekly Wage', 'Industry']
# Only index columns that exist: a later cell treats 'Industry' as optional,
# so this debug print must not raise KeyError when it is absent.
present_check_cols = [c for c in dtype_check_cols if c in df.columns]
missing_check_cols = [c for c in dtype_check_cols if c not in df.columns]
print(df[present_check_cols].dtypes)
if missing_check_cols:
    print("Columns not found in main dataset:", missing_check_cols)

In [None]:
def _strip_currency(series):
    """Return `series` parsed as numbers.

    Values are cast to str, every '$' and ',' is removed, and the result is
    parsed with pd.to_numeric; anything that still fails to parse becomes NaN.
    """
    as_text = series.astype(str).str.replace(r'[$,]', '', regex=True)
    return pd.to_numeric(as_text, errors='coerce')


# Main dataset: the wage column is required, so it is accessed unconditionally.
df['IP Weekly Wage'] = _strip_currency(df['IP Weekly Wage'])

# ABS dataset: same cleaning, applied only when the column is present,
# so both wage columns are numeric before they are divided later.
abs_wage_col = 'ABS_Average_Weekly_Wage'
if abs_wage_col in df_abs.columns:
    df_abs[abs_wage_col] = _strip_currency(df_abs[abs_wage_col])

# Sanity check: resulting dtype and how many wages ended up as NaN.
ip_wage = df['IP Weekly Wage']
print("New Data Type for IP Weekly Wage:", ip_wage.dtype)
print("Number of NaN wages (unparseable):", ip_wage.isna().sum())

In [None]:
target_cols = ['Annual Leave Reliability', 'Long Service Leave Reliability', 'Wages Reliability']

# Resolve once which targets actually exist, so the cleaning loop and the
# sanity check below agree (indexing df with an absent column raises KeyError).
present_targets = [col for col in target_cols if col in df.columns]
absent_targets = [col for col in target_cols if col not in df.columns]
if absent_targets:
    print("Target columns not found (skipped):", absent_targets)

for col in present_targets:
    # Coerce to numeric (unparseable -> NaN), then clip: < 0 becomes 0, > 1 becomes 1
    df[col] = pd.to_numeric(df[col], errors='coerce').clip(lower=0.0, upper=1.0)

if present_targets:
    # Drop rows where any target is missing (we can't train on these).
    # Done once, after all column assignments, instead of once per column.
    df = df.dropna(subset=present_targets)

    # Sanity Check: Min should be >= 0.0, Max should be <= 1.0
    print(df[present_targets].describe())

In [None]:
# Parse both IP date columns; values that cannot be parsed become NaT.
# NOTE(review): no explicit format/dayfirst is passed, so ambiguous day/month
# strings follow pandas' default inference - confirm against the source data.
for date_col in ('IP Commencement Date', 'IP Termination Date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Tenure in years = whole days between the two dates / 365.25
elapsed = df['IP Termination Date'] - df['IP Commencement Date']
tenure_years = elapsed.dt.days / 365.25

# Negative tenure is floored at 0 first; only afterwards are missing values
# replaced by the -1 sentinel (so the sentinel itself is never floored).
df['IP_Tenure_Years'] = tenure_years.clip(lower=0).fillna(-1)

# Sanity Check: Look at the distribution
print(df['IP_Tenure_Years'].describe())

In [None]:
# Build wage-vs-industry features from the ABS table.
abs_wage_col = 'ABS_Average_Weekly_Wage'
if 'Industry' in df.columns and {'Industry', abs_wage_col} <= set(df_abs.columns):
    # Keep one ABS row per industry (first occurrence wins). Duplicate keys on
    # the right side of a left merge would otherwise duplicate rows of df.
    abs_lookup = df_abs[['Industry', abs_wage_col]].drop_duplicates(subset='Industry')

    # Merge; validate= makes pandas raise if the lookup is not unique per key.
    df = df.merge(abs_lookup, on='Industry', how='left', validate='many_to_one')

    # Fill ABS wages missing after the merge with the median of the
    # per-industry ABS values.
    national_median = abs_lookup[abs_wage_col].median()
    df[abs_wage_col] = df[abs_wage_col].fillna(national_median)

    # Create Ratio Feature (float / float); the small epsilon avoids division by zero
    df['IP_Wage_to_ABS_Ratio'] = df['IP Weekly Wage'] / (df[abs_wage_col] + 1e-5)

    # Create High Risk Flag: 1 when the IP wage exceeds 1.5x the ABS wage.
    # A NaN ratio compares False and is therefore flagged 0.
    df['Flag_High_Wage_Deviation'] = (df['IP_Wage_to_ABS_Ratio'] > 1.5).astype(int)

    # Drop the raw dollar value
    df = df.drop(columns=[abs_wage_col])

    # Sanity Check: show the new columns. Kept inside the guard because these
    # columns only exist when the merge above actually ran.
    print(df[['Industry', 'IP Weekly Wage', 'IP_Wage_to_ABS_Ratio']].head())
else:
    print("Skipping ABS wage features: 'Industry' and/or "
          "'ABS_Average_Weekly_Wage' column not available.")

In [None]:
# Columns removed before modelling, grouped by the reason they are removed.

# Claimant Data
claimant_cols = [
    'Claim ID', 'Claim Type', 'Claim Form Received Date', 'Claimant Age',
    'Service Years At Appointment', 'Job Title', 'Job Duty Description',
    'Claimant Confident of Amounts Owed', 'Information Held About Owed Entitlements',
    'Claimant Commencement Date', 'Claimant Termination Date',
    'Claimant Weekly Wage', 'Claimant Annual Leave', 'Claimant Wages',
]

# CM Recommended (Leakage)
cm_recommended_cols = [
    'CM Recommended Employment Type', 'CM Recommended Weekly Wage',
    'CM Recommended Commencement Date', 'CM Recommended Termination Date',
    'CM Recommended Annual Leave', 'CM Recommended Long Service Leave',
    'CM Recommended Wages',
]

# Raw IP Dates (tenure has already been derived from them)
raw_ip_date_cols = ['IP Commencement Date', 'IP Termination Date']

cols_to_drop = claimant_cols + cm_recommended_cols + raw_ip_date_cols

# Restrict the drop to columns that are actually present, so nothing raises.
columns_present = set(df.columns)
existing_cols = [name for name in cols_to_drop if name in columns_present]
df.drop(columns=existing_cols, inplace=True)

# Sanity Check: list any column whose name still contains 'CM Recommended'
leaking = [name for name in df.columns if 'CM Recommended' in name]
print("Leaking columns remaining (Should be empty):", leaking)

In [None]:
# 1. Outliers
days_col = 'Days Between IP Verified Data Request and Received Date'
if days_col in df.columns:
    # Drop only rows whose value is genuinely negative. A `>= 0` filter would
    # also discard rows where the value is missing (NaN >= 0 is False); those
    # rows are kept here so they can be imputed like any other missing value.
    is_negative = df[days_col] < 0
    # .copy() gives an independent frame, so the assignments below do not
    # write into a slice of the previous one.
    df = df.loc[~is_negative].copy()

    # Cap the upper tail at the 99th percentile. clip() upcasts an integer
    # column when the limit is fractional; a NaN limit means no clipping.
    limit = df[days_col].quantile(0.99)
    df[days_col] = df[days_col].clip(upper=limit)

# 2. Final Imputation
# For every column that still has missing values: add a 0/1 '<col>_missing'
# indicator, then fill numeric columns with their median and all other
# columns with their most frequent value.
# NOTE(review): the fill statistics are computed on the whole frame; if a
# train/test split happens later, confirm this is acceptable.
for col in list(df.columns):  # snapshot: indicator columns are added inside the loop
    is_missing = df[col].isna()
    if not is_missing.any():
        continue

    # Create indicator
    df[f'{col}_missing'] = is_missing.astype(int)

    # Choose the fill value
    if pd.api.types.is_numeric_dtype(df[col]):
        fill_value = df[col].median()
        if pd.isna(fill_value):
            # Column is entirely NaN: the median is NaN and would fill nothing.
            # Fall back to 0, mirroring the 'Unknown' fallback used below.
            fill_value = 0
    else:
        modes = df[col].mode()  # computed once; empty when every value is NaN
        fill_value = modes.iloc[0] if not modes.empty else 'Unknown'

    df[col] = df[col].fillna(fill_value)

print("Final Data Shape:", df.shape)
print("Cleaning Complete.")