In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import joblib
import re

In [2]:
# Load data
df = pd.read_csv("/content/drive/MyDrive/fraud detection/carclaims.csv")

In [3]:
df.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability,No
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision,No
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision,No
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability,No
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision,No


In [4]:
df.tail()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
15415,Nov,4,Friday,Toyota,Urban,Tuesday,Nov,5,Male,Married,...,31 to 35,No,No,External,none,no change,1 vehicle,1996,Collision,Yes
15416,Nov,5,Thursday,Pontiac,Urban,Friday,Dec,1,Male,Married,...,31 to 35,No,No,External,more than 5,no change,3 to 4,1996,Liability,No
15417,Nov,5,Thursday,Toyota,Rural,Friday,Dec,1,Male,Single,...,26 to 30,No,No,External,1 to 2,no change,1 vehicle,1996,Collision,Yes
15418,Dec,1,Monday,Toyota,Urban,Thursday,Dec,2,Female,Married,...,31 to 35,No,No,External,more than 5,no change,1 vehicle,1996,All Perils,No
15419,Dec,2,Wednesday,Toyota,Urban,Thursday,Dec,3,Male,Single,...,26 to 30,No,No,External,1 to 2,no change,1 vehicle,1996,Collision,Yes


1. Convert range-based strings to numeric

In [5]:
def range_to_numeric(x):

    if pd.isna(x) or str(x).strip().lower() in ["none", "no", "na"]:
        return np.nan

    x = str(x).lower()

    # Extract all numbers
    nums = [float(n) for n in re.findall(r"\d+", x)]

    if len(nums) == 0:
        return np.nan

    # Case 1: Range like "1 to 5", "1-5"
    if len(nums) == 2:
        return sum(nums) / 2

    # Case 2: Single number cases
    n = nums[0]

    if "more than" in x or "or more" in x:
        return n + 1   # conservative bump

    if "less than" in x:
        return max(n - 1, 0)

    # Case 3: Single number with text ("1 year", "2 days")
    return n

In [6]:
range_cols = {
    "Days:Policy-Accident": "PolicyAccidentDays",
    "Days:Policy-Claim": "PolicyClaimDays",
    "PastNumberOfClaims": "PastClaimsNum",
    "AgeOfVehicle": "VehicleAge",
    "AgeOfPolicyHolder": "PolicyHolderAge",
    "NumberOfSuppliments": "NumSuppliments",
    "NumberOfCars": "NumCars",
}

for old, new in range_cols.items():
    df[new] = df[old].apply(range_to_numeric)




In [7]:
print("\nSample of extracted features:")
print(df[['Days:Policy-Accident', 'PolicyAccidentDays',
          'Days:Policy-Claim', 'PolicyClaimDays']].head(10))


Sample of extracted features:
  Days:Policy-Accident  PolicyAccidentDays Days:Policy-Claim  PolicyClaimDays
0         more than 30                31.0      more than 30             31.0
1         more than 30                31.0      more than 30             31.0
2         more than 30                31.0      more than 30             31.0
3         more than 30                31.0      more than 30             31.0
4         more than 30                31.0      more than 30             31.0
5         more than 30                31.0      more than 30             31.0
6         more than 30                31.0      more than 30             31.0
7         more than 30                31.0      more than 30             31.0
8         more than 30                31.0      more than 30             31.0
9         more than 30                31.0      more than 30             31.0


2. Ordinal + binary encoding

In [8]:
df['VehiclePrice'].unique()

array(['more than 69,000', '20,000 to 29,000', '30,000 to 39,000',
       'less than 20,000', '40,000 to 59,000', '60,000 to 69,000'],
      dtype=object)

In [9]:
vehicle_price_map = {
    "less than 20,000": 1,
    "20,000 to 29,000": 2,
    "30,000 to 39,000": 3,
    "40,000 to 59,000": 4,
    "60,000 to 69,000": 5,
    "more than 69,000": 6,
}
df["VehiclePriceOrdinal"] = df["VehiclePrice"].map(vehicle_price_map)

binary_cols = {
    "Sex": {"Male": 1, "Female": 0},
    "Fault": {"Policy Holder": 1, "Third Party": 0},
    "PoliceReportFiled": {"Yes": 1, "No": 0},
    "WitnessPresent": {"Yes": 1, "No": 0},
    "FraudFound": {"Yes": 1, "No": 0},
}

for col, mapping in binary_cols.items():
    df[col] = df[col].map(mapping)

In [10]:
df.rename(columns={"FraudFound": "Target"}, inplace=True)

3. Reduce high-cardinality Make

In [11]:
df["Make"].value_counts()

Unnamed: 0_level_0,count
Make,Unnamed: 1_level_1
Pontiac,3837
Toyota,3121
Honda,2801
Mazda,2354
Chevrolet,1681
Accura,472
Ford,450
VW,283
Dodge,109
Saab,108


In [12]:
top_makes = df["Make"].value_counts().nlargest(10).index
df["Make"] = df["Make"].where(df["Make"].isin(top_makes), "Other")

4. Drop unnecessary columns

In [13]:
drop_cols = [
    *range_cols.keys(),
    "VehiclePrice",
    "PolicyNumber",
    "RepNumber",
    "AddressChange-Claim"
]

df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True)

5. One-Hot Encoding for Logistic & MLP

In [15]:
categorical_for_onehot = [
    'Month', 'DayOfWeek', 'AccidentArea',
    'DayOfWeekClaimed', 'MonthClaimed', 'MaritalStatus',
    'PolicyType', 'VehicleCategory', 'AgentType', 'BasePolicy', 'Make'
]

# Apply one-hot encoding
df_onehot = pd.get_dummies(df, columns=categorical_for_onehot, drop_first=True)

print(f"Shape after one-hot encoding: {df_onehot.shape}")

Shape after one-hot encoding: (15420, 82)


6. Handle Missing Values

CatBoost handles missing values internally, but for one-hot & scaled models, we should fill missing values:

In [16]:
for col in df_onehot.columns:
    if col != 'Target':
        if df_onehot[col].dtype in ['int64', 'float64']:
            df_onehot[col].fillna(df_onehot[col].median(), inplace=True)
        else:
            df_onehot[col].fillna(df_onehot[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_onehot[col].fillna(df_onehot[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_onehot[col].fillna(df_onehot[col].mode()[0], inplace=True)


7. Train/Validation/Test Split

In [17]:
X = df_onehot.drop('Target', axis=1)
y = df_onehot['Target']

# 70/15/15 split
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)


8. Scaling (for Logistic & MLP)

In [18]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

9. SMOTE (for Logistic & MLP only)

In [19]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

10. Prepare CatBoost Dataset (raw features, no scaling, no SMOTE)

In [20]:
X_cb = df.drop('Target', axis=1)

X_train_cb, X_temp_cb, y_train_cb, y_temp_cb = train_test_split(
    X_cb, y, test_size=0.3, stratify=y, random_state=42
)
X_val_cb, X_test_cb, y_val_cb, y_test_cb = train_test_split(
    X_temp_cb, y_temp_cb, test_size=0.5, stratify=y_temp_cb, random_state=42
)

# Identify categorical columns
cat_features = X_train_cb.select_dtypes(include='object').columns.tolist()
cat_feature_indices = [X_train_cb.columns.get_loc(c) for c in cat_features]


11. Save your processed data

In [22]:
# Logistic & MLP
X_train_bal.to_csv('/content/drive/MyDrive/fraud detection/data/X_train_log.csv', index=False)
X_val_scaled.to_csv('/content/drive/MyDrive/fraud detection/data/X_val_log.csv', index=False)
X_test_scaled.to_csv('/content/drive/MyDrive/fraud detection/data/X_test_log.csv', index=False)
y_train_bal.to_csv('/content/drive/MyDrive/fraud detection/data/y_train_log.csv', index=False)
y_val.to_csv('/content/drive/MyDrive/fraud detection/data/y_val_log.csv', index=False)
y_test.to_csv('/content/drive/MyDrive/fraud detection/data/y_test_log.csv', index=False)

# CatBoost
X_train_cb.to_csv('/content/drive/MyDrive/fraud detection/data/X_train_cb.csv', index=False)
X_val_cb.to_csv('/content/drive/MyDrive/fraud detection/data/X_val_cb.csv', index=False)
X_test_cb.to_csv('/content/drive/MyDrive/fraud detection/data/X_test_cb.csv', index=False)
y_train_cb.to_csv('/content/drive/MyDrive/fraud detection/data/y_train_cb.csv', index=False)
y_val_cb.to_csv('/content/drive/MyDrive/fraud detection/data/y_val_cb.csv', index=False)
y_test_cb.to_csv('/content/drive/MyDrive/fraud detection/data/y_test_cb.csv', index=False)

# Save CatBoost categorical features
joblib.dump(cat_features, '/content/drive/MyDrive/fraud detection/pkls/cat_features.pkl')
# Save the StandardScaler after fitting on training data
joblib.dump(scaler, '/content/drive/MyDrive/fraud detection/pkls/scaler.pkl')





['/content/drive/MyDrive/fraud detection/pkls/scaler.pkl']