In [52]:
import catboost as cb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool

pd.set_option("mode.copy_on_write", True)

In [53]:
df = pd.read_csv("data/InsNova_data_2023_train.csv")
df.head()

Unnamed: 0,id,veh_value,exposure,veh_body,veh_age,gender,area,agecat,engine_type,max_power,...,marital_status,e_bill,time_of_week_driven,time_driven,trm_len,credit_score,high_education_ind,clm,numclaims,claimcst0
0,1,0.77,0.444504,SEDAN,4,M,D,3,petrol,147,...,S,1,weekday,6pm - 12am,6,640.448137,1.0,0,0,0.0
1,2,4.45,0.562183,STNWG,1,M,A,3,petrol,158,...,S,1,weekday,6am - 12pm,12,683.749691,0.0,0,0,0.0
2,3,4.9,0.465244,STNWG,1,F,A,3,petrol,159,...,M,1,weekday,6pm - 12am,6,653.656117,1.0,0,0,0.0
3,4,0.48,0.271039,PANVN,4,M,A,4,petrol,80,...,S,1,weekday,12pm - 6pm,12,642.574671,0.0,0,0,0.0
4,5,0.85,0.141624,SEDAN,4,F,A,5,petrol,126,...,S,0,weekday,6am - 12pm,6,647.175035,0.0,0,0,0.0


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22619 entries, 0 to 22618
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     22619 non-null  int64  
 1   veh_value              22619 non-null  float64
 2   exposure               22619 non-null  float64
 3   veh_body               22619 non-null  object 
 4   veh_age                22619 non-null  int64  
 5   gender                 22619 non-null  object 
 6   area                   22619 non-null  object 
 7   agecat                 22619 non-null  int64  
 8   engine_type            22619 non-null  object 
 9   max_power              22619 non-null  int64  
 10  driving_history_score  22619 non-null  float64
 11  veh_color              22619 non-null  object 
 12  marital_status         22619 non-null  object 
 13  e_bill                 22619 non-null  int64  
 14  time_of_week_driven    22619 non-null  object 
 15  ti

In [55]:
df["high_education_ind"] = df["high_education_ind"].astype("int64")
# df["high_education_ind"] = df["high_education_ind"].astype("category")

In [56]:
categorical_features = [
    # "veh_body",
    "veh_age",
    "gender",
    "area",
    "agecat",
    # "engine_type",
    # "veh_color",
    # "marital_status",
    "time_of_week_driven",
    # "time_driven",
    # "e_bill",
    "trm_len",
    "high_education_ind",
]

In [57]:
for col in categorical_features:
    df[col] = df[col].astype("category")

In [60]:
X = df.drop(
    columns=[
        "id",
        "clm",
        "numclaims",
        "claimcst0",
        "veh_body",
        "engine_type",
        "veh_color",
        "marital_status",
        "time_driven",
        "e_bill",
        "veh_value",
        "max_power",
    ]
)  # assuming 'claimcst0' is your target variable
print(X.columns)
y = df["claimcst0"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Index(['exposure', 'veh_age', 'gender', 'area', 'agecat',
       'driving_history_score', 'time_of_week_driven', 'trm_len',
       'credit_score', 'high_education_ind'],
      dtype='object')


In [8]:
import numpy as np
from sklearn.metrics import make_scorer


def modified_gini_scorer(estimator, X, y):
    # Assuming estimator.predict(X) returns the predicted claim costs
    y_pred = estimator.predict(X)

    # Calculate the ranks of predicted claim costs
    ranks = np.argsort(y_pred) + 1  # Adding 1 to start ranks from 1

    # Numerator part
    numerator = np.sum((2 * ranks - 1) * y)
    numerator = numerator - np.sum(y) * (np.sum(ranks) / len(ranks))

    # Denominator part
    denominator = np.sum(ranks * y)
    denominator = denominator - np.sum(y) * ((len(ranks) + 1) / 2)

    # Calculate Gini
    gini = numerator / denominator

    return gini


# Make a scorer with greater_is_better set to True since higher Gini values are better
# not needed just put here for reference later
gini_scorer = make_scorer(modified_gini_scorer, greater_is_better=True)

In [61]:
cat_model = cb.CatBoostRegressor(cat_features=categorical_features)

In [62]:
param_grid = {
    "depth": [3, 4, 5, 6],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "iterations": [500, 750, 1000, 1500],
    "loss_function": [
        "Tweedie:variance_power=1.1",
        "Tweedie:variance_power=1.2",
        "Tweedie:variance_power=1.3",
        "Tweedie:variance_power=1.4",
        "Tweedie:variance_power=1.5",
        "Tweedie:variance_power=1.6",
        "Tweedie:variance_power=1.7",
        "Tweedie:variance_power=1.8",
        "Tweedie:variance_power=1.9",
    ],
    "logging_level": ["Silent"],
}

In [63]:
# Using Grid Search
# Use neg mean absolute error as the metric for tweedie according to paper
grid_search = GridSearchCV(
    estimator=cat_model,
    param_grid=param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
)

# Or using Random Search
# random_search = RandomizedSearchCV(estimator=cat_model, param_distributions=param_grid, cv=5, scoring='neg_mean_squared_error', n_iter=50, random_state=42)

# Fit the model
grid_search.fit(X_train, y_train)

In [65]:
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Evaluate the best model
best_model = grid_search.best_estimator_
best_model.score(X_test, y_test)  # You can use other metrics as needed

Best parameters: {'depth': 3, 'iterations': 500, 'learning_rate': 0.01, 'logging_level': 'Silent', 'loss_function': 'Tweedie:variance_power=1.2'}


0.0013199179604951317

In [67]:
x_train = df.drop(
    columns=[
        "id",
        "clm",
        "numclaims",
        "claimcst0",
        "veh_body",
        "engine_type",
        "veh_color",
        "marital_status",
        "time_driven",
        "e_bill",
        "veh_value",
        "max_power",
    ]
)
y_train = df["claimcst0"]

In [68]:
train_data = Pool(data=x_train, label=y_train, cat_features=categorical_features)

In [69]:
# Define the model parameters
# Best parameters: {'depth': 4, 'iterations': 1000, 'learning_rate': 0.01, 'logging_level': 'Silent', 'loss_function': 'Tweedie:variance_power=1.5'}
# Best parameters: {'depth': 3, 'iterations': 500, 'learning_rate': 0.01, 'logging_level': 'Silent', 'loss_function': 'Tweedie:variance_power=1.7'}
# Best parameters: {'depth': 6, 'iterations': 1500, 'learning_rate': 0.1, 'logging_level': 'Silent', 'loss_function': 'Tweedie:variance_power=1.8'}
# Best parameters: {'depth': 3, 'iterations': 1000, 'learning_rate': 0.01, 'logging_level': 'Silent', 'loss_function': 'Tweedie:variance_power=1.5'}
# For 10 features
# Best parameters: {'depth': 3, 'iterations': 500, 'learning_rate': 0.01, 'logging_level': 'Silent', 'loss_function': 'Tweedie:variance_power=1.2'}

model = CatBoostRegressor(
    iterations=500,
    depth=3,
    learning_rate=0.01,
    loss_function="Tweedie:variance_power=1.2",
)

# Train the regression model
model.fit(train_data)

0:	learn: 808.6693867	total: 4.52ms	remaining: 2.25s
1:	learn: 800.9380254	total: 8.16ms	remaining: 2.03s
2:	learn: 793.2954813	total: 14.7ms	remaining: 2.43s
3:	learn: 785.7410934	total: 21.7ms	remaining: 2.69s
4:	learn: 778.2753273	total: 23.5ms	remaining: 2.32s
5:	learn: 770.8957071	total: 29.2ms	remaining: 2.4s
6:	learn: 763.6042942	total: 31.6ms	remaining: 2.23s
7:	learn: 756.3986277	total: 33.8ms	remaining: 2.08s
8:	learn: 749.2790567	total: 37.1ms	remaining: 2.02s
9:	learn: 742.2468231	total: 41.1ms	remaining: 2.01s
10:	learn: 735.2995024	total: 43.2ms	remaining: 1.92s
11:	learn: 728.4356521	total: 45.2ms	remaining: 1.84s
12:	learn: 721.6592453	total: 47.1ms	remaining: 1.76s
13:	learn: 714.9677593	total: 48.3ms	remaining: 1.68s
14:	learn: 708.3580229	total: 50.8ms	remaining: 1.64s
15:	learn: 701.8347474	total: 52.6ms	remaining: 1.59s
16:	learn: 695.3945136	total: 54.6ms	remaining: 1.55s
17:	learn: 689.0377547	total: 56.6ms	remaining: 1.51s
18:	learn: 682.7649687	total: 58ms	rema

<catboost.core.CatBoostRegressor at 0x2a42b5450>

In [70]:
test = pd.read_csv("data/InsNova_data_2023_vh.csv")

In [71]:
test["high_education_ind"] = test["high_education_ind"].astype("int64")
test["high_education_ind"] = test["high_education_ind"].astype("category")

In [72]:
for col in categorical_features:
    test[col] = test[col].astype("category")

In [73]:
x_test = test.drop(
    columns=[
        "id",
        "veh_body",
        "engine_type",
        "veh_color",
        "marital_status",
        "time_driven",
        "e_bill",
        "veh_value",
        "max_power",
    ]
)

In [74]:
# Make predictions on the test set
y_pred = model.predict(x_test)
test["Predict"] = y_pred

In [75]:
submission = test[["id", "Predict"]]
submission

Unnamed: 0,id,Predict
0,1,122.173876
1,2,124.954848
2,3,110.837252
3,4,159.597829
4,5,132.010304
...,...,...
22615,22616,128.586131
22616,22617,99.423322
22617,22618,186.777583
22618,22619,199.123542


In [76]:
submission.to_csv("catboost_submission.csv", index=False)