In [1]:
import catboost as cb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool

pd.set_option("mode.copy_on_write", True)

In [3]:
df = pd.read_csv("InsNova_data_2023_train.csv")
df.head()

Unnamed: 0,id,veh_value,exposure,veh_body,veh_age,gender,area,agecat,engine_type,max_power,...,marital_status,e_bill,time_of_week_driven,time_driven,trm_len,credit_score,high_education_ind,clm,numclaims,claimcst0
0,1,0.77,0.444504,SEDAN,4,M,D,3,petrol,147,...,S,1,weekday,6pm - 12am,6,640.448137,1.0,0,0,0.0
1,2,4.45,0.562183,STNWG,1,M,A,3,petrol,158,...,S,1,weekday,6am - 12pm,12,683.749691,0.0,0,0,0.0
2,3,4.9,0.465244,STNWG,1,F,A,3,petrol,159,...,M,1,weekday,6pm - 12am,6,653.656117,1.0,0,0,0.0
3,4,0.48,0.271039,PANVN,4,M,A,4,petrol,80,...,S,1,weekday,12pm - 6pm,12,642.574671,0.0,0,0,0.0
4,5,0.85,0.141624,SEDAN,4,F,A,5,petrol,126,...,S,0,weekday,6am - 12pm,6,647.175035,0.0,0,0,0.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22619 entries, 0 to 22618
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     22619 non-null  int64  
 1   veh_value              22619 non-null  float64
 2   exposure               22619 non-null  float64
 3   veh_body               22619 non-null  object 
 4   veh_age                22619 non-null  int64  
 5   gender                 22619 non-null  object 
 6   area                   22619 non-null  object 
 7   agecat                 22619 non-null  int64  
 8   engine_type            22619 non-null  object 
 9   max_power              22619 non-null  int64  
 10  driving_history_score  22619 non-null  float64
 11  veh_color              22619 non-null  object 
 12  marital_status         22619 non-null  object 
 13  e_bill                 22619 non-null  int64  
 14  time_of_week_driven    22619 non-null  object 
 15  ti

In [5]:
df["high_education_ind"] = df["high_education_ind"].astype("int64")
df["high_education_ind"] = df["high_education_ind"].astype("category")

In [6]:
categorical_features = [
    "veh_body",
    "veh_age",
    "gender",
    "area",
    "agecat",
    "engine_type",
    "veh_color",
    "marital_status",
    "time_of_week_driven",
    "time_driven",
    "e_bill",
    "trm_len",
    "high_education_ind",
]

In [8]:
for col in categorical_features:
    df[col] = df[col].astype("category")

In [9]:
X = df.drop(
    columns=["id", "clm", "numclaims", "claimcst0"]
)  # assuming 'claimcst0' is your target variable
y = df["claimcst0"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
cat_model = cb.CatBoostRegressor()

In [11]:
param_grid = {
    "depth": [6, 8, 10],
    "learning_rate": [0.01, 0.05, 0.1],
    "iterations": [500, 1000, 1500],
}

In [13]:
cat_features = [
    i for i, col in enumerate(X_train.columns) if X_train[col].dtype.name == "category"
]

In [14]:
# Using Grid Search
grid_search = GridSearchCV(
    estimator=cat_model, param_grid=param_grid, cv=5, scoring="neg_mean_squared_error"
)

# Fit the model with categorical features
grid_search.fit(X_train, y_train, cat_features=cat_features)

0:	learn: 1236.1983802	total: 68.5ms	remaining: 34.2s
1:	learn: 1236.1932218	total: 73.4ms	remaining: 18.3s
2:	learn: 1236.1877683	total: 76ms	remaining: 12.6s
3:	learn: 1235.7445658	total: 81.5ms	remaining: 10.1s
4:	learn: 1235.5728290	total: 86.5ms	remaining: 8.56s
5:	learn: 1235.5562271	total: 89.8ms	remaining: 7.4s
6:	learn: 1235.4314576	total: 95.5ms	remaining: 6.73s
7:	learn: 1234.9560431	total: 101ms	remaining: 6.24s
8:	learn: 1234.9313623	total: 108ms	remaining: 5.88s
9:	learn: 1234.9029887	total: 113ms	remaining: 5.56s
10:	learn: 1234.5248796	total: 119ms	remaining: 5.29s
11:	learn: 1234.5097746	total: 122ms	remaining: 4.95s
12:	learn: 1234.3082133	total: 127ms	remaining: 4.76s
13:	learn: 1234.2938150	total: 130ms	remaining: 4.51s
14:	learn: 1234.2521518	total: 135ms	remaining: 4.37s
15:	learn: 1234.1860680	total: 140ms	remaining: 4.22s
16:	learn: 1233.9401664	total: 146ms	remaining: 4.14s
17:	learn: 1233.9115477	total: 151ms	remaining: 4.03s
18:	learn: 1233.8922542	total: 156

In [16]:
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Evaluate the best model
best_model = grid_search.best_estimator_
best_model.score(X_test, y_test)  # You can use other metrics as needed

Best parameters: {'depth': 8, 'iterations': 500, 'learning_rate': 0.01}


-0.0004138138802951552

In [17]:
x_train = df.drop(columns=["id", "clm", "numclaims", "claimcst0"])
y_train = df["claimcst0"]

In [19]:
train_data = Pool(
    data=x_train,
    label=y_train,
    cat_features=[
        "veh_body",
        "veh_age",
        "gender",
        "area",
        "agecat",
        "engine_type",
        "veh_color",
        "marital_status",
        "time_of_week_driven",
        "time_driven",
        "e_bill",
        "trm_len",
        "high_education_ind",
    ],
)

In [20]:
# Define the model parameters
model = CatBoostRegressor(
    iterations=500, depth=8, learning_rate=0.01, loss_function="RMSE"
)

# Train the regression model
model.fit(train_data)

0:	learn: 1271.8542765	total: 33ms	remaining: 16.5s
1:	learn: 1271.8342007	total: 41.8ms	remaining: 10.4s
2:	learn: 1271.8310248	total: 48.1ms	remaining: 7.96s
3:	learn: 1271.8233981	total: 61.2ms	remaining: 7.59s
4:	learn: 1271.8225901	total: 64.6ms	remaining: 6.39s
5:	learn: 1271.3331412	total: 76.7ms	remaining: 6.31s
6:	learn: 1271.3289461	total: 81.5ms	remaining: 5.74s
7:	learn: 1271.3222846	total: 91.4ms	remaining: 5.62s
8:	learn: 1271.3038853	total: 94.8ms	remaining: 5.17s
9:	learn: 1271.2441404	total: 109ms	remaining: 5.33s
10:	learn: 1271.2267615	total: 113ms	remaining: 5.02s
11:	learn: 1271.1358310	total: 124ms	remaining: 5.05s
12:	learn: 1270.7501444	total: 135ms	remaining: 5.07s
13:	learn: 1270.4809759	total: 145ms	remaining: 5.04s
14:	learn: 1270.1136330	total: 158ms	remaining: 5.09s
15:	learn: 1270.0510728	total: 169ms	remaining: 5.1s
16:	learn: 1269.7210018	total: 179ms	remaining: 5.1s
17:	learn: 1269.7166658	total: 189ms	remaining: 5.06s
18:	learn: 1269.3157866	total: 20

<catboost.core.CatBoostRegressor at 0x285c0bc90>

In [21]:
test = pd.read_csv("InsNova_data_2023_vh.csv")

In [22]:
test["high_education_ind"] = test["high_education_ind"].astype("int64")
test["high_education_ind"] = test["high_education_ind"].astype("category")

In [23]:
for col in categorical_features:
    test[col] = test[col].astype("category")

In [24]:
x_test = test.drop(columns=["id"])

In [25]:
# Make predictions on the test set
y_pred = model.predict(x_test)
test["Predict"] = y_pred

In [26]:
submission = test[["id", "Predict"]]
submission

Unnamed: 0,id,Predict
0,1,121.526992
1,2,170.295868
2,3,111.874495
3,4,162.697085
4,5,119.353689
...,...,...
22615,22616,117.414445
22616,22617,90.853289
22617,22618,188.040682
22618,22619,186.632071


In [27]:
submission.to_csv("catboost_submission_optimal.csv", index=False)