In [1]:
!pip install catboost
!pip install scikit-learn

import catboost as cb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool

pd.set_option("mode.copy_on_write", True)

In [2]:
df = pd.read_csv("../data/InsNova_data_2023_train.csv")
df.head()

Unnamed: 0,id,veh_value,exposure,veh_body,veh_age,gender,area,agecat,engine_type,max_power,...,marital_status,e_bill,time_of_week_driven,time_driven,trm_len,credit_score,high_education_ind,clm,numclaims,claimcst0
0,1,0.77,0.444504,SEDAN,4,M,D,3,petrol,147,...,S,1,weekday,6pm - 12am,6,640.448137,1.0,0,0,0.0
1,2,4.45,0.562183,STNWG,1,M,A,3,petrol,158,...,S,1,weekday,6am - 12pm,12,683.749691,0.0,0,0,0.0
2,3,4.9,0.465244,STNWG,1,F,A,3,petrol,159,...,M,1,weekday,6pm - 12am,6,653.656117,1.0,0,0,0.0
3,4,0.48,0.271039,PANVN,4,M,A,4,petrol,80,...,S,1,weekday,12pm - 6pm,12,642.574671,0.0,0,0,0.0
4,5,0.85,0.141624,SEDAN,4,F,A,5,petrol,126,...,S,0,weekday,6am - 12pm,6,647.175035,0.0,0,0,0.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22619 entries, 0 to 22618
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     22619 non-null  int64  
 1   veh_value              22619 non-null  float64
 2   exposure               22619 non-null  float64
 3   veh_body               22619 non-null  object 
 4   veh_age                22619 non-null  int64  
 5   gender                 22619 non-null  object 
 6   area                   22619 non-null  object 
 7   agecat                 22619 non-null  int64  
 8   engine_type            22619 non-null  object 
 9   max_power              22619 non-null  int64  
 10  driving_history_score  22619 non-null  float64
 11  veh_color              22619 non-null  object 
 12  marital_status         22619 non-null  object 
 13  e_bill                 22619 non-null  int64  
 14  time_of_week_driven    22619 non-null  object 
 15  ti

In [4]:
df["high_education_ind"] = df["high_education_ind"].astype("int64")
df["high_education_ind"] = df["high_education_ind"].astype("category")

In [5]:
categorical_features = [
    "veh_body",
    "veh_age",
    "gender",
    "area",
    "agecat",
    "engine_type",
    "veh_color",
    "marital_status",
    "time_of_week_driven",
    "time_driven",
    "e_bill",
    "trm_len",
    "high_education_ind",
]

In [6]:
for col in categorical_features:
    df[col] = df[col].astype("category")

In [None]:
X = df.drop(
    columns=["id", "clm", "numclaims", "claimcst0"]
)  # assuming 'claimcst0' is your target variable
y = df["claimcst0"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
cat_model = cb.CatBoostRegressor()

In [None]:
param_grid = {
    "depth": [6, 8, 10],
    "learning_rate": [0.01, 0.05, 0.1],
    "iterations": [500, 1000, 1500],
}

In [None]:
# Using Grid Search
grid_search = GridSearchCV(
    estimator=cat_model, param_grid=param_grid, cv=5, scoring="neg_mean_squared_error"
)

# Or using Random Search
# random_search = RandomizedSearchCV(estimator=cat_model, param_distributions=param_grid, cv=5, scoring='neg_mean_squared_error', n_iter=50, random_state=42)

# Fit the model
grid_search.fit(X_train, y_train)

In [None]:
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Evaluate the best model
best_model = grid_search.best_estimator_
best_model.score(X_test, y_test)  # You can use other metrics as needed

In [7]:
x_train = df.drop(columns=["id", "clm", "numclaims", "claimcst0"])
y_train = df["claimcst0"]

In [8]:
train_data = Pool(
    data=x_train,
    label=y_train,
    cat_features=[
        "veh_body",
        "veh_age",
        "gender",
        "area",
        "agecat",
        "engine_type",
        "veh_color",
        "marital_status",
        "time_of_week_driven",
        "time_driven",
        "e_bill",
        "trm_len",
        "high_education_ind",
    ],
)

In [9]:
# Define the model parameters
model = CatBoostRegressor(
    iterations=100, depth=10, learning_rate=0.1, loss_function="RMSE"
)

# Train the regression model
model.fit(train_data)

0:	learn: 1270.4405793	total: 63.6ms	remaining: 6.3s
1:	learn: 1270.3172622	total: 67.3ms	remaining: 3.29s
2:	learn: 1268.8251059	total: 77.1ms	remaining: 2.49s
3:	learn: 1259.2156437	total: 86.2ms	remaining: 2.07s
4:	learn: 1254.4907981	total: 93.3ms	remaining: 1.77s
5:	learn: 1246.0982852	total: 100ms	remaining: 1.57s
6:	learn: 1241.0291153	total: 107ms	remaining: 1.42s
7:	learn: 1240.9475727	total: 110ms	remaining: 1.27s
8:	learn: 1238.8761207	total: 116ms	remaining: 1.18s
9:	learn: 1238.8479813	total: 119ms	remaining: 1.07s
10:	learn: 1238.7899304	total: 125ms	remaining: 1.01s
11:	learn: 1238.7350598	total: 132ms	remaining: 971ms
12:	learn: 1238.7278472	total: 137ms	remaining: 917ms
13:	learn: 1238.6997724	total: 140ms	remaining: 858ms
14:	learn: 1236.3011341	total: 147ms	remaining: 831ms
15:	learn: 1234.8186382	total: 153ms	remaining: 805ms
16:	learn: 1234.7859035	total: 158ms	remaining: 773ms
17:	learn: 1230.8012552	total: 163ms	remaining: 741ms
18:	learn: 1230.7836751	total: 166

<catboost.core.CatBoostRegressor at 0x13626cfd0>

In [10]:
test = pd.read_csv("../data/InsNova_data_2023_vh.csv")

In [11]:
test["high_education_ind"] = test["high_education_ind"].astype("int64")
test["high_education_ind"] = test["high_education_ind"].astype("category")

In [12]:
for col in categorical_features:
    test[col] = test[col].astype("category")

In [13]:
x_test = test.drop(columns=["id"])

In [20]:
# Make predictions on the test set
y_pred = model.predict(x_test)
test["Predict"] = y_pred

In [22]:
submission = test[["id", "Predict"]]
submission

Unnamed: 0,id,Predict
0,1,91.648187
1,2,139.974654
2,3,57.632904
3,4,212.747809
4,5,123.827861
...,...,...
22615,22616,117.358864
22616,22617,101.490830
22617,22618,180.502482
22618,22619,185.520659


In [23]:
submission.to_csv("../output/catboost_submission.csv", index=False)