In [1]:
import pandas as pd
import numpy as np

# Switch on the pandas "mode.copy_on_write" option for the rest of the session.
pd.set_option("mode.copy_on_write", True)

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the training data into `df` and preview the first rows.
train_csv_path = "../data/InsNova_data_2023_train.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,id,veh_value,exposure,veh_body,veh_age,gender,area,agecat,engine_type,max_power,...,marital_status,e_bill,time_of_week_driven,time_driven,trm_len,credit_score,high_education_ind,clm,numclaims,claimcst0
0,1,0.77,0.444504,SEDAN,4,M,D,3,petrol,147,...,S,1,weekday,6pm - 12am,6,640.448137,1.0,0,0,0.0
1,2,4.45,0.562183,STNWG,1,M,A,3,petrol,158,...,S,1,weekday,6am - 12pm,12,683.749691,0.0,0,0,0.0
2,3,4.9,0.465244,STNWG,1,F,A,3,petrol,159,...,M,1,weekday,6pm - 12am,6,653.656117,1.0,0,0,0.0
3,4,0.48,0.271039,PANVN,4,M,A,4,petrol,80,...,S,1,weekday,12pm - 6pm,12,642.574671,0.0,0,0,0.0
4,5,0.85,0.141624,SEDAN,4,F,A,5,petrol,126,...,S,0,weekday,6am - 12pm,6,647.175035,0.0,0,0,0.0


In [3]:
# Print a summary of the training frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22619 entries, 0 to 22618
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     22619 non-null  int64  
 1   veh_value              22619 non-null  float64
 2   exposure               22619 non-null  float64
 3   veh_body               22619 non-null  object 
 4   veh_age                22619 non-null  int64  
 5   gender                 22619 non-null  object 
 6   area                   22619 non-null  object 
 7   agecat                 22619 non-null  int64  
 8   engine_type            22619 non-null  object 
 9   max_power              22619 non-null  int64  
 10  driving_history_score  22619 non-null  float64
 11  veh_color              22619 non-null  object 
 12  marital_status         22619 non-null  object 
 13  e_bill                 22619 non-null  int64  
 14  time_of_week_driven    22619 non-null  object 
 15  ti

In [4]:
# Store the indicator with object dtype so that a later object-dtype column
# selection picks it up together with the string columns.
df = df.astype({"high_education_ind": "object"})

In [5]:
# Every object-dtype column of the training frame is treated as categorical.
categorical_cols = df.select_dtypes(include=["object"]).columns

# One LabelEncoder per categorical column, keyed by column name. Each encoder
# is fitted on its column and the column is replaced by the integer codes.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

In [6]:
# `claimcst0` is the regression target. The identifier and the other two listed
# columns are excluded from the features as well (presumably claim outcomes
# that would leak the target -- confirm against the data dictionary).
target_col = "claimcst0"
non_feature_cols = ["id", "clm", "numclaims", target_col]

x_train = df.drop(columns=non_feature_cols)
y_train = df[target_col]

In [7]:
# Rows with a positive target get weight 1.2, all other rows get weight 0.8.
has_positive_target = y_train > 0
sample_weights = np.where(has_positive_target, 1.2, 0.8)

In [8]:
# Hyperparameters of the boosted-tree regressor, collected in one mapping so
# they can be read and adjusted in a single place.
xgb_params = {
    "objective": "reg:squarederror",  # For regression tasks
    "random_state": 4,
    "n_estimators": 100,
    "max_depth": 7,
    "eta": 0.1,
    "subsample": 0.7,
    "colsample_bytree": 1,
}
model = xgb.XGBRegressor(**xgb_params)

# Fit on the training features and target, passing one weight per row.
model.fit(x_train, y_train, sample_weight=sample_weights)

In [9]:
# Read the data set to be scored into `test`.
test_csv_path = "../data/InsNova_data_2023_vh.csv"
test = pd.read_csv(test_csv_path)

In [10]:
# Store the indicator with object dtype, the same cast that is applied to the
# training frame, so that it is selected as a categorical column.
test = test.astype({"high_education_ind": "object"})

In [11]:
# Object-dtype columns of the test frame are the categorical features to encode.
categorical_cols = test.select_dtypes(include=["object"]).columns

# Reuse the encoders that were fitted on the training frame instead of fitting
# new ones here. A LabelEncoder fitted on the test data alone derives its
# category -> integer mapping from whichever categories happen to occur in the
# test file, so a category can end up with a different code than the one the
# model saw during training.
unfitted_cols = [col for col in categorical_cols if col not in label_encoders]
if unfitted_cols:
    raise KeyError(
        f"No encoder fitted on the training data for column(s): {unfitted_cols}"
    )

for col in categorical_cols:
    # transform() raises ValueError when the test data contains a category that
    # was not present at fit time, which makes such a mismatch visible instead
    # of silently shifting the codes.
    test[col] = label_encoders[col].transform(test[col])

In [12]:
# Select exactly the columns the model was trained on, by name and in training
# order. This leaves out "id", does not depend on the column order of the test
# file, and stays correct when the cell is re-run after other columns (such as
# the prediction column) have been added to `test`. A training feature missing
# from `test` raises KeyError here rather than failing later inside predict().
x_test = test[x_train.columns]

In [13]:
# Score the test features with the fitted model.
raw_pred = model.predict(x_test)

# Negative predictions are raised to zero; there is no upper bound.
y_pred = raw_pred.clip(min=0)

# Keep the predictions next to the rows they belong to.
test["Predict"] = y_pred

In [14]:
# Summary statistics of the clipped predictions, shown as a quick sanity check.
test["Predict"].describe()

count    22620.000000
mean       408.247223
std        564.968262
min          0.000000
25%         65.178841
50%        283.902496
75%        557.709869
max      16756.128906
Name: Predict, dtype: float64

In [15]:
# The submission consists of the row identifier and its prediction only.
submission_cols = ["id", "Predict"]
submission = test.loc[:, submission_cols]
submission

Unnamed: 0,id,Predict
0,1,98.139961
1,2,4668.424316
2,3,27.678736
3,4,1149.204712
4,5,159.811096
...,...,...
22615,22616,558.618347
22616,22617,0.000000
22617,22618,613.287415
22618,22619,343.162811


In [16]:
# Write the submission without the DataFrame index column.
submission_path = "../output/xgboost_submission.csv"
submission.to_csv(submission_path, index=False)