In [1]:
import pandas as pd
import numpy as np

# Turn on pandas' "mode.copy_on_write" option for the whole session.
pd.set_option("mode.copy_on_write", True)

import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.linear_model import TweedieRegressor
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the training data from disk and preview the first rows.
train_csv = "../data/InsNova_data_2023_train.csv"
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,id,veh_value,exposure,veh_body,veh_age,gender,area,agecat,engine_type,max_power,...,marital_status,e_bill,time_of_week_driven,time_driven,trm_len,credit_score,high_education_ind,clm,numclaims,claimcst0
0,1,0.77,0.444504,SEDAN,4,M,D,3,petrol,147,...,S,1,weekday,6pm - 12am,6,640.448137,1.0,0,0,0.0
1,2,4.45,0.562183,STNWG,1,M,A,3,petrol,158,...,S,1,weekday,6am - 12pm,12,683.749691,0.0,0,0,0.0
2,3,4.9,0.465244,STNWG,1,F,A,3,petrol,159,...,M,1,weekday,6pm - 12am,6,653.656117,1.0,0,0,0.0
3,4,0.48,0.271039,PANVN,4,M,A,4,petrol,80,...,S,1,weekday,12pm - 6pm,12,642.574671,0.0,0,0,0.0
4,5,0.85,0.141624,SEDAN,4,F,A,5,petrol,126,...,S,0,weekday,6am - 12pm,6,647.175035,0.0,0,0,0.0


In [3]:
# Summarize column names, non-null counts and dtypes of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22619 entries, 0 to 22618
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     22619 non-null  int64  
 1   veh_value              22619 non-null  float64
 2   exposure               22619 non-null  float64
 3   veh_body               22619 non-null  object 
 4   veh_age                22619 non-null  int64  
 5   gender                 22619 non-null  object 
 6   area                   22619 non-null  object 
 7   agecat                 22619 non-null  int64  
 8   engine_type            22619 non-null  object 
 9   max_power              22619 non-null  int64  
 10  driving_history_score  22619 non-null  float64
 11  veh_color              22619 non-null  object 
 12  marital_status         22619 non-null  object 
 13  e_bill                 22619 non-null  int64  
 14  time_of_week_driven    22619 non-null  object 
 15  ti

In [4]:
# Store the education indicator as object dtype so the object-dtype column
# selection below picks it up for one-hot encoding.
df = df.astype({"high_education_ind": "object"})

In [5]:
# Names of every object-dtype column; these are treated as categorical.
object_columns = df.select_dtypes(include="object")
categorical_cols = object_columns.columns

In [6]:
# Replace each categorical column with indicator columns; drop_first=True
# omits the first level of every variable.
df = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)

In [7]:
# Build the design matrix and the response.
# The row id and all three claim-related columns are excluded from the
# predictors; "claimcst0" is the response being modelled.
non_feature_cols = ["id", "clm", "numclaims", "claimcst0"]
X = df.drop(columns=non_feature_cols)
y = df.loc[:, "claimcst0"]

In [8]:
# Hold out 20% of the rows (fixed seed for reproducibility) and keep only the
# training portion; the held-out pieces are discarded into `_`.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, _, y_train, _ = split_parts

In [9]:
# Learn the standardization statistics from the training predictors, then
# apply them; the fitted scaler is reused later for the scoring data.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [10]:
# Base estimator with default settings; the grid search below supplies the
# power, alpha, max_iter and solver values that are actually fitted.
model = TweedieRegressor()

In [11]:
# Hyperparameter grid for the TweedieRegressor grid search.
param_grid = {
    # Tweedie variance power: 0 is Normal, 1 is Poisson, values in (1, 2) are
    # compound Poisson-Gamma. Powers strictly between 0 and 1 are excluded
    # because scikit-learn documents that no Tweedie distribution exists
    # there (older releases raise a ValueError for them).
    "power": [0, 1, 1.25, 1.5],
    # Strength of the L2 penalty; 0 fits an unpenalized GLM.
    "alpha": [0, 0.1, 0.25, 0.5, 0.75, 1],
    # max_iter only caps the solver's iterations; it is not a model
    # hyperparameter. A single generous cap avoids multiplying the number of
    # cross-validated fits by the number of caps tried.
    "max_iter": [10000],
    "solver": ["lbfgs"],
}

In [12]:
# Exhaustive search over param_grid using 10-fold cross-validation, ranking
# candidates by negative mean squared error; n_jobs=-1 runs fits in parallel.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Report the winning parameter combination and its mean cross-validated score.
print("Best Parameters: ", grid_search.best_params_)
print("Best Negative Mean Squared Error: ", grid_search.best_score_)

Best Parameters:  {'alpha': 1, 'max_iter': 10000, 'power': 0, 'solver': 'lbfgs'}
Best Negative Mean Squared Error:  -1531811.0693868122


## Test Prediction

In [13]:
# Read the data set that submission predictions are generated for.
test_csv = "../data/InsNova_data_2023_vh.csv"
test = pd.read_csv(test_csv)

In [14]:
# Mirror the training preprocessing: store the education indicator as object
# dtype so it is one-hot encoded with the other categorical columns.
test = test.astype({"high_education_ind": "object"})

In [15]:
# Identify categorical (object-dtype) columns in the scoring data
categorical_cols = test.select_dtypes(include=["object"]).columns
# Perform one-hot encoding for categorical variables
test = pd.get_dummies(test, columns=categorical_cols, drop_first=True)
# The indicator is encoded under the name "high_education_ind_1" here, while
# the fitted model expects "high_education_ind_1.0"; rename it to match.
test = test.rename(columns={"high_education_ind_1": "high_education_ind_1.0"})
# Build the predictor matrix (this file has no target column) and align it to
# the training design matrix. Encoding the two files independently does not
# guarantee the same dummy columns or column order, so reindex against
# X.columns: indicator columns absent from this file are filled with 0 and
# columns never seen during training are dropped.
# NOTE(review): if a variable's first level differs between the two files,
# drop_first=True drops a different level in each; encoding both files against
# a shared category list would be the complete fix.
X_test = test.drop(columns=["id"]).reindex(columns=X.columns, fill_value=0)

In [16]:
# Generate predictions for the scoring data (no evaluation happens here):
# apply the scaler fitted on the training predictors, then predict with the
# estimator the grid search selected.
X_test_scaled = scaler.transform(X_test)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

In [17]:
# Attach the model predictions to the scoring frame as a "Predict" column.
test["Predict"] = y_pred

In [18]:
# Sanity-check the prediction distribution (count, mean, spread, quartiles).
test["Predict"].describe()

count    22620.000000
mean       158.856702
std         39.470767
min         17.578132
25%        131.440008
50%        155.634871
75%        183.623278
max        332.692550
Name: Predict, dtype: float64

In [19]:
# Keep only the row identifier and its prediction, then display the result.
submission_columns = ["id", "Predict"]
submission = test.loc[:, submission_columns]
submission

Unnamed: 0,id,Predict
0,1,134.873897
1,2,137.541424
2,3,78.152419
3,4,190.906951
4,5,119.930010
...,...,...
22615,22616,186.860561
22616,22617,162.378363
22617,22618,197.915544
22618,22619,186.476750


In [20]:
# Write the id/Predict pairs to disk, omitting the DataFrame index.
submission_path = "../output/Tweedie_grid_submission.csv"
submission.to_csv(submission_path, index=False)