In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import TweedieRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_tweedie_deviance
from feature_engine.imputation import CategoricalImputer

In [None]:
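# Requires the feature_engine package. If the import above fails, run
# `%pip install feature_engine` once and then re-run the imports cell.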

In [None]:
# Load the labelled training data and fill missing values.
# CategoricalImputer is used with its library defaults
# (presumably it only touches the categorical columns - confirm against the feature_engine docs).
train_data = pd.read_csv('InsNova_data_2023_train.csv')
train_data = CategoricalImputer().fit_transform(train_data)

# Load the validation (submission) data and impute it the same way.
validation_data = pd.read_csv('InsNova_data_2023_vh.csv')
validation_data = CategoricalImputer().fit_transform(validation_data)

# Preprocess data
# Assuming 'id' is not useful for prediction.
# 'claimcst0' is the target; 'clm' and 'numclaims' are dropped with it
# (presumably because they are claim outcomes that would leak the target - confirm).
X = train_data.drop(columns=['id', 'clm', 'numclaims', 'claimcst0'])
y = train_data['claimcst0']

# Split data into training and test sets.
# A fixed random_state makes the split, and every metric computed on it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Validation features
X_val = validation_data.drop(columns=['id'])

In [None]:
# Display the training feature matrix produced by train_test_split
X_train

Unnamed: 0,veh_value,exposure,veh_body,veh_age,gender,area,agecat,engine_type,max_power,driving_history_score,veh_color,marital_status,e_bill,time_of_week_driven,time_driven,trm_len,credit_score,high_education_ind
9259,1.99,0.365893,HDTOP,3,F,A,1,petrol,230,56.0,yellow,M,1,weekday,12pm - 6pm,12,651.763615,0.0
14571,1.89,0.707842,MIBUS,4,M,C,4,dissel,263,65.0,black,S,1,weekday,6pm - 12am,12,642.434692,0.0
10668,1.68,0.081307,STNWG,4,M,A,3,dissel,109,41.0,gray,S,0,weekend,6am - 12pm,6,643.159667,0.0
7830,1.51,0.233288,SEDAN,2,F,C,4,petrol,178,92.0,black,M,1,weekday,12pm - 6pm,12,643.703073,0.0
21582,1.87,0.605976,HDTOP,3,M,A,3,dissel,292,84.0,white,S,0,weekday,6am - 12pm,12,638.636816,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15258,1.26,0.959456,SEDAN,2,M,A,2,petrol,205,52.0,white,S,1,weekend,6pm - 12am,12,655.880393,1.0
7669,1.52,0.045799,HBACK,1,F,C,3,petrol,145,91.0,white,M,0,weekday,12pm - 6pm,12,646.145700,0.0
16711,0.72,0.073608,SEDAN,4,F,A,6,petrol,121,76.0,black,M,0,weekday,6am - 12pm,6,643.612708,0.0
2214,1.07,0.206132,HBACK,3,F,A,3,petrol,93,47.0,green,M,0,weekday,6am - 12pm,6,657.495078,0.0


In [None]:
# Partition the feature columns by dtype: 64-bit numeric columns are scaled,
# object/category columns are one-hot encoded further down.
numeric_frame = X_train.select_dtypes(include=['int64', 'float64'])
categorical_frame = X_train.select_dtypes(include=['object', 'category'])

numerical_features = list(numeric_frame.columns)
categorical_features = list(categorical_frame.columns)

In [None]:
# Create the preprocessing step: standardise numeric columns and one-hot
# encode categorical columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore' encodes a category that was not present when the
        # encoder was fitted as an all-zero row instead of raising, so a rare level
        # missing from one CV fold or from the training split cannot abort
        # scoring or prediction.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

In [None]:
# Define the model pipeline: preprocessing followed by a Tweedie GLM
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', TweedieRegressor())])

# Score every candidate with mean Tweedie deviance at one fixed power so that all
# (power, alpha) combinations are compared on the same metric.
# greater_is_better=False negates the deviance because GridSearchCV maximises the score.
neg_mtd_scorer = make_scorer(mean_tweedie_deviance, greater_is_better=False, power=1.5)

# Only powers strictly between 1 and 2 are searched. The previous grid also
# contained 2.5, 3 and 3.2; in the recorded run those accounted for the
# 105 failed fits (3 powers x 7 alphas x 5 folds) with
# "Some value(s) of y are out of the valid range of the loss", presumably because
# the target contains zeros, which powers >= 2 do not allow.
param_grid = {
    'regressor__power': [1.1, 1.5, 1.7, 1.9],
    'regressor__alpha': [0.1, 0.5, 1.0, 2.0, 2.5, 3, 3.2],
}


# Initialize GridSearchCV with k-fold cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=neg_mtd_scorer, n_jobs=-1)

# Fit on the training split only. Fitting on the full X, y would let the model
# see the X_test rows and make the hold-out evaluation below meaningless.
grid_search.fit(X_train, y_train)

# Output best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best score found: ", grid_search.best_score_)

# Get the best model (refitted by GridSearchCV on the whole training split)
best_model = grid_search.best_estimator_

# Hold-out predictions, taken while the model has not yet seen X_test
y_pred_test = best_model.predict(X_test)

# With the hold-out predictions stored, refit the tuned pipeline on every labelled
# row so that later uses of best_model benefit from all data.
# After this line best_model has seen X_test: do not recompute y_pred_test from it.
best_model.fit(X, y)

105 fits failed out of a total of 245.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
105 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_glm/glm.py", line 226, in fit
    raise ValueError(
ValueError: Some value(s) of y are out of the valid range of the loss 'HalfTweedieLoss'.

          nan          nan -80.46670542 -79.8893952  -79

Best parameters found:  {'regressor__alpha': 2.0, 'regressor__power': 1.7}
Best score found:  -79.33251169008668


In [None]:
# Define Gini coefficient calculation functions
def calculate_gini(actual, pred):
    """Return the (unnormalised) Gini coefficient of `pred` as a ranking of `actual`.

    Observations are ordered by descending prediction, with ties broken by
    original position. The cumulative share of `actual` along that ordering is
    summed and compared with the value expected from a random ordering.

    Parameters
    ----------
    actual : sequence of numbers
        Observed values.
    pred : sequence of numbers
        Predicted values, same length as `actual`.

    Returns
    -------
    float
        Positive when high predictions concentrate the actual values,
        negative when the ranking is inverted.
    """
    n_obs = len(actual)
    assert (n_obs == len(pred))

    losses = np.asarray(actual, dtype=np.float64)
    scores = np.asarray(pred, dtype=np.float64)

    # lexsort uses the last key as the primary one: descending score first,
    # then ascending original position for tied scores.
    ranking = np.lexsort((np.arange(n_obs), -scores))
    ranked_losses = losses[ranking]

    cumulative_share = ranked_losses.cumsum().sum() / ranked_losses.sum()
    return (cumulative_share - (n_obs + 1) / 2.) / n_obs

def normalized_gini(actual, pred):
    """Return the Gini of `pred` divided by the Gini of a perfect ranking.

    The denominator is the Gini obtained when `actual` is used to rank itself.
    """
    model_gini = calculate_gini(actual, pred)
    perfect_gini = calculate_gini(actual, actual)
    return model_gini / perfect_gini

# Calculate the Normalized Gini Coefficient for the test set:
# the Gini of the model's ranking divided by the Gini of a perfect ranking.
# NOTE(review): this is only a genuine hold-out estimate if the model that produced
# y_pred_test never saw the X_test rows during fitting - confirm against the fitting cell.
normalized_gini_index = normalized_gini(y_test, y_pred_test)
normalized_gini_index

0.20338129796969204

In [None]:
# Score the validation rows with the tuned pipeline
y_val_pred = best_model.predict(X_val)

# Build the submission table: start from the validation ids, then attach
# one prediction per row
submission_df = pd.DataFrame({'id': validation_data['id']})
submission_df['Predict'] = y_val_pred

# Save the submission file
#submission_df.to_csv('submission_2.66.csv', index=False)

In [None]:
# Load the submission file 'submission_4.csv' from disk
# (presumably an earlier submission kept for comparison with submission_df - confirm)
df = pd.read_csv('submission_4.csv')

In [None]:
# Display the contents loaded from 'submission_4.csv'
df

Unnamed: 0,id,Predict
0,1,120.509966
1,2,105.700607
2,3,87.850537
3,4,154.766588
4,5,99.324863
...,...,...
22615,22616,157.760535
22616,22617,128.011533
22617,22618,213.621776
22618,22619,220.660448


In [None]:
# Display the predictions generated in this notebook run
submission_df

Unnamed: 0,id,Predict
0,1,120.776944
1,2,106.406531
2,3,86.878628
3,4,160.031663
4,5,99.634674
...,...,...
22615,22616,158.843466
22616,22617,128.839625
22617,22618,212.149970
22618,22619,217.620407


In [None]:
# Pull the fitted steps out of the tuned pipeline
regressor_step = best_model.named_steps['regressor']
preprocessor_step = best_model.named_steps['preprocessor']

coefficients = regressor_step.coef_

# Feature names in the order of the transformed matrix: the first two fitted
# transformers of the ColumnTransformer, each entry being (name, transformer, columns)
fitted_transformers = [entry[1] for entry in preprocessor_step.transformers_[:2]]
features = [
    feature_name
    for transformer in fitted_transformers
    for feature_name in transformer.get_feature_names_out()
]

# One row per feature, ranked by coefficient magnitude
coefficients_df = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})
coefficients_df['Absolute_Coefficient'] = coefficients_df['Coefficient'].abs()
coefficients_df = coefficients_df.sort_values('Absolute_Coefficient', ascending=False)

# Display the top features
print("Top Features:")
print(coefficients_df.head(10))

Top Features:
                        Feature  Coefficient  Absolute_Coefficient
1                      exposure     0.237239              0.237239
3                        agecat    -0.131486              0.131486
7                       trm_len    -0.076812              0.076812
5         driving_history_score     0.068459              0.068459
2                       veh_age     0.058822              0.058822
46  time_of_week_driven_weekday    -0.055931              0.055931
47  time_of_week_driven_weekend     0.055924              0.055924
25                       area_A    -0.055839              0.055839
30                       area_F     0.053011              0.053011
45             marital_status_S    -0.045158              0.045158


In [None]:
import plotly.express as px

# The ten rows at the top of coefficients_df (already sorted by absolute coefficient)
top_features = coefficients_df.head(10)

# Horizontal bar chart: one bar per feature, coloured by coefficient value
bar_options = dict(
    x='Coefficient',
    y='Feature',
    orientation='h',
    color='Coefficient',
    labels={'Coefficient': 'Coefficient Value', 'Feature': 'Feature'},
    title='Top Features and Coefficients',
    template='plotly_dark',  # dark theme
    color_continuous_scale='Blues',
)
fig = px.bar(top_features, **bar_options)

fig.show()