<a href="https://www.kaggle.com/code/yrpcio/insurance-eda-and-cat-boost?scriptVersionId=226840811" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from scipy.stats import normaltest
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.impute import SimpleImputer
from scipy.stats import zscore
warnings.simplefilter(action = "ignore", category = RuntimeWarning)
from scipy.stats import skew
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [None]:
# Root of the mounted Kaggle inputs and the competition folder inside it.
# Keeping the folder in one place avoids repeating the literal path per file.
INPUT_ROOT = '/kaggle/input'
DATA_DIR = os.path.join(INPUT_ROOT, 'playground-series-s4e12')

# List every file available under the input root.
for dirname, _, filenames in os.walk(INPUT_ROOT):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load the three competition files into DataFrames.
train_csv = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_csv = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
sample_submission_csv = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

**EDA**

In [None]:
# Show the column labels of the training frame.
train_csv.columns

In [None]:
# Boolean mask that is True for every row repeating an earlier row in full.
duplicates = train_csv.duplicated()

# Display the flagged rows; an empty frame here means there is nothing to drop.
train_csv.loc[duplicates]

In [None]:
# Lift pandas' cap on displayed columns so wide frames are shown in full.
pd.set_option('display.max_columns', None)
# Preview the first three rows of the training frame.
train_csv.head(3)

In [None]:
# Print pandas' summary of the training frame (columns, non-null counts, dtypes).
train_csv.info()

**Data types look fine**

In [None]:
# Count the missing values in each column before any imputation is applied.
na_counts = train_csv.isnull().sum()
na_counts

In [None]:
# Numeric-only view of the training frame (a DataFrame, not a list of names);
# other cells read `num_cols.columns`, so the name is kept as is.
num_cols = train_csv.select_dtypes(include=[np.number])


def classify_skewness(value):
    """Bucket a skewness value into the three labels used by this notebook.

    |value| <= 0.5 -> "Symmetric (Normal)"; 0.5 < |value| <= 1 -> "Moderate Skew";
    anything else, including NaN (every comparison is False), -> "Highly Skewed".
    """
    if -0.5 <= value <= 0.5:
        return "Symmetric (Normal)"
    # The symmetric band was handled above, so this only matches 0.5 < |value| <= 1.
    if -1 <= value <= 1:
        return "Moderate Skew"
    return "Highly Skewed"


# DataFrame.skew() computes the per-column skewness in one vectorised call.
skewness_results = num_cols.skew().to_frame(name="Skewness")
skewness_results["Skewness Type"] = skewness_results["Skewness"].map(classify_skewness)

# Last expression of the cell: rendered as a table instead of plain printed text.
skewness_results

In [None]:
# Split the numeric column names by skewness label: the outlier checks use a
# different rule for roughly symmetric columns than for skewed ones.
is_symmetric = skewness_results["Skewness Type"] == "Symmetric (Normal)"
normal = skewness_results.loc[is_symmetric].index.tolist()
skewed = skewness_results.loc[~is_symmetric].index.tolist()

**The data has both skewed and approximately normal columns. Use mean imputation for the normal columns and median imputation for the skewed ones.**

In [None]:
def impute_based_on_skewness(data):
    """Fill missing numeric values in ``data``, choosing the statistic by skewness.

    For every numeric column of ``data`` that has at least one missing value,
    the skewness of the observed (non-null) values is computed with
    ``scipy.stats.skew``. Columns whose skewness lies in [-0.5, 0.5] are filled
    with the mean of the observed values; all other columns are filled with
    their median.

    The numeric columns are taken from ``data`` itself rather than from a
    notebook-level variable, so the function works on any frame (for example
    one that lacks the target column).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is modified in place; pass a copy to keep the
        original untouched.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with numeric gaps filled.
        Non-numeric columns and fully-missing numeric columns are left as is.
    """
    numeric_columns = data.select_dtypes(include=[np.number]).columns

    for col in numeric_columns:
        observed = data[col].dropna()

        # Nothing to fill, or nothing to estimate a fill value from.
        if len(observed) == len(data) or observed.empty:
            continue

        col_skewness = skew(observed)

        # Roughly symmetric -> mean; otherwise the median is more robust.
        if -0.5 <= col_skewness <= 0.5:
            fill_value = observed.mean()
        else:
            fill_value = observed.median()

        data[col] = data[col].fillna(fill_value)

    return data
    

In [None]:
# Impute on a copy so the function's in-place writes never touch the frame that
# was passed in. The function already returns a DataFrame, so it is used directly.
train_csv = impute_based_on_skewness(train_csv.copy())

In [None]:
# Fill gaps in the object-dtype columns with each column's most frequent value.
categorical_cols = train_csv.select_dtypes(include=["object"]).columns

# Learn the per-column modes first, then write the filled values back.
imputer = SimpleImputer(strategy="most_frequent")
imputer.fit(train_csv[categorical_cols])
train_csv[categorical_cols] = imputer.transform(train_csv[categorical_cols])

print("Categorical values imputed using most frequent strategy!")

In [None]:
# Recount missing values per column to confirm what the imputation steps left behind.
na_counts = train_csv.isnull().sum()
na_counts

| Metric | How to use it for outliers |
|---|---|
| mean | If much larger or smaller than the median (50%), the data is skewed and may contain outliers. |
| std (standard deviation) | If very high, the column has high variability (possible extreme values). |
| min & max | If max is far from Q3 (75%) or min is far from Q1 (25%), extreme values exist. |
| 25% (Q1) & 75% (Q3) | Use them to compute the IQR and check for values outside the 1.5 × IQR range. |
| 50% (median) | If very different from the mean, the data is skewed (potential outliers). |

In [None]:
# Render floats with two decimals so the summary table is easier to scan.
pd.set_option('display.float_format', '{:.2f}'.format)

# Summary statistics of the numeric columns.
train_csv.describe()

✔ IQR for skewed columns (better for non-normal data)

✔ Z-score for normal columns (better for normal distributions)

In [None]:

# Function to detect outliers using Z-score
def detect_outliers_zscore(data, threshold=3):
    """Count, per column, the values whose absolute Z-score exceeds ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all checked; Z-scores are computed on the
        non-null values of each column.
    threshold : float, default 3
        A value is counted when its absolute Z-score is strictly greater.

    Returns
    -------
    pandas.DataFrame
        One row per input column with "Total Outliers" and
        "Percentage of Outliers" (share of ``len(data)``, rounded to 2 decimals).
    """
    total_rows = len(data)

    def _summarise(series):
        # Absolute Z-scores of the observed values only.
        abs_z = np.abs(zscore(series.dropna()))
        n_flagged = (abs_z > threshold).sum()
        return {
            "Total Outliers": n_flagged,
            "Percentage of Outliers": round((n_flagged / total_rows) * 100, 2),
        }

    per_column = {name: _summarise(data[name]) for name in data.columns}

    # Transpose so that each checked column becomes a row of the report.
    return pd.DataFrame(per_column).T

# Apply the Z-score rule to the columns classified as roughly symmetric.
outlier_results_z = detect_outliers_zscore(train_csv[normal])

# Last expression of the cell: rendered as a table instead of plain printed text.
outlier_results_z


**No outliers in normal data**

In [None]:

# Function to detect outliers using IQR
def detect_outliers_iqr(data):
    """Count, per column, the values lying outside the 1.5 x IQR fences.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all checked.

    Returns
    -------
    pandas.DataFrame
        One row per input column with "Total Outliers" and
        "Percentage of Outliers" (share of ``len(data)``, rounded to 2 decimals).
    """
    n_rows = len(data)
    summary = {}

    for name in data.columns:
        series = data[name]

        # Quartiles and the spread between them.
        q1, q3 = series.quantile(0.25), series.quantile(0.75)
        spread = q3 - q1

        # A value is an outlier when it falls below the lower fence or above the upper one.
        too_low = series < q1 - 1.5 * spread
        too_high = series > q3 + 1.5 * spread
        n_flagged = series[too_low | too_high].count()

        summary[name] = {
            "Total Outliers": n_flagged,
            "Percentage of Outliers": round((n_flagged / n_rows) * 100, 2),
        }

    # Transpose so that each checked column becomes a row of the report.
    return pd.DataFrame(summary).T

# Run the function
outlier_results = detect_outliers_iqr(train_csv[skewed])

# Display the result
print(outlier_results)


> Annual Income and Premium Amount are important columns, and dropping their outlier rows would risk losing information. Instead, log-transform them (together with Previous Claims) to reduce the influence of extreme values.


In [None]:


# Columns to compress with log1p (log(1 + x), which is defined at x == 0).
skewed_cols = ["Annual Income", "Previous Claims","Premium Amount"]  # Modify based on data distribution

# Build the transformed version of each listed column that actually exists,
# then swap them into a new frame in a single step.
log_columns = {
    name: np.log1p(train_csv[name])
    for name in skewed_cols
    if name in train_csv.columns
}
transformed_data = train_csv.assign(**log_columns)

# From here on `train_csv` holds the log-transformed values.
train_csv = transformed_data.copy()

In [None]:
# Independent snapshot of the frame at this point (after imputation and the log
# transform, before date features are derived). It is not read again in the
# cells visible here.
train_csv_=train_csv.copy()

In [None]:
# Parse the policy start date once, expose its calendar parts as numeric
# features, then drop the raw date and the row identifier.
policy_start = pd.to_datetime(train_csv['Policy Start Date'])
train_csv = train_csv.assign(
    Year=policy_start.dt.year,
    Day=policy_start.dt.day,
    Month=policy_start.dt.month,
).drop(columns=['id', 'Policy Start Date'])

In [None]:
# Features and target. "Premium Amount" was log1p-transformed in an earlier
# cell, so the model learns the target on the log scale.
X = train_csv.drop(columns=["Premium Amount"])
y = train_csv["Premium Amount"]

# Object/category columns are passed to CatBoost by name as categorical features.
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# Hold out 20% of the rows so training progress is measured on data the model
# is not fitted on; the fixed random_state keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = CatBoostRegressor(
    iterations=500,        # Number of boosting rounds
    learning_rate=0.05,    # Step size for learning
    depth=6,               # Tree depth
    cat_features=categorical_cols,  # Let CatBoost handle categorical data
    loss_function="RMSE",  # Optimised objective
    eval_metric="MAE",     # Metric reported in the training log
    random_seed=42,        # Explicit seed for reproducible training
    verbose=100
)

# With an eval_set CatBoost reports the metric on the held-out rows as well and,
# by default, keeps the iteration that scored best on them (see `use_best_model`).
model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Predictions for X, the full processed training frame. X contains the rows the
# model was fitted on, so these numbers describe fit quality, not held-out error.
y_pred = model.predict(X)

# y is on the log1p scale, so both errors are in log units; RMSE on this scale
# corresponds to RMSLE on the raw premium amounts.
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

print(f"Training-frame MAE (log1p scale): {mae:.4f}")
print(f"Training-frame RMSE (log1p scale, i.e. RMSLE): {rmse:.4f}")

In [None]:
# Ad-hoc score: one minus the MAE expressed as a fraction of the mean target.
# Both quantities come from `y` / `y_pred` as they are at this point in the notebook.
mae = mean_absolute_error(y, y_pred)
regression_accuracy = 1 - mae / y.mean()

In [None]:
# Display the score computed as 1 - MAE / mean(y).
regression_accuracy

In [None]:
# The submission must contain predictions for the TEST rows. Build test features
# with the same steps that were applied to the training frame, predict, and only
# then undo the log1p transform of the target.
test_features = test_csv.copy()

# Calendar parts of the policy start date, mirroring the training features.
policy_start = pd.to_datetime(test_features["Policy Start Date"])
test_features["Year"] = policy_start.dt.year
test_features["Day"] = policy_start.dt.day
test_features["Month"] = policy_start.dt.month

# Same log1p transform as in training (the target is absent from the test file
# and is therefore skipped by the membership check).
for col in skewed_cols:
    if col in test_features.columns:
        test_features[col] = np.log1p(test_features[col])

# Keep exactly the model's feature columns, in training order.
test_features = test_features[X.columns]

# Fill gaps with statistics of the processed training features so the test rows
# are treated like the training rows: mode for categorical columns, mean for
# roughly symmetric numeric columns, median otherwise. Log-transformed columns
# always use the median because it is preserved by the monotone transform.
cat_feature_names = set(X.select_dtypes(include=["object", "category"]).columns)
for col in X.columns:
    if not test_features[col].isna().any():
        continue
    if col in cat_feature_names:
        fill_value = X[col].mode().iloc[0]
    elif col in normal and col not in skewed_cols:
        fill_value = X[col].mean()
    else:
        fill_value = X[col].median()
    test_features[col] = test_features[col].fillna(fill_value)

# Predictions are on the log1p scale; expm1 is the exact inverse.
test_pred_log = model.predict(test_features)

# Start from the sample submission loaded at the top of the notebook.
# NOTE(review): assumes its rows are in the same order as test.csv - confirm.
submit = sample_submission_csv.copy()
assert len(submit) == len(test_pred_log), "prediction count must match the submission rows"
submit["Premium Amount"] = np.expm1(test_pred_log)
submit.to_csv("submission.csv",index=False)