In [27]:
import pandas as pd
import numpy as np

import warnings

warnings.filterwarnings('ignore')

In [28]:
train_data = pd.read_csv('/kaggle/input/playground-series-s4e12/train.csv')
test_data  = pd.read_csv('/kaggle/input/playground-series-s4e12/test.csv')

In [29]:
train_data.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [30]:
# Add an indicator column
train_data['Dataset'] = 'train'
test_data['Dataset'] = 'test'

# Concatenate train and test
df = pd.concat([train_data, test_data], axis=0).reset_index(drop=True)

In [31]:
test_data.isnull().sum()

id                           0
Age                      12489
Gender                       0
Annual Income            29860
Marital Status           12336
Number of Dependents     73130
Education Level              0
Occupation              239125
Health Score             49449
Location                     0
Policy Type                  0
Previous Claims         242802
Vehicle Age                  3
Credit Score             91451
Insurance Duration           2
Policy Start Date            0
Customer Feedback        52276
Smoking Status               0
Exercise Frequency           0
Property Type                0
Dataset                      0
dtype: int64

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Age                   float64
 2   Gender                object 
 3   Annual Income         float64
 4   Marital Status        object 
 5   Number of Dependents  float64
 6   Education Level       object 
 7   Occupation            object 
 8   Health Score          float64
 9   Location              object 
 10  Policy Type           object 
 11  Previous Claims       float64
 12  Vehicle Age           float64
 13  Credit Score          float64
 14  Insurance Duration    float64
 15  Policy Start Date     object 
 16  Customer Feedback     object 
 17  Smoking Status        object 
 18  Exercise Frequency    object 
 19  Property Type         object 
 20  Premium Amount        float64
 21  Dataset               object 
dtypes: float64(9), int64(1), object(12)
memory

In [33]:
# Separate numerical and categorical columns
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Display results
print("Numerical Columns:")
print(numerical_columns)

print("\nCategorical Columns:")
print(categorical_columns)

Numerical Columns:
Index(['id', 'Age', 'Annual Income', 'Number of Dependents', 'Health Score',
       'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration',
       'Premium Amount'],
      dtype='object')

Categorical Columns:
Index(['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',
       'Policy Type', 'Policy Start Date', 'Customer Feedback',
       'Smoking Status', 'Exercise Frequency', 'Property Type', 'Dataset'],
      dtype='object')


In [34]:
# Convert 'Policy Start Date' to datetime format
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')

# # Extract meaningful features
# df['Policy Start Year'] = df['Policy Start Date'].dt.year
# df['Policy Start Month'] = df['Policy Start Date'].dt.month
# df['Policy Start Day'] = df['Policy Start Date'].dt.day
# df['Policy Start DayOfWeek'] = df['Policy Start Date'].dt.dayofweek
# df['Policy Start IsWeekend'] = df['Policy Start Date'].dt.dayofweek >= 5

# Extract features
df['year'] = df['Policy Start Date'].dt.year
df['month'] = df['Policy Start Date'].dt.month
df['day'] = df['Policy Start Date'].dt.day
# df['hour'] = df['Policy Start Date'].dt.hour
df['dayofweek'] = df['Policy Start Date'].dt.dayofweek


# Drop the original datetime column
df.drop('Policy Start Date', axis=1, inplace=True)

In [35]:
cat = ['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',
       'Policy Type','Customer Feedback',
       'Smoking Status', 'Exercise Frequency', 'Property Type']

In [36]:
# Convert categorical columns to strings
categorical_features = ['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',
                        'Policy Type', 'Customer Feedback', 'year', 'month', 'day', 'dayofweek',
                        'Smoking Status', 'Exercise Frequency', 'Property Type']

for col in categorical_features:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)


In [37]:
for col in categorical_features:
    X[col] = X[col].fillna("nan").astype(str)
    X_test[col] = X_test[col].fillna("nan").astype(str)


In [38]:
print(X[categorical_features].dtypes)
print(X_test[categorical_features].dtypes)


Gender                object
Marital Status        object
Education Level       object
Occupation            object
Location              object
Policy Type           object
Customer Feedback     object
year                  object
month                 object
day                   object
dayofweek             object
Smoking Status        object
Exercise Frequency    object
Property Type         object
dtype: object
Gender                object
Marital Status        object
Education Level       object
Occupation            object
Location              object
Policy Type           object
Customer Feedback     object
year                  object
month                 object
day                   object
dayofweek             object
Smoking Status        object
Exercise Frequency    object
Property Type         object
dtype: object


In [39]:
# Convert categorical columns to integers (Label Encoding)
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
for col in cat:
    df[col] = label_encoder.fit_transform(df[col])  # Apply LabelEncoder to each column

In [40]:
from sklearn.model_selection import train_test_split

# Split concatenated data back into train and test
train_data = df[df['Dataset'] == 'train'].drop(columns=['Dataset', 'id'])
test_data = df[df['Dataset'] == 'test'].drop(columns=['Dataset', 'Premium Amount'])


# categorical_columns = train_data.select_dtypes(include=['object']).columns

# Example: Assume 'target_column' is your target variable
target_column = 'Premium Amount'  # Replace with the actual target column name
X = train_data.drop(columns=[target_column])  # Features
y = train_data[target_column]  # Target

# # Reshape y to be 2D (required by TabNet)
# y = y.values.reshape(-1, 1)


# Split the data: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Log transform the target variables to stabilize variance
y_train = np.log1p(y_train)  # Log transform training target
y_test = np.log1p(y_test)      # Log transform validation target


# Display the shapes of the resulting splits
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (960000, 22)
X_test shape: (240000, 22)
y_train shape: (960000,)
y_test shape: (240000,)


In [41]:
# import pandas as pd
# from catboost import CatBoostRegressor
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error

# # Initialize the CatBoost model with GPU support
# model = CatBoostRegressor(
#     iterations=1000,
#     depth=8,
#     learning_rate=0.1,
#     loss_function='RMSE',
#     cat_features=['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',
#                   'Policy Type', 'Customer Feedback', 'year', 'month', 'day', 'dayofweek',
#                   'Smoking Status', 'Exercise Frequency', 'Property Type'],
#     task_type='GPU',   # Enable GPU
#     devices='0'        # Use GPU device 0 (adjust if needed for multiple GPUs)
# )

# # Train the model
# model.fit(
#     X_train,
#     y_train,
#     eval_set=(X_test, y_test),
#     verbose=100
# )

# # Make predictions
# y_pred = model.predict(X_test)

# # Evaluate the model
# rmse = mean_squared_error(y_test, y_pred, squared=False)
# print(f"RMSE: {rmse}")


In [42]:
# import pandas as pd
# from catboost import CatBoostRegressor
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error

# # Initialize the CatBoost model with GPU support and early stopping
# model = CatBoostRegressor(
#     iterations=5000,  # Increased iterations to allow finer learning
#     depth=8,
#     learning_rate=0.01,  # Reduced learning rate for gradual learning
#     loss_function='RMSE',
#     cat_features=['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',
#                   'Policy Type', 'Customer Feedback', 'year', 'month', 'day', 'dayofweek',
#                   'Smoking Status', 'Exercise Frequency', 'Property Type'],
#     task_type='GPU',   # Enable GPU
#     devices='0'        # Use GPU device 0 (adjust if needed for multiple GPUs)
# )

# # Train the model with early stopping
# model.fit(
#     X_train,
#     y_train,
#     eval_set=(X_test, y_test),
#     verbose=100,
#     early_stopping_rounds=50  # Stop if no improvement for 50 iterations
# )

# # Make predictions
# y_pred = model.predict(X_test)

# # Evaluate the model
# rmse = mean_squared_error(y_test, y_pred, squared=False)
# print(f"RMSE: {rmse}")


In [43]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import mean_squared_error, mean_squared_log_error

# Set random seed
np.random.seed(42)

# Assume train_df and test_df are your DataFrames
df = train_data

# Features and target
X = df.drop('Premium Amount', axis=1)
y = df['Premium Amount']

# Log transform the target
y_log = np.log1p(y)

# Stratify target by binning into discrete intervals
binner = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
y_binned_log = binner.fit_transform(y_log.values.reshape(-1, 1)).astype(int).ravel()

# Stratified K-Fold
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Arrays to store predictions
oof_predictions = np.zeros(len(X))  # Out-of-fold predictions
test_predictions = np.zeros(len(X))  # Test predictions for each fold

# Test set
X_test = test_data
test_preds_per_fold = np.zeros((len(X_test), n_splits))  # Store test predictions per fold
fold_oof_results = []  # To store ID, target, and OOF predictions

# CatBoost Regressor parameters
model_params = {
    'iterations': 6400,
    'depth': 9,
    'learning_rate': 0.012732653837375818,
    'l2_leaf_reg': 4.075927894066206e-05,
    'bagging_temperature': 0.2272504557883846,
    'random_strength': 0.5498513441824493,
    'loss_function': 'RMSE',
    'cat_features': ['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',
                     'Policy Type', 'Customer Feedback', 'year', 'month', 'day', 'dayofweek',
                     'Smoking Status', 'Exercise Frequency', 'Property Type'],
    'task_type': 'GPU',
    'devices': '0'
}

# Cross-validation loop
for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y_binned_log)):
    print(f"Fold {fold + 1}/{n_splits}")
    
    # Split data
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y_log.iloc[train_idx], y_log.iloc[valid_idx]
    
    # Model
    model = CatBoostRegressor(**model_params)
    model.fit(
        X_train, y_train,
        eval_set=(X_valid, y_valid),
        verbose=100,
        early_stopping_rounds=50
    )
    
    # Predictions (log transformed)
    log_oof_preds = model.predict(X_valid)
    
    # Revert log transformation
    oof_predictions[valid_idx] = np.expm1(log_oof_preds)

    # RMSLE for the fold
    fold_rmsle = np.sqrt(mean_squared_log_error(y.iloc[valid_idx], oof_predictions[valid_idx]))
    print(f"Fold {fold + 1} RMSLE: {fold_rmsle:.4f}")
    
    # Store fold results
    fold_result = pd.DataFrame({
        'ID': X.index[valid_idx],
        'Actual': y.iloc[valid_idx],
        'OOF_Pred_CatBoost': oof_predictions[valid_idx],
        'Fold': fold + 1
    })
    fold_oof_results.append(fold_result)
    
    # Test set predictions
    log_test_preds = model.predict(X_test)
    test_preds_per_fold[:, fold] = np.expm1(log_test_preds)

# Combine fold results
oof_results_df = pd.concat(fold_oof_results, axis=0, ignore_index=True)

# Average predictions on test data
final_test_predictions = test_preds_per_fold.mean(axis=1)

# Evaluate OOF predictions
oof_mse = mean_squared_error(y, oof_predictions)
oof_rmsle = np.sqrt(mean_squared_log_error(y, oof_predictions))

print(f"OOF Mean Squared Error: {oof_mse:.4f}")
print(f"OOF Root Mean Squared Log Error: {oof_rmsle:.4f}")

# Output predictions
print("Final Test Predictions:", final_test_predictions)
print(oof_results_df.head())


Fold 1/5
0:	learn: 1.0946168	test: 1.0954225	best: 1.0954225 (0)	total: 124ms	remaining: 13m 15s
100:	learn: 1.0544934	test: 1.0559162	best: 1.0559162 (100)	total: 12.2s	remaining: 12m 41s
200:	learn: 1.0489052	test: 1.0506382	best: 1.0506382 (200)	total: 24.4s	remaining: 12m 32s
300:	learn: 1.0474414	test: 1.0494945	best: 1.0494945 (300)	total: 36.4s	remaining: 12m 17s
400:	learn: 1.0466896	test: 1.0491481	best: 1.0491481 (400)	total: 48.3s	remaining: 12m 2s
500:	learn: 1.0460580	test: 1.0489250	best: 1.0489242 (497)	total: 59.9s	remaining: 11m 45s
600:	learn: 1.0454614	test: 1.0487352	best: 1.0487352 (600)	total: 1m 11s	remaining: 11m 29s
700:	learn: 1.0449359	test: 1.0486427	best: 1.0486427 (699)	total: 1m 22s	remaining: 11m 13s
800:	learn: 1.0444420	test: 1.0485778	best: 1.0485762 (795)	total: 1m 34s	remaining: 10m 58s
900:	learn: 1.0439505	test: 1.0485338	best: 1.0485338 (900)	total: 1m 45s	remaining: 10m 43s
1000:	learn: 1.0434111	test: 1.0484692	best: 1.0484692 (999)	total: 1m 5

CatBoostError: Invalid type for cat_feature[non-default value idx=0,feature_idx=1]=28.0 : cat_features must be integer or string, real number values and NaN values should be converted to string.

In [None]:
# import pandas as pd
# from catboost import CatBoostRegressor
# from sklearn.model_selection import KFold
# from sklearn.metrics import mean_squared_error

# # Initialize the CatBoost model with GPU support and best hyperparameters from tuning
# model = CatBoostRegressor(
#     iterations=6400,  # Best iteration count from tuning
#     depth=9,  # Best depth from tuning
#     learning_rate=0.012732653837375818,  # Best learning rate from tuning
#     l2_leaf_reg=4.075927894066206e-05,  # Best l2_leaf_reg from tuning
#     bagging_temperature=0.2272504557883846,  # Best bagging_temperature from tuning
#     random_strength=0.5498513441824493,  # Best random_strength from tuning
#     loss_function='RMSE',
#     cat_features=['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',
#                   'Policy Type', 'Customer Feedback', 'year', 'month', 'day', 'dayofweek',
#                   'Smoking Status', 'Exercise Frequency', 'Property Type'],
#     task_type='GPU',   # Enable GPU
#     devices='0'        # Use GPU device 0 (adjust if needed for multiple GPUs)
# )

# # K-fold cross-validation setup
# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# # Store RMSE for each fold
# rmse_scores = []

# for train_index, val_index in kf.split(X_train):
#     X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
#     y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
#     # Train the model on the current fold
#     model.fit(
#         X_train_fold,
#         y_train_fold,
#         eval_set=(X_val_fold, y_val_fold),
#         verbose=100,
#         early_stopping_rounds=50  # Stop if no improvement for 50 iterations
#     )
    
#     # Make predictions on the validation set
#     y_pred = model.predict(X_val_fold)
    
#     # Calculate RMSE for the current fold
#     rmse = mean_squared_error(y_val_fold, y_pred, squared=False)
#     rmse_scores.append(rmse)

# # Calculate average RMSE across all folds
# average_rmse = sum(rmse_scores) / len(rmse_scores)
# print(f"Average RMSE across all folds: {average_rmse}")


In [None]:
# Preprocess test data
test_features = test_data.drop(columns=['id'], errors='ignore')  # Drop unnecessary columns

In [None]:
# Predict using the trained CatBoost model
test_data['Premium Amount'] = model.predict(test_features)

In [None]:
# Reverse log transformation if applied during training
test_data['Premium Amount'] = np.expm1(test_data['Premium Amount'])  # Use np.expm1 if np.log1p was used during training

# Clip predictions to avoid negative values (optional)
test_data['Premium Amount'] = test_data['Premium Amount'].clip(lower=0)

In [None]:
# Create submission file
submission = test_data[['id', 'Premium Amount']]  # Include 'id' and the predicted target column
submission.to_csv('submission.csv', index=False)

print("Submission file created: submission.csv")
print(submission.head())
oof_results_df.to_csv('OOF_Pred_cat',index = False)


# sub = pd.read_csv('/kaggle/input/playground-series-s4e12/sample_submission.csv')
# sub['Premium Amount'] = final_test_predictions
# sub.to_csv('lgsubmission.csv', index=False)
# sub.head()
# oof_results_df.to_csv('oof_lgbm.csv',index = False)

In [None]:
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_log_error
# from catboost import CatBoostRegressor
# import optuna

# # Log transform function
# def log_transform(y):
#     return np.log1p(y)  # log(1+y) to handle zero values

# # Inverse log transform function
# def inverse_log_transform(y_log):
#     return np.expm1(y_log)  # exp(y) - 1 to revert log1p

# # Prepare data
# X = train_data.drop('Premium Amount', axis=1)
# y = train_data['Premium Amount']

# # Log transform the target variable
# y_log = log_transform(y)

# # Assuming all columns except 'Premium Amount' are numerical; update this as needed.
# categorical_features = [i for i, col in enumerate(X.columns) if X[col].dtype == 'object']

# def objective(trial):
#     # Hyperparameter space to search
#     params = {
#         "iterations": trial.suggest_int("iterations", 100, 10000, step=100),
#         "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
#         "depth": trial.suggest_int("depth", 3, 10),
#         "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 10.0, log=True),
#         "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
#         "random_strength": trial.suggest_float("random_strength", 0.0, 10.0),
#         "task_type": "GPU",  # Use GPU for training
#         "devices": "0",  # Use first GPU (Kaggle provides a single GPU, so "0" is the correct option)
#         "eval_metric": "MSLE",  # Use MSLE instead of RMSLE
#         "loss_function": "RMSE",  # Retaining RMSE for training
#         "early_stopping_rounds": 50,
#         "cat_features": categorical_features
#     }
    
#     # Split data into train and validation sets
#     X_train, X_valid, y_train, y_valid = train_test_split(X, y_log, test_size=0.2, random_state=42)
    
#     # Initialize the model with the given parameters
#     model = CatBoostRegressor(**params)
    
#     # Fit the model on log-transformed target
#     model.fit(
#         X_train, y_train,
#         eval_set=[(X_valid, y_valid)],
#         verbose=False
#     )
    
#     # Predict on the validation set and inverse transform
#     y_pred_log = model.predict(X_valid)
#     y_pred = inverse_log_transform(y_pred_log)
#     y_valid_original = inverse_log_transform(y_valid)
    
#     # Calculate RMSLE on original scale
#     score = np.sqrt(mean_squared_log_error(y_valid_original, y_pred))
    
#     return score

# # Optuna Study for Hyperparameter Tuning
# study = optuna.create_study(direction="minimize")  # We want to minimize RMSLE
# study.optimize(objective, n_trials=50)

# # Print the best parameters and score
# print("Best RMSLE:", study.best_value)
# print("Best Hyperparameters:", study.best_params)
