In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer

In [2]:
train_df = pd.read_csv('/kaggle/input/playground-series-s4e12/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s4e12/test.csv')

In [3]:
def date_trans(df):
    df['Policy Start Date']= pd.to_datetime(df['Policy Start Date'])
    df['Year'] = df['Policy Start Date'].dt.year
    df['Day'] = df['Policy Start Date'].dt.day
    df['Month'] = df['Policy Start Date'].dt.month
    df['DayOfWeek'] = df['Policy Start Date'].dt.dayofweek
    df.drop('Policy Start Date' , axis =1, inplace = True)
    return df

In [4]:
train_df = date_trans(train_df)
test_df = date_trans(test_df)

In [5]:
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))


In [6]:
for col in train_df.select_dtypes(include='object').columns:
    train_df[col] = train_df[col].astype('category')
for col in test_df.select_dtypes(include='object').columns:
    test_df[col] = test_df[col].astype('category')

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def log_transform(y):
    """
    Apply log transformation safely, handling zero and negative values
    """
    return np.log1p(y)

def inverse_log_transform(y_log):
    """
    Revert log transformation
    """
    return np.expm1(y_log)

def rmsle(y_true, y_pred):
    """
    Calculate Root Mean Squared Logarithmic Error
    """
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Example dataset (replace with your dataset)
np.random.seed(42)
df = train_df

# Find the target column (assumes it contains 'Premium' or 'Amount')
target_column = [col for col in df.columns if 'premium' in col.lower() or 'amount' in col.lower()]
if not target_column:
    raise ValueError("Could not find target column. Please specify the column name for premium/amount.")
target_column = target_column[0]

# Identify column types
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Remove target variable from features
if target_column in numeric_features:
    numeric_features.remove(target_column)
if target_column in categorical_features:
    categorical_features.remove(target_column)

# Features and target
X = df.drop(columns=[target_column])
y = df[target_column]

# Log transform the target variable
y_log = log_transform(y)

# Stratify target by binning into discrete intervals
binner = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
y_binned = binner.fit_transform(y_log.values.reshape(-1, 1)).astype(int).ravel()

# Test set (replace with your actual test set)
X_test = test_df

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# LinearRegression model parameters (no hyperparameters to tune)
model = LinearRegression()

# Stratified K-Fold
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Arrays to store predictions
oof_predictions_log = np.zeros(len(X))  # Out-of-fold predictions in log space
oof_predictions = np.zeros(len(X))  # Out-of-fold predictions in original space

# Prepare for cross-validation
test_preds_per_fold_log = np.zeros((len(X_test), n_splits))  # Store test predictions per fold in log space
test_preds_per_fold = np.zeros((len(X_test), n_splits))  # Store test predictions per fold in original space
fold_oof_results = []  # To store ID, target, and OOF predictions

# Cross-validation loop
for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y_binned)):
    print(f"Fold {fold + 1}/{n_splits}")
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_log, y_valid_log = y_log.iloc[train_idx], y_log.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    
    # Model pipeline with preprocessing and regressor
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Fit the model
    pipeline.fit(X_train, y_train_log)
    
    # Predictions in log space
    oof_predictions_log[valid_idx] = pipeline.predict(X_valid)
    
    # Convert log predictions back to original space
    oof_predictions[valid_idx] = inverse_log_transform(oof_predictions_log[valid_idx])

    fold_rmsle = rmsle(y.iloc[valid_idx], oof_predictions[valid_idx])
    print(f"Fold {fold + 1} RMSLE: {fold_rmsle:.4f}")
    
    # Store fold results with ID, target, and OOF predictions
    fold_result = pd.DataFrame({
        'ID': X.index[valid_idx],       # Assuming X has an index as IDs
        'Actual': y.iloc[valid_idx],    # Actual target values
        'OOF_Pred_LR': oof_predictions[valid_idx],    # OOF predictions
        'Fold': fold + 1               # Fold number
    })
    fold_oof_results.append(fold_result)
    
    # Test set predictions for this fold
    test_preds_per_fold_log[:, fold] = pipeline.predict(X_test)
    test_preds_per_fold[:, fold] = inverse_log_transform(test_preds_per_fold_log[:, fold])

# Combine OOF results
oof_results_df = pd.concat(fold_oof_results, axis=0, ignore_index=True)

# Average predictions on test data
final_test_predictions = test_preds_per_fold.mean(axis=1)

# Evaluate OOF predictions
oof_mse = mean_squared_error(y, oof_predictions)
oof_rmsle = np.sqrt(mean_squared_log_error(y, oof_predictions))

print(f"OOF Mean Squared Error: {oof_mse:.4f}")
print(f"OOF Root Mean Squared Log Error: {oof_rmsle:.4f}")

print("Final Test Predictions:", final_test_predictions)
print(oof_results_df.head())


sub = pd.read_csv('/kaggle/input/playground-series-s4e12/sample_submission.csv')
sub['Premium Amount'] = final_test_predictions
sub.to_csv('lrsubmission.csv', index=False)
sub.head()
oof_results_df.to_csv('oof_lr.csv',index = False)
sub.head()

Fold 1/5
Fold 1 RMSLE: 1.0891
Fold 2/5
Fold 2 RMSLE: 1.0878
Fold 3/5
Fold 3 RMSLE: 1.0882
Fold 4/5
Fold 4 RMSLE: 1.0885
Fold 5/5
Fold 5 RMSLE: 1.0884
OOF Mean Squared Error: 884219.2629
OOF Root Mean Squared Log Error: 1.0884
Final Test Predictions: [907.22219971 582.83524658 674.65823017 ... 707.11061863 751.87394019
 737.56303972]
   ID  Actual  OOF_Pred_LR  Fold
0   4  2022.0   667.492085     1
1  15   849.0   779.015390     1
2  21  2670.0   654.119606     1
3  30   641.0   789.055631     1
4  34  2152.0   649.752725     1


Unnamed: 0,id,Premium Amount
0,1200000,907.2222
1,1200001,582.835247
2,1200002,674.65823
3,1200003,643.867759
4,1200004,696.832252


In [10]:
# sub = pd.read_csv('/kaggle/input/playground-series-s4e12/sample_submission.csv')
# sub['Premium Amount'] = final_test_predictions
# sub.to_csv('lgsubmission.csv', index=False)
# sub.head()
# oof_results_df.to_csv('oof_lgbm.csv',index = False)
# sub.head()

sub = pd.read_csv('/kaggle/input/playground-series-s4e12/sample_submission.csv')
sub['Premium Amount'] = final_test_predictions
sub.to_csv('lrsubmission.csv', index=False)
sub.head()
oof_results_df.to_csv('oof_lr.csv',index = False)
sub.head()

Unnamed: 0,id,Premium Amount
0,1200000,907.2222
1,1200001,582.835247
2,1200002,674.65823
3,1200003,643.867759
4,1200004,696.832252


In [9]:
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_log_error
# from catboost import CatBoostRegressor
# import optuna

# # Log transform function
# def log_transform(y):
#     return np.log1p(y)  # log(1+y) to handle zero values

# # Inverse log transform function
# def inverse_log_transform(y_log):
#     return np.expm1(y_log)  # exp(y) - 1 to revert log1p

# # Prepare data
# X = train_data.drop('Premium Amount', axis=1)
# y = train_data['Premium Amount']

# # Log transform the target variable
# y_log = log_transform(y)

# # Assuming all columns except 'Premium Amount' are numerical; update this as needed.
# categorical_features = [i for i, col in enumerate(X.columns) if X[col].dtype == 'object']

# def objective(trial):
#     # Hyperparameter space to search
#     params = {
#         "iterations": trial.suggest_int("iterations", 100, 10000, step=100),
#         "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
#         "depth": trial.suggest_int("depth", 3, 10),
#         "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 10.0, log=True),
#         "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
#         "random_strength": trial.suggest_float("random_strength", 0.0, 10.0),
#         "task_type": "GPU",  # Use GPU for training
#         "devices": "0",  # Use first GPU (Kaggle provides a single GPU, so "0" is the correct option)
#         "eval_metric": "MSLE",  # Use MSLE instead of RMSLE
#         "loss_function": "RMSE",  # Retaining RMSE for training
#         "early_stopping_rounds": 50,
#         "cat_features": categorical_features
#     }
    
#     # Split data into train and validation sets
#     X_train, X_valid, y_train, y_valid = train_test_split(X, y_log, test_size=0.2, random_state=42)
    
#     # Initialize the model with the given parameters
#     model = CatBoostRegressor(**params)
    
#     # Fit the model on log-transformed target
#     model.fit(
#         X_train, y_train,
#         eval_set=[(X_valid, y_valid)],
#         verbose=False
#     )
    
#     # Predict on the validation set and inverse transform
#     y_pred_log = model.predict(X_valid)
#     y_pred = inverse_log_transform(y_pred_log)
#     y_valid_original = inverse_log_transform(y_valid)
    
#     # Calculate RMSLE on original scale
#     score = np.sqrt(mean_squared_log_error(y_valid_original, y_pred))
    
#     return score

# # Optuna Study for Hyperparameter Tuning
# study = optuna.create_study(direction="minimize")  # We want to minimize RMSLE
# study.optimize(objective, n_trials=50)

# # Print the best parameters and score
# print("Best RMSLE:", study.best_value)
# print("Best Hyperparameters:", study.best_params)
