In [None]:
import os
import pathlib

import joblib
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor


In [None]:
# Read the three raw CSV files: training table, test table and the sample submission.
train = pd.read_csv(r"Z:\smart_premium\data\raw\train.csv")
test = pd.read_csv(r"Z:\smart_premium\data\raw\test.csv")
sample_submission = pd.read_csv(r"Z:\smart_premium\data\raw\sample_submission.csv")

In [None]:
def add_policy_date_features(frame):
    """Return ``frame`` with 'Policy Start Date' expanded into calendar features.

    Adds 'Policy Start Year', 'Policy Start Month', 'Policy Start Day' and
    'Day of week' (pandas convention: Monday=0) and drops the original
    'Policy Start Date' column. The input frame is not modified.

    A frame without a 'Policy Start Date' column is returned unchanged, which
    makes re-running this cell harmless.
    """
    if 'Policy Start Date' not in frame.columns:
        return frame

    start = pd.to_datetime(frame['Policy Start Date'])
    parts = {
        'Policy Start Year': start.dt.year,
        'Policy Start Month': start.dt.month,
        'Policy Start Day': start.dt.day,
        'Day of week': start.dt.dayofweek,
    }
    # pandas >= 2.0 returns int32 from the .dt accessors, but the numeric
    # column selection later in this notebook only includes int64/float64.
    # Normalise the dtype here so the engineered features are not silently
    # left out of the model (float64 when missing dates produced NaN).
    parts = {
        name: values.astype('int64' if values.notna().all() else 'float64')
        for name, values in parts.items()
    }
    return frame.drop(columns=['Policy Start Date']).assign(**parts)


train = add_policy_date_features(train)
# The test set needs exactly the same engineered columns as the training set;
# otherwise the fitted pipeline is asked to predict on a frame that lacks them.
test = add_policy_date_features(test)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Target vector, then the feature matrix: everything except the row id,
# the target itself and the 'Customer Feedback' column.
y = train['Premium Amount']
X = train.drop(['id', 'Premium Amount', 'Customer Feedback'], axis=1)

In [None]:
# List the feature names that go into the model.
print(list(X.columns))

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

In [None]:
# Text dump of the training features (pandas truncates long frames in its repr).
print(X_train)

In [None]:
# Partition the feature columns by dtype. Only int64/float64 columns are
# treated as numeric and only object/category columns as categorical; a column
# of any other dtype appears in neither list and is therefore left out by the
# ColumnTransformer below (its default is remainder='drop').
num_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(X.select_dtypes(include=['object', 'category']).columns)

# Numeric branch: fill gaps with the column mean, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
col_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', col_pipeline, cat_cols),
])

In [None]:
# Candidate regressors, keyed by the display name used for the MLflow run.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=30),
    'Random Forest': RandomForestRegressor(
        n_estimators=100,
        random_state=30,
        n_jobs=-1,
    ),
    'XGBoost': XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=30,
        n_jobs=-1,
        verbosity=0,
    ),
}

In [None]:
# =========================
# 3. MLflow setup
# =========================
# Runs are stored in a local file-based tracking directory.
tracking_path = pathlib.Path(r"Z:\smart_premium\mlruns").absolute()
mlflow.set_tracking_uri(f"file:///{tracking_path.as_posix()}")
mlflow.set_experiment("Insurance_Premium_Prediction")

# =========================
# 4. Local save path
# =========================
local_model_dir = r"Z:\smart_premium\models"
os.makedirs(local_model_dir, exist_ok=True)

# =========================
# 5. Best model tracking
# =========================
best_model_name = None
best_model = None
best_rmse = float('inf')

# =========================
# 6. Training loop
# =========================
# One MLflow run per candidate: fit on the training split, score on the
# validation split, log metrics and the fitted pipeline, keep the lowest RMSE.
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Pipeline does not copy its steps. Cloning gives every run its own
        # unfitted preprocessor and estimator, so fitting one pipeline never
        # overwrites state held by another one (or by the `models` dict).
        pipe = Pipeline(steps=[
            ('preprocessor', clone(preprocessor)),
            ('model', clone(model)),
        ])

        # Fit on training data
        pipe.fit(X_train, y_train)

        # Evaluate on validation set
        y_pred = pipe.predict(X_val)
        mae = mean_absolute_error(y_val, y_pred)
        mse = mean_squared_error(y_val, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_val, y_pred)

        # Log params & metrics to MLflow
        mlflow.log_param('model_name', model_name)
        mlflow.log_metrics({'mae': mae, 'rmse': rmse, 'r2': r2})

        # Log the fitted pipeline; the example rows let MLflow record an input signature.
        input_example = X_train.iloc[:5]
        mlflow.sklearn.log_model(pipe, name='model', input_example=input_example)

        print(f"{model_name} --> RMSE: {rmse:.2f}, R²: {r2:.3f}, MAE: {mae:.2f}")

        # Track best model (a NaN RMSE never compares as smaller, so it is skipped)
        if rmse < best_rmse:
            best_rmse = rmse
            best_model_name = model_name
            best_model = pipe

# Fail with a clear message instead of an AttributeError on `None.fit` below.
if best_model is None:
    raise RuntimeError(
        "No candidate model produced a finite validation RMSE; nothing to retrain or save."
    )

# =========================
# 7. Retrain best model on full dataset
# =========================
print(f"\n🎯 Best model: {best_model_name} with RMSE {best_rmse:.2f}")

# Use all training + validation data for final training
X_full = pd.concat([X_train, X_val])
y_full = pd.concat([y_train, y_val])
best_model.fit(X_full, y_full)

# Save final best model
final_model_path = os.path.join(local_model_dir, "best_model.pkl")
joblib.dump(best_model, final_model_path)
print(f"✅ Final best model saved at: {final_model_path}")

# =========================
# 8. Prepare submission
# =========================
save_dir = r"Z:\smart_premium\data\processed"
os.makedirs(save_dir, exist_ok=True)

# The pipeline selects its input columns by name, so the test frame must carry
# every column the preprocessor was fitted on.
X_test = test.drop(['id'], axis=1)
test_preds = best_model.predict(X_test)

submission = pd.DataFrame({
    'id': test['id'],
    'Premium Amount': test_preds
})

submission_file = os.path.join(save_dir, "submission.csv")
submission.to_csv(submission_file, index=False)
print(f"✅ Submission file saved at: {submission_file}")


In [None]:
mlflow ui

In [None]:
# Input column names recorded at fit time by `pipe`, i.e. the pipeline built
# in the last iteration of the training loop.
print(pipe.feature_names_in_)