### 1. Libraries and SageMaker/S3 variables

In [2]:
!pip install pdfkit scikit-learn==1.0.2
!pip install -U sagemaker

Collecting pdfkit
  Downloading pdfkit-1.0.0-py3-none-any.whl (12 kB)
Collecting scikit-learn==1.0.2
  Downloading scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.8/24.8 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting threadpoolctl>=2.0.0 (from scikit-learn==1.0.2)
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: pdfkit, threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.1
    Uninstalling sc

In [3]:
import pandas as pd
import numpy as np
import boto3
import sagemaker
import json
import joblib
import argparse
# from sagemaker.xgboost.estimator import XGBoost
from sagemaker.inputs import TrainingInput
from sagemaker.image_uris import retrieve
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
pd.set_option("display.max_columns", 500)

from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.parameters import ( 
 ParameterInteger, 
 ParameterString, 
 ParameterFloat)

from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)
from sagemaker.workflow.steps import TuningStep
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, SplineTransformer

In [4]:
# SageMaker session, AWS region, execution role and S3 client used by the cells below
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
sagemaker_role = sagemaker.get_execution_role()
s3_client = boto3.client(service_name="s3", region_name=region)



### 2. Configuration

In [7]:
# S3 locations: "<default bucket>/<project prefix>" is used for both reading and writing
read_bucket = f"{sagemaker_session.default_bucket()}/frenchtpl/sktweedie-hptune"
write_bucket = f"{sagemaker_session.default_bucket()}/frenchtpl/sktweedie-hptune"

# Training / validation CSVs (written by the data-preparation cell further down)
train_data_uri = "s3://" + read_bucket + "/data/train.csv"
validation_data_uri = "s3://" + read_bucket + "/data/validation.csv"

# Destination passed to the estimator as its output path
output_path = "s3://" + write_bucket + "/output"

In [8]:
# Column roles in the modelling dataset
kvar = "IDpol"            # key variable (policy identifier)
wvar = "Exposure"         # exposure; the denominator of the target
yvar_wtd = "ClaimAmount"  # claim amount; the numerator of the target
yvar = "PurePremium"      # target: claim amount divided by exposure
xvars = [
    "Area",
    "VehPower",
    "VehAge",
    "DrivAge",
    "BonusMalus",
    "VehBrand",
    "VehGas",
    "Density",
    "Region",
]

In [9]:
## Notebook-level settings, declared through argparse
parser = argparse.ArgumentParser()
for flag, flag_type, flag_default in (
    ("--seed", int, 42),           # random seed
    ("--test_size", float, 0.2),   # fraction of rows held out as validation data
    ("--degree", int, 2),          # b-spline transformation: polynomial degree
    ("--n_knots", int, 3),         # b-spline transformation: number of knots
):
    parser.add_argument(flag, type=flag_type, default=flag_default)

# parse_known_args tolerates unrecognised argv entries instead of exiting on them;
# they are returned separately in `unknown`
args, unknown = parser.parse_known_args()

### 3. Data

In [10]:
def load_mtpl2(n_samples=None):
    """Fetch the French Motor Third-Party Liability Claims dataset.

    Downloads the claim-count table (OpenML id 41214) and the claim-severity
    table (OpenML id 41215), sums the claim amounts per policy, left-joins
    them onto the claim-count table and derives the ``PurePremium``,
    ``Frequency`` and ``AvgClaimAmount`` columns.

    Parameters
    ----------
    n_samples: int, default=None
      number of samples to select (for faster run time). Full dataset has
      678013 samples.

    Returns
    -------
    pandas.DataFrame
      The merged table indexed by ``IDpol``, truncated to the first
      ``n_samples`` rows when ``n_samples`` is given.
    """
    # claim count data
    df_freq = fetch_openml(data_id=41214, as_frame=True).data
    df_freq["IDpol"] = df_freq["IDpol"].astype(int)
    df_freq.set_index("IDpol", inplace=True)

    # severity data
    df_sev = fetch_openml(data_id=41215, as_frame=True).data
    df_sev["IDpol"] = df_sev["IDpol"].astype(int)
    df_sev = df_sev.groupby("IDpol").sum()  # sum ClaimAmount over identical IDs

    # merge claim count and severity; policies without any claim get amount 0
    df = df_freq.join(df_sev, how="left")
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: the chained in-place form is deprecated and does not
    # update `df` when pandas Copy-on-Write is enabled.
    df["ClaimAmount"] = df["ClaimAmount"].fillna(0)

    # Insurances companies are interested in modeling the Pure Premium, that is
    # the expected total claim amount per unit of exposure for each policyholder
    # in their portfolio:
    df["PurePremium"] = df["ClaimAmount"] / df["Exposure"]

    # This can be indirectly approximated by a 2-step modeling: the product of the
    # Frequency times the average claim amount per claim:
    df["Frequency"] = df["ClaimNb"] / df["Exposure"]
    # fmax(..., 1) avoids dividing by zero for policies with no claims
    df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1)

    # unquote string fields
    for column_name in df.columns[df.dtypes.values == object]:
        df[column_name] = df[column_name].str.strip("'")
    return df.iloc[:n_samples]

def preprocessing(args_, df_, xvars_):
    """Build the model feature matrix from the raw explanatory columns.

    Categorical / object columns are one-hot encoded into ``<col>_<level>``
    columns. Every other column has its missing values replaced by 0, is
    min-max scaled and is then expanded with a B-spline basis into
    ``<col>_sp<i>`` columns.

    Parameters
    ----------
    args_ : object with ``degree`` and ``n_knots`` attributes
        Settings for the spline expansion (e.g. an ``argparse.Namespace``).
    df_ : pandas.DataFrame
        Source data containing every column named in ``xvars_``.
    xvars_ : iterable of str
        Names of the columns to transform, in output order.

    Returns
    -------
    pandas.DataFrame
        The transformed features with a positional (0..n-1) index; an empty
        frame when ``xvars_`` is empty.
    """
    blocks = []

    for col in xvars_:
        column = df_[col]

        # one-hot encoding
        if isinstance(column.dtype, pd.CategoricalDtype) or column.dtype == 'object':
            block = pd.get_dummies(column).reset_index(drop=True)
            block.columns = [f"{col}_{level}" for level in block.columns]

        # others - minmax at 0-1, and b-spline
        else:
            scaled = MinMaxScaler().fit_transform(
                np.array(column.fillna(0)).reshape(-1, 1)
            )
            # Use the settings passed in as `args_`, not a notebook-level global
            spline = SplineTransformer(degree=args_.degree, n_knots=args_.n_knots)
            basis = spline.fit_transform(scaled)
            block = pd.DataFrame(
                basis, columns=[f"{col}_sp{str(i)}" for i in range(basis.shape[1])]
            )

        blocks.append(block)

    # Concatenate once at the end instead of re-copying the frame on every iteration
    if not blocks:
        return pd.DataFrame()
    return pd.concat(blocks, axis=1)

In [12]:
# Fetch the policies and turn the IDpol index back into a regular column
df = load_mtpl2().reset_index()

# Preprocessing: transformed features plus the weight and target columns
data = preprocessing(args, df, xvars)
for passthrough_col in (wvar, yvar):
    data[passthrough_col] = df[passthrough_col]

# Train / validation split
train, validation = train_test_split(
    data,
    random_state=args.seed,
    test_size=args.test_size,
)

# Save both splits as CSV (without the index column) to their S3 locations
train.to_csv(train_data_uri, index=False)
validation.to_csv(validation_data_uri, index=False)

### 4. Training Script

In [13]:
%%writefile sktweedie_train.py

import argparse
import os
import joblib
import json
import pandas as pd
import numpy as np
import math
# import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import TweedieRegressor
from sklearn.model_selection import GridSearchCV

def rmse(true, pred, weight):
    """Return the weighted root-mean-squared error of ``pred`` against ``true``.

    ``weight`` holds one weight per observation and is used to average the
    squared errors along axis 0 before the square root is taken.
    """
    squared_error = (true - pred) ** 2
    weighted_mse = np.average(squared_error, weights=weight, axis=0)
    return math.sqrt(weighted_mse)

# Entry point: fit a TweedieRegressor for one (alpha, power) pair, report the
# weighted train/validation RMSE and the cross-validated score, then save the
# metrics and the fitted model.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Cross-validation settings and model hyperparameters
    parser.add_argument("--nfolds", type=int, default=3)
    parser.add_argument("--scoring", type=str, default='neg_root_mean_squared_error')
    parser.add_argument("--alpha", type=float, default=0.1)
    parser.add_argument("--power", type=float, default=1.5)

    # SageMaker specific arguments. Defaults are set in the environment variables
    # Location of input training data
    parser.add_argument("--train_data_dir", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    # Location of input validation data
    parser.add_argument("--validation_data_dir", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION"))
    # Location where trained model will be stored. Default set by SageMaker, /opt/ml/model
    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    # Location where model artifacts will be stored. Default set by SageMaker, /opt/ml/output/data
    parser.add_argument("--output_data_dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR"))

    args = parser.parse_args()

    # Target and sample-weight columns; every remaining column is a feature
    yvar = "PurePremium"
    wvar = 'Exposure'

    data_train = pd.read_csv(f"{args.train_data_dir}/train.csv")
    X_train = data_train.drop([yvar, wvar], axis=1)
    y_train = data_train[yvar]
    w_train = data_train[wvar]

    data_validation = pd.read_csv(f"{args.validation_data_dir}/validation.csv")
    X_valid = data_validation.drop([yvar, wvar], axis=1)
    y_valid = data_validation[yvar]
    w_valid = data_validation[wvar]

    # set model and run
    # The grid holds a single (alpha, power) combination: GridSearchCV is used
    # here to obtain a cross-validated score for that combination.
    params = {
        "alpha": [args.alpha],
        "power": [args.power]
    }
    model = TweedieRegressor(link='log', max_iter=500)
    grid = GridSearchCV(
        estimator = model,
        param_grid = params,
        cv = args.nfolds,
        n_jobs = -1,
        scoring = args.scoring,
    )
    grid.fit(X_train, y_train, sample_weight=w_train)

    # best model
    best_model = grid.best_estimator_

    pred_train = best_model.predict(X_train)
    pred_valid = best_model.predict(X_valid)

    rmse_train = rmse(y_train, pred_train, w_train)
    rmse_valid = rmse(y_valid, pred_valid, w_valid)

    # The default scorer is a negated error, so flip the sign back
    cv_rmse = grid.best_score_*-1

    # These lines are emitted in a fixed "[0]#011<name>:<value>" format so that
    # a metric regex can pick the values out of the job log
    print(f"[0]#011train-rmse:{rmse_train:.2f}")
    print(f"[0]#011validation-rmse:{rmse_valid:.2f}")
    print(f"[0]#011cv-rmse:{cv_rmse:.2f}")

    metrics_data = {
                    "metrics": {
                                "train:rmse": {"value": rmse_train},
                                "validation:rmse": {"value": rmse_valid},
                                "cv:rmse": {"value": cv_rmse},
                                }
                   }

    # Make sure both output directories exist before writing, so a finished
    # training run is not lost to a missing directory (no-op when they exist)
    os.makedirs(args.output_data_dir, exist_ok=True)
    os.makedirs(args.model_dir, exist_ok=True)

    # Save the evaluation metrics to the location specified by output_data_dir
    metrics_location = os.path.join(args.output_data_dir, "metrics.json")

    # Save the model to the location specified by model_dir
    model_location = os.path.join(args.model_dir, "sktweedie-model")

    with open(metrics_location, "w") as f:
        json.dump(metrics_data, f)

    with open(model_location, "wb") as f:
        joblib.dump(best_model, f)

Writing sktweedie_train.py


### 5. Setting up hyperparameter tuning

In [14]:
# Settings for the SKLearn estimator whose entry point is sktweedie_train.py
estimator_settings = dict(
    entry_point="sktweedie_train.py",
    framework_version="1.2-1",
    instance_type="ml.m5.xlarge",
    instance_count=1,
    role=sagemaker_role,
    sagemaker_session=sagemaker_session,
    output_path=output_path,
    base_job_name="frenchtpl-sktweedie-hptune",
)
sklearn = SKLearn(**estimator_settings)



In [15]:
# Search space for the two hyperparameters accepted by the training script
hyperparameter_ranges = {
    "alpha": ContinuousParameter(0, 1),
    "power": ContinuousParameter(1.5, 1.9)
}

# Objective metric, extracted from the "[0]#011cv-rmse:<value>" line printed by
# the training script. The pattern is a raw string so that the regex escapes
# (\[ \] \.) are not treated as (invalid) Python string escapes.
objective_metric_name = "cv-rmse"
metric_definitions = [{'Name': objective_metric_name,
                       'Regex': r'.*\[[0-9]+\].*#011cv-rmse:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'}]

# Random search: 5 training jobs in total, at most 2 running at once,
# minimizing the objective metric
tuner = HyperparameterTuner(
    estimator = sklearn,
    objective_metric_name = objective_metric_name,
    metric_definitions = metric_definitions,
    hyperparameter_ranges = hyperparameter_ranges,
    base_tuning_job_name = "frenchtpl-sktweedie-hptune",
    max_jobs=5,
    max_parallel_jobs=2,
    objective_type = 'Minimize',
    strategy = "Random",
)

### 6. Running hyperparameter tuning

In [16]:
# Input channels for the tuning job; both point at CSV data addressed by S3 prefix
channel_options = {"content_type": "csv", "s3_data_type": "S3Prefix"}
s3_input_train = TrainingInput(s3_data=train_data_uri, **channel_options)
s3_input_validation = TrainingInput(s3_data=validation_data_uri, **channel_options)

# Launch the tuning job on the "train" / "validation" channels and block until it ends
tuner.fit(
    inputs={
        "train": s3_input_train,
        "validation": s3_input_validation,
    }
)
tuner.wait()

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


Using provided s3_resource
..................................................!
!


### 7. Result

In [17]:
# Summary of tuning results, best (lowest objective value) first
tuning_analytics = sagemaker.HyperparameterTuningJobAnalytics(tuner.latest_tuning_job.job_name)
df_tuner = tuning_analytics.dataframe()

# Keep only jobs whose final objective value is greater than -inf, then sort ascending
has_objective = df_tuner["FinalObjectiveValue"] > -float('inf')
df_tuner = df_tuner[has_objective].sort_values("FinalObjectiveValue", ascending=True)
df_tuner

Unnamed: 0,alpha,power,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
2,0.18173,1.592074,frenchtpl-sktweedie--230828-1544-003-81412f85,Completed,35817.988281,2023-08-28 15:46:55+00:00,2023-08-28 15:47:36+00:00,41.0
3,0.091258,1.693096,frenchtpl-sktweedie--230828-1544-002-e4a50ce3,Completed,35818.0,2023-08-28 15:45:14+00:00,2023-08-28 15:46:36+00:00,82.0
0,0.51141,1.792874,frenchtpl-sktweedie--230828-1544-005-b414db5e,Completed,35818.578125,2023-08-28 15:47:39+00:00,2023-08-28 15:48:20+00:00,41.0
4,0.372979,1.847642,frenchtpl-sktweedie--230828-1544-001-58bf4392,Completed,35818.578125,2023-08-28 15:45:11+00:00,2023-08-28 15:46:38+00:00,87.0
1,0.897196,1.763156,frenchtpl-sktweedie--230828-1544-004-50016e17,Completed,35818.621094,2023-08-28 15:46:56+00:00,2023-08-28 15:47:33+00:00,37.0


### ...END