In [2]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
import dotenv
import os

# Load settings from the local `.env` file so the Azure identifiers below are
# read from the environment instead of being written into the notebook.
dotenv.load_dotenv('.env')


def _require_env(name):
    """Return the value of environment variable `name`, failing fast if unset.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The non-empty string value of the variable.

    Raises:
        RuntimeError: If the variable is missing or empty.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; define it in .env or the shell."
        )
    return value


credential = DefaultAzureCredential()
SUBSCRIPTION = _require_env('AZURE_SUBSCRIPTION_ID')
RESOURCE_GROUP = _require_env('AZURE_RESOURCE_GROUP')
WS_NAME = _require_env('AZURE_WORKSPACE_NAME')

# Client bound to the subscription / resource group / workspace read above.
ml_client = MLClient(
    credential=credential,
    subscription_id=SUBSCRIPTION,
    resource_group_name=RESOURCE_GROUP,
    workspace_name=WS_NAME,
)

In [3]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Fetch the workspace the client is already bound to, rather than repeating
# its name as a literal that can drift from the MLClient configuration.
ws = ml_client.workspaces.get(name=ml_client.workspace_name)

# Point MLflow at the tracking URI reported by the workspace itself, so the
# region, subscription ID and resource group are not hard-coded here.
mlflow.set_tracking_uri(ws.mlflow_tracking_uri)
mlflow.set_experiment(experiment_name="insurance-claims-classifier")

<Experiment: artifact_location='', creation_time=1711567033058, experiment_id='fe5998b9-d3c4-4f4c-a3f9-4f1de73b0f23', last_update_time=None, lifecycle_stage='active', name='insurance-claims-classifier', tags={}>

In [4]:
import pandas as pd 

# Load the policies parquet file from the abfs:// storage location.
# The storage account key is read from the environment (a missing variable
# raises KeyError) so the secret is never stored in the notebook.
df = pd.read_parquet(
    'abfs://insurance-claims@vinceprojectdata.dfs.core.windows.net/policies.pq',
    engine="pyarrow",
    storage_options={'account_key': os.environ['AZURE_STORAGE_ACCOUNT_KEY']},
)
df['timestamp'] = pd.to_datetime(df.policy_created_date)
# Time-based rolling windows below need rows ordered by timestamp.
df = df.sort_values(by='timestamp')

# For each window length (in days), sum `is_claim` over earlier rows that share
# the same (make, model). closed='neither' excludes both window endpoints, so
# the row's own timestamp (and therefore its own label) is not counted; a row
# with nothing in its window gets NaN.
for i in [30, 60, 90]:
    # reset_index drops the group levels so the result is keyed by the
    # original row index again and aligns with `df` on assignment
    # (NOTE(review): assumes df's index labels are unique - confirm).
    df[f'model_claims_sum_{i}_continuous'] = df.groupby(['make','model'])[['is_claim','timestamp']]\
                                    .rolling(f'{i}D', on='timestamp',closed='neither')\
                                    .sum()[['is_claim','timestamp']]\
                                    .reset_index(level=['make','model'])['is_claim']

df.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim,policy_created_date,timestamp,model_claims_sum_30_continuous,model_claims_sum_60_continuous,model_claims_sum_90_continuous
6175,ID06176,0.389469,0.16,0.432692,C13,5410,1,B1,M8,CNG,...,No,Yes,Yes,2,0,2023-06-01 00:00:08,2023-06-01 00:00:08,,,
42484,ID42485,0.30606,0.0,0.576923,C1,4990,1,A,M1,CNG,...,No,No,Yes,0,0,2023-06-01 00:06:32,2023-06-01 00:06:32,,,
33174,ID33175,0.54001,0.17,0.326923,C6,13051,1,B1,M8,CNG,...,No,Yes,Yes,2,0,2023-06-01 00:10:42,2023-06-01 00:10:42,0.0,0.0,0.0
18860,ID18861,0.477429,0.01,0.5,C3,4076,1,A,M1,CNG,...,No,No,Yes,0,0,2023-06-01 00:11:13,2023-06-01 00:11:13,0.0,0.0,0.0
55887,ID55888,1.061667,0.04,0.451923,C2,27003,5,C1,M9,Diesel,...,Yes,Yes,Yes,4,0,2023-06-01 00:24:13,2023-06-01 00:24:13,,,


In [None]:
# Build the modelling frame: drop identifier/raw-date columns, keep only rows
# whose timestamp is before "now", and index by timestamp in ascending order.
train_df = df.drop(columns=['policy_id', 'policy_created_date'])
train_df = train_df[train_df['timestamp'] < pd.to_datetime('now')]
train_df = train_df.set_index('timestamp').sort_index()

X = train_df.drop(labels=['is_claim'], axis=1)
y = train_df['is_claim']

test_size = 0.20

## Time aware cross validation split to preserve order
# With its default test size, TimeSeriesSplit makes each test fold
# n_samples // (n_splits + 1) rows long, so 1/test_size - 1 splits yields a
# final test fold of about `test_size` of the data.
# round() is used instead of floor division because 1 // 0.2 evaluates to 4.0
# in floating point, which would give 3 splits and a 25% test fold.
n_splits = round(1 / test_size) - 1

tscv = TimeSeriesSplit(n_splits=n_splits)
folds = list(tscv.split(X))
for fold_no, (fold_train, fold_test) in enumerate(folds):
    # Print fold sizes rather than the raw index arrays to keep output short.
    print(f"fold {fold_no}: train={len(fold_train)} test={len(fold_test)}")

# Use the last fold explicitly: the most recent rows form the test set and
# every earlier row forms the training set.
train_index, test_index = folds[-1]

X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [None]:
# Split feature columns by dtype: object columns are treated as categorical,
# everything else as numeric.
num_cols = X_train.select_dtypes(exclude=["object"]).columns.tolist()
cat_cols = X_train.select_dtypes(include=["object"]).columns.tolist()

# Let MLflow record parameters, metrics and the fitted model automatically.
mlflow.sklearn.autolog()

# Numeric features: median imputation followed by standardisation.
num_pipe = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

# Categorical features: fill missing values with a constant, then one-hot
# encode; categories unseen during fit are ignored at predict time.
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N/A"),
    OneHotEncoder(handle_unknown="ignore"),
)

# sparse_threshold=0 makes the ColumnTransformer always return a dense array.
# This replaces OneHotEncoder's `sparse=False` keyword, which newer
# scikit-learn releases no longer accept, so the cell runs on old and new
# versions alike.
full_pipe = ColumnTransformer(
    [("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)],
    sparse_threshold=0,
)

pipeline = make_pipeline(
    full_pipe,
    GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        verbose=True,
        random_state=42,  # fixed seed so repeated runs are comparable
    ),
)

# The context manager closes the MLflow run even if fitting or scoring raises,
# so a failed cell does not leave an active run behind.
with mlflow.start_run() as run:
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    report = classification_report(y_test, y_pred)
    print(report)
