In [1]:
# Quick hack to load local SDK code
import os

os.chdir(os.path.join(os.getcwd(), ".."))

In [2]:
# Load API key and secret from environment variables
from dotenv import load_dotenv
load_dotenv()

True

In [26]:
import pandas as pd
import xgboost as xgb

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [4]:
# Initialize ValidMind SDK
import validmind as vm

# For test environment use api_host="https://api.test.vm.validmind.ai/api/v1/tracking"
vm.init(
    api_key="api_key",
    api_secret="api_secret",
    project="cl1jyvrz5000c09lg28qf0pkx"
)

True

In [5]:
df = pd.read_csv("notebooks/datasets/health_insurance_cross_sell.csv")

targets = vm.DatasetTargets(
    target_column="Response",
    class_labels={
        "0": "Not interested",
        "1": "Interested",
    }
)

vm.log_dataset(df, "training", analyze=True, targets=targets)

True

In [6]:
results = vm.run_dataset_tests(df, target_column="Response", dataset_type="training", send=True)

Running data quality tests for "training" dataset...



100%|██████████| 7/7 [00:04<00:00,  1.51it/s]



Test suite has completed.
Sending results to ValidMind...
Successfully logged test results for test: class_imbalance
Successfully logged test results for test: duplicates
Successfully logged test results for test: cardinality
Successfully logged test results for test: missing
Successfully logged test results for test: pearson_correlation
Successfully logged test results for test: skewness
Successfully logged test results for test: zeros

Summary of results:

Test                 Passed      # Passed    # Errors    % Passed
-------------------  --------  ----------  ----------  ----------
class_imbalance      False              0           1           0
duplicates           True               1           0         100
cardinality          True               5           0         100
missing              True              12           0         100
pearson_correlation  False              0           2           0
skewness             False              5           1     83.3333
zeros   

In [7]:
# Drop ID as it is the primary key of the data
drop_columns = ["id"]
df.drop(drop_columns, axis=1, inplace=True)

In [8]:
genders = {"Male": 0, "Female": 1}
df.replace({"Gender": genders}, inplace=True)

In [9]:
df = pd.concat([df, pd.get_dummies(df["Vehicle_Age"], prefix="Vehicle_Age")], axis=1)
df.drop("Vehicle_Age", axis=1, inplace=True)

In [11]:
df.rename({
    "Vehicle_Age_1-2 Year": "Vehicle_Age_1_2_Year",
    "Vehicle_Age_< 1 Year": "Vehicle_Age_Less_1_Year",
    "Vehicle_Age_> 2 Years": "Vehicle_Age_More_2_Years"
}, inplace=True, axis=1)

df

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Vehicle_Age_1_2_Year,Vehicle_Age_Less_1_Year,Vehicle_Age_More_2_Years
0,0,44,1,28.0,0,Yes,40454.0,26.0,217,1,0,0,1
1,0,76,1,3.0,0,No,33536.0,26.0,183,0,1,0,0
2,0,47,1,28.0,0,Yes,38294.0,26.0,27,1,0,0,1
3,0,21,1,11.0,1,No,28619.0,152.0,203,0,0,1,0
4,1,29,1,41.0,1,No,27496.0,152.0,39,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
381104,0,74,1,26.0,1,No,30170.0,26.0,88,0,1,0,0
381105,0,30,1,37.0,1,No,40016.0,152.0,131,0,0,1,0
381106,0,21,1,30.0,1,No,35118.0,160.0,161,0,0,1,0
381107,1,68,1,14.0,0,Yes,44617.0,124.0,74,0,0,0,1


In [12]:
damages = {"No": 0, "Yes": 1}
df.replace({"Vehicle_Damage": damages}, inplace=True)

In [14]:
df = pd.concat([df, pd.get_dummies(df["Region_Code"], prefix="Region_Code")], axis=1)
df.drop("Region_Code", axis=1, inplace=True)

In [16]:
policy_channels = df["Policy_Sales_Channel"].value_counts().sort_values(ascending=True)
print(len(policy_channels))
print(policy_channels.sum())

print(len(policy_channels[policy_channels < 1000]))
print(policy_channels[policy_channels >= 1000].sum())

155
381109
134
365396


In [17]:
filtered_policy_channels = policy_channels[policy_channels >= 1000]
df = df[df["Policy_Sales_Channel"].isin(filtered_policy_channels.index)]
len(df)

365396

In [18]:
df = pd.concat([df, pd.get_dummies(df["Policy_Sales_Channel"], prefix="Policy_Sales_Channel")], axis=1)
df.drop("Policy_Sales_Channel", axis=1, inplace=True)

In [19]:
print(len(df[df["Age"] < 69]))
df["Age"].quantile(0.95)

347060


69.0

In [20]:
print(df["Vintage"].quantile(0.95))

285.0


In [37]:
text = df.head(10).to_markdown(index=False)

In [38]:
# Save text to a file
with open("notebooks/datasets/table", "w") as f:
    f.write(text)


In [21]:
train_ds, val_ds = train_test_split(df, test_size=0.20)

x_train = train_ds.drop("Response", axis=1)
x_val = val_ds.drop("Response", axis=1)
y_train = train_ds.loc[:, "Response"].astype(int)
y_val = val_ds.loc[:, "Response"].astype(int)

In [22]:
xgb_model = xgb.XGBClassifier(early_stopping_rounds=10)
xgb_model.fit(
    x_train,
    y_train,
    eval_set=[(x_train, y_train), (x_val, y_val)],
    eval_metric=["error", "logloss", "auc"],
    verbose=False,
)



XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=10, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [23]:
y_pred = xgb_model.predict_proba(x_val)[:, -1]
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_val, predictions)

print(f"Accuracy: {accuracy}")

Accuracy: 0.8772577996715928


In [24]:
vm.log_model(xgb_model)

True

In [25]:
vm.log_training_metrics(xgb_model, x_train, y_train)

Successfully logged training metrics


True

In [29]:
precision_score(y_val, predictions)

0.5466666666666666

In [30]:
recall_score(y_val, predictions)

0.0045672273588058376