In [1]:
# 1. Load necessary libraries
# install.packages("dplyr") # For data manipulation
# install.packages("caTools") # For splitting the data into train and test sets
# install.packages("glmnet") # For logistic regression
# install.packages("reticulate") # For interacting with Python`

In [2]:
library(dplyr)
library(caTools)
library(glmnet)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: Matrix

Loaded glmnet 4.1-7



In [3]:
library(reticulate)
use_python("/home/jovyan/.cache/pypoetry/virtualenvs/validmind-WFljCIXp-py3.10/bin/python")
reticulate::py_config()

python:         /home/jovyan/.cache/pypoetry/virtualenvs/validmind-WFljCIXp-py3.10/bin/python
libpython:      /opt/conda/lib/libpython3.10.so
pythonhome:     /home/jovyan/.cache/pypoetry/virtualenvs/validmind-WFljCIXp-py3.10:/home/jovyan/.cache/pypoetry/virtualenvs/validmind-WFljCIXp-py3.10
virtualenv:     /home/jovyan/.cache/pypoetry/virtualenvs/validmind-WFljCIXp-py3.10/bin/activate_this.py
version:        3.10.11 | packaged by conda-forge | (main, May 10 2023, 18:47:07) [GCC 11.3.0]
numpy:          /home/jovyan/.cache/pypoetry/virtualenvs/validmind-WFljCIXp-py3.10/lib/python3.10/site-packages/numpy
numpy_version:  1.22.3

NOTE: Python version was forced by use_python() function

In [4]:
# 2. Read the dataset
data <- read.csv('../datasets/bank_customer_churn.csv')

In [5]:
# 3. Handle categorical variables using one-hot encoding and remove unnecessary columns
data <- data %>% select(-RowNumber, -CustomerId, -Surname)
geo_dummies <- model.matrix(~Geography - 1, data=data)
gender_dummies <- model.matrix(~Gender - 1, data=data)
data <- data %>% select(-Geography, -Gender)
data <- cbind(data, geo_dummies, gender_dummies)

In [6]:
# 4. Split the dataset into training and testing sets
set.seed(123) # Setting seed for reproducibility
split <- sample.split(data$Exited, SplitRatio = 0.7)
train_data <- subset(data, split == TRUE)
test_data <- subset(data, split == FALSE)

In [7]:
# 5. Run a logistic regression model
model <- glm(Exited ~ ., family = binomial(link = 'logit'), data = train_data)

In [8]:
# 6. Evaluate the model
pred_probs <- predict(model, newdata = test_data, type = "response")
pred_classes <- ifelse(pred_probs > 0.5, 1, 0)

“prediction from a rank-deficient fit may be misleading”


In [9]:
# Confusion Matrix
table(test_data$Exited, pred_classes)

   pred_classes
       0    1
  0 1860   55
  1  397   88

## Running documentation tests

Ensure you have run `pip install validmind` before the following steps.

In [10]:
vm <- import("validmind")

In [11]:
vm$init(
  api_host="http://192.168.1.76:3000/api/v1/tracking",
  api_key="e7841dd9cffc67f268addd3ee9cc58f2",
  api_secret="7a8ab498d183d7a4907e82fac21827f30082d7acbff3dcdfcbb25d6728784e61",
  project="clkvhtg6g0005q08h5h9uhtjl"
)

In [12]:
class_labels <- list("0" = "Did not exit", "1" = "Exited")

vm_dataset = vm$init_dataset(
    dataset=data,
    target_column="Exited",
    class_labels=class_labels
)

In [13]:
# data_validation_results = vm$run_test_suite("tabular_dataset", dataset=vm_dataset)

In [14]:
vm_train_ds = vm$init_dataset(
    dataset=train_data,
    target_column="Exited"
)

vm_test_ds = vm$init_dataset(
    dataset=test_data,
    target_column="Exited"
)

vm_model = vm$init_r_model(
    model,
    train_ds=vm_train_ds,
    test_ds=vm_test_ds,
)

In [15]:
# ['coefficients', 'residuals', 'fitted.values', 'effects', 'R', 'rank', 'qr', 'family', 'linear.predictors', 'deviance', 'aic', 'null.deviance', 'iter', 'weights', 'prior.weights', 'df.residual', 'df.null', 'y', 'converged', 'boundary', 'model', 'call', 'formula', 'terms', 'data', 'offset', 'control', 'method', 'contrasts', 'xlevels']

In [16]:
py_last_error()

NULL

In [17]:
model


Call:  glm(formula = Exited ~ ., family = binomial(link = "logit"), 
    data = train_data)

Coefficients:
     (Intercept)       CreditScore               Age            Tenure  
      -3.628e+00        -8.556e-04         7.173e-02        -1.675e-02  
         Balance     NumOfProducts         HasCrCard    IsActiveMember  
       1.712e-06        -1.089e-01        -4.091e-02        -1.052e+00  
 EstimatedSalary   GeographyFrance  GeographyGermany    GeographySpain  
       4.536e-07        -5.413e-02         7.317e-01                NA  
    GenderFemale        GenderMale  
       4.824e-01                NA  

Degrees of Freedom: 5599 Total (i.e. Null);  5588 Residual
Null Deviance:	    5635 
Residual Deviance: 4806 	AIC: 4830

In [18]:
model$coefficients

In [19]:
model$method

In [20]:
model$family


Family: binomial 
Link function: logit 


In [21]:
model$formula

Exited ~ .

In [22]:
model_validation_results = vm$run_test_suite("binary_classifier_model_validation", model=vm_model)

In [24]:
model_validation_results$results

[[1]]
[[1]][[1]]
TestPlanMetricResult(result_id="model_metadata", metric, figures)

[[1]][[2]]
TestPlanMetricResult(result_id="dataset_split", metric, figures)

[[1]][[3]]
TestPlanFailedResult(result_id="confusion_matrix")

[[1]][[4]]
TestPlanFailedResult(result_id="classifier_in_sample_performance")

[[1]][[5]]
TestPlanFailedResult(result_id="classifier_out_of_sample_performance")

[[1]][[6]]
TestPlanFailedResult(result_id="pr_curve")

[[1]][[7]]
TestPlanFailedResult(result_id="roc_curve")


[[2]]
[[2]][[1]]
TestPlanFailedResult(result_id="accuracy_score")

[[2]][[2]]
TestPlanFailedResult(result_id="f1_score")

[[2]][[3]]
TestPlanFailedResult(result_id="roc_auc_score")

[[2]][[4]]
TestPlanFailedResult(result_id="training_test_degradation")


[[3]]
[[3]][[1]]
TestPlanFailedResult(result_id="overfit_regions")

[[3]][[2]]
TestPlanFailedResult(result_id="weak_spots")

[[3]][[3]]
TestPlanFailedResult(result_id="robustness")

