## Importing and Pre-processing Dataset

### 1. Initialising dataset and configurations

In [3166]:
import pandas as pd
import numpy as np
import re 
from ucimlrepo import fetch_ucirepo 
pd.set_option('display.max_columns', None)
import multiprocessing as mp

# Number of bootstrap resamples, i.e. base classifiers in the bagged ensemble.
n_bootstrap = 9

# Dataset 1
# --- where the data comes from and how it is indexed ---
remote = False                                        # True: fetch from the UCI repository; False: read a local CSV
data_filepath = "WA_Fn-UseC_-Telco-Customer-Churn.csv"  # CSV path, or the repository dataset name when remote
target_label = "Churn"                                # column holding the class label
index_col = "customerID"                              # column promoted to the row index (local CSV only)
has_index = True                                      # whether `index_col` should be applied

# --- cleaning / preprocessing ---
missing_placeholders = ["?", "-"]   # cell values that are treated as missing
dropna = True                       # True: drop rows with missing values; False: impute them
drop_dup = False                    # whether duplicate rows are removed
scaler_type = 'm'                   # 'm': MinMaxScaler, 's': StandardScaler
corr_threshold = 0.25               # minimum |correlation with target| for a feature to be kept
drop_cols = []                      # columns dropped as redundant

# --- training hyper-parameters ---
alpha = 0.1             # step size of the base classifiers
max_iter = 1000         # gradient-descent iterations of the base classifiers
random_state = 7575     # seed from which the split and bootstrap seeds are derived
meta_alpha = 0.02       # step size of the stacking meta-classifier
meta_max_iter = 5000    # gradient-descent iterations of the stacking meta-classifier

# # Dataset 2
# # remote, data_filepath, target_label, index_col, has_index = True, "Adult", "income", "", False        # import from repository
# remote, data_filepath, target_label, index_col, has_index = False, "adult.csv", "income", "", False     # import from file
# missing_placeholders, dropna, drop_dup, scaler_type, corr_threshold, drop_cols = ["?", "-"], False, False, 's', 0.15, ['education']
# alpha, max_iter, random_state, meta_alpha, meta_max_iter  =  0.1, 1000, 175, 0.1, 5000

# # Dataset 3
# remote, data_filepath, target_label, index_col, has_index = False, "creditcard.csv", "Class", "", False
# missing_placeholders, dropna, drop_dup, scaler_type, corr_threshold, drop_cols = ["?", "-"], True, True, 's', 0.12, []
# alpha, max_iter, random_state, meta_alpha, meta_max_iter  =  0.1, 1000, 175, 0.7, 1000


### 2. Importing dataset from file or UCI repository

In [3167]:
# Load the raw table either from the UCI ML repository or from a local CSV file.
if remote:
    # Fetch once and reuse the result for both the data and its metadata,
    # instead of requesting the same dataset from the repository twice.
    uci_dataset = fetch_ucirepo(name=data_filepath)
    dataframe = uci_dataset.data.original
    print (uci_dataset.metadata.data_url)
else:
    dataframe = pd.read_csv(filepath_or_buffer=data_filepath, sep=',')
    if has_index:
        # Re-bind rather than mutate in place, so re-running the cell stays predictable.
        dataframe = dataframe.set_index(index_col)
print (dataframe.shape)
dataframe.head()

(7043, 20)


Unnamed: 0_level_0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### 3. Data Cleaning

#### a) Stripping stray characters

In [3168]:
# Show the distinct raw values of the target column before any cleaning is applied.
np.unique(dataframe[target_label])

array(['No', 'Yes'], dtype=object)

In [3169]:
# For a text-valued target, strip trailing '.' characters and show the resulting distinct labels.
target_column = dataframe[target_label]
if pd.api.types.is_string_dtype(target_column):
    dataframe[target_label] = target_column.str.rstrip('.')
    print (np.unique(dataframe[target_label]))

['No' 'Yes']


#### b) Cleaning of duplicates

In [3170]:
# Duplicate rows are removed only when the dataset configuration (`drop_dup`) asks for it.
if drop_dup:
    dataframe = dataframe.drop_duplicates()

#### c) Cleaning/imputation of missing values

In [3171]:

# Treat empty / whitespace-only cells and the configured placeholder strings as missing values.
pattern = r'^\s*$|^(' + '|'.join(map(re.escape, missing_placeholders)) + r')$'
dataframe = dataframe.replace(pattern, np.nan, regex=True)


def _to_numeric_if_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if any value cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Columns that were read as text only because of placeholder cells become numeric here.
# The explicit try/except stands in for `pd.to_numeric(..., errors='ignore')`, which is
# deprecated in pandas and emits a FutureWarning.
dataframe = dataframe.apply(_to_numeric_if_possible)

if dropna:
    # Discard every row that still contains a missing value.
    dataframe = dataframe.dropna()
else:
    # Impute column by column with the most frequent value.
    for col in dataframe.columns:
        modes = dataframe[col].mode()
        if len(modes) > 0:
            value = modes[0]
        else:
            # No mode is available (the column has no usable values); fall back to the median.
            value = dataframe[col].median()
        dataframe[col] = dataframe[col].fillna(value=value)

dataframe

  dataframe = dataframe.apply(pd.to_numeric, errors='ignore')


Unnamed: 0_level_0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes


#### d) Dropping redundant columns

In [3172]:
# Discard the columns the configuration lists as redundant (nothing is dropped for an empty list).
dataframe = dataframe.drop(columns=drop_cols)
dataframe

Unnamed: 0_level_0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes


### 4. Extracting features and labels

In [3173]:
# Separate the target from the predictors; the target is kept as a one-column DataFrame.
Labels = dataframe[[target_label]].copy()
Features = dataframe.drop(columns=[target_label])

### 5. Binarisation, discretisation, encoding

#### a) Identifying numeric columns

In [3174]:
# Record which feature columns are numeric before any encoding; these are the columns scaled later.
numeric_features = Features.select_dtypes(include='number')
num_cols = list(numeric_features.columns)
num_cols

['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

#### b) Label-encoding boolean (Yes/No) columns

In [3175]:
from sklearn.preprocessing import LabelEncoder
# A single encoder object is shared; every fit_transform call below re-fits it on that column.
encoder = LabelEncoder()

# Select every column with at most two distinct values, whatever its dtype
# (so an already-numeric 0/1 column is included too), and map its values to integer codes.
binary_cols = Features.columns[Features.nunique() <= 2]
for col in binary_cols:
    Features[col] = encoder.fit_transform(Features[col])
Features[binary_cols]

Unnamed: 0_level_0,gender,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7590-VHVEG,0,0,1,0,0,1
5575-GNVDE,1,0,0,0,1,0
3668-QPYBK,1,0,0,0,1,1
7795-CFOCW,1,0,0,0,0,0
9237-HQITU,0,0,0,0,1,1
...,...,...,...,...,...,...
6840-RESVB,1,0,1,1,1,1
2234-XADUH,0,0,1,1,1,1
4801-JZAZL,0,0,1,1,0,1
8361-LTMKD,1,1,1,0,1,1


In [3176]:
# Encode the target with the shared encoder. After this line the encoder stays fitted on the
# target classes, which the later `encoder.inverse_transform(...)` call relies on to decode predictions.
Labels[target_label] = encoder.fit_transform(Labels[target_label])
np.unique(Labels[target_label])

array([0, 1])

#### c) One-hot encoding of remaining non-numeric features

In [3177]:
# Expand each remaining non-numeric column into one indicator column per category value.
Features = pd.get_dummies(Features)
Features

Unnamed: 0_level_0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
7590-VHVEG,0,0,1,0,1,0,1,29.85,29.85,False,True,False,True,False,False,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,True,False
5575-GNVDE,1,0,0,0,34,1,0,56.95,1889.50,True,False,False,True,False,False,False,False,True,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,True
3668-QPYBK,1,0,0,0,2,1,1,53.85,108.15,True,False,False,True,False,False,False,False,True,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True
7795-CFOCW,1,0,0,0,45,0,0,42.30,1840.75,False,True,False,True,False,False,False,False,True,True,False,False,False,False,True,False,False,True,True,False,False,True,False,False,False,True,False,True,False,False,False
9237-HQITU,0,0,0,0,2,1,1,70.70,151.65,True,False,False,False,True,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6840-RESVB,1,0,1,1,24,1,1,84.80,1990.50,False,False,True,True,False,False,False,False,True,True,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,True,False,False,False,False,True
2234-XADUH,0,0,1,1,72,1,1,103.20,7362.90,False,False,True,False,True,False,True,False,False,False,False,True,False,False,True,True,False,False,False,False,True,False,False,True,False,True,False,False,True,False,False
4801-JZAZL,0,0,1,1,11,0,1,29.60,346.45,False,True,False,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,True,False
8361-LTMKD,1,1,1,0,4,1,1,74.40,306.60,False,False,True,False,True,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True


### 6. Scaling numeric values

In [3178]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Map each supported `scaler_type` code to the scaler class it stands for.
scaler_classes = {'m': MinMaxScaler, 's': StandardScaler}
if scaler_type not in scaler_classes:
    # Fail fast: without this check an unknown code would leave `scaler` undefined
    # (or silently reuse a scaler left over from an earlier run of this cell).
    raise ValueError(f"Unknown scaler_type {scaler_type!r}; expected one of {sorted(scaler_classes)}")
scaler = scaler_classes[scaler_type]()

# Only the columns that were numeric before encoding are rescaled; skip when there are none.
# NOTE(review): the scaler is fitted on the full dataset, before the train/validation/test split.
if num_cols:
    Features[num_cols] = scaler.fit_transform(Features[num_cols])
Features

Unnamed: 0_level_0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
7590-VHVEG,0,0.0,1,0,0.000000,0,1,0.115423,0.001275,False,True,False,True,False,False,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,True,False
5575-GNVDE,1,0.0,0,0,0.464789,1,0,0.385075,0.215867,True,False,False,True,False,False,False,False,True,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,True
3668-QPYBK,1,0.0,0,0,0.014085,1,1,0.354229,0.010310,True,False,False,True,False,False,False,False,True,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True
7795-CFOCW,1,0.0,0,0,0.619718,0,0,0.239303,0.210241,False,True,False,True,False,False,False,False,True,True,False,False,False,False,True,False,False,True,True,False,False,True,False,False,False,True,False,True,False,False,False
9237-HQITU,0,0.0,0,0,0.014085,1,1,0.521891,0.015330,True,False,False,False,True,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6840-RESVB,1,0.0,1,1,0.323944,1,1,0.662189,0.227521,False,False,True,True,False,False,False,False,True,True,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,True,False,False,False,False,True
2234-XADUH,0,0.0,1,1,1.000000,1,1,0.845274,0.847461,False,False,True,False,True,False,True,False,False,False,False,True,False,False,True,True,False,False,False,False,True,False,False,True,False,True,False,False,True,False,False
4801-JZAZL,0,0.0,1,1,0.140845,0,1,0.112935,0.037809,False,True,False,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,True,False
8361-LTMKD,1,1.0,1,0,0.042254,1,1,0.558706,0.033210,False,False,True,False,True,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True


## Feature Selection

Finding the columns most correlated with target

In [3179]:
# Rank the features by the magnitude of their correlation with the target
# and keep only those strictly above the configured threshold.
abs_corr = Features.corrwith(Labels[target_label]).abs()
ranked_corr = abs_corr.sort_values(ascending=False)
correlations = ranked_corr[ranked_corr > corr_threshold]
correlations

Contract_Month-to-month           0.404565
tenure                            0.354049
OnlineSecurity_No                 0.342235
TechSupport_No                    0.336877
InternetService_Fiber optic       0.307463
Contract_Two year                 0.301552
PaymentMethod_Electronic check    0.301455
OnlineBackup_No                   0.267595
DeviceProtection_No               0.252056
dtype: float64

Dropping columns with low correlation

In [3180]:
# Keep only the selected features, ordered from the most to the least correlated.
selected_cols = correlations.index.to_list()
Features = Features[selected_cols]

## Implementing Logistic Regression

### 1. Sigmoid function (hypothesis)
Sigmoid function that can be applied on arrays or matrices:

In [3181]:
def sigmoid(X):
    """Element-wise logistic function 1 / (1 + exp(-X)).

    Works on scalars as well as on numpy arrays/matrices; the result has the
    same shape as the input.

    The value is computed as exp(-log(1 + exp(-X))) through `np.logaddexp`,
    which never forms exp(-X) directly. The direct formula overflows inside
    `np.exp` for large negative inputs and emits a RuntimeWarning; this form
    returns the same limits (0 and 1) without overflowing.
    """
    # logaddexp(0, -X) == log(exp(0) + exp(-X)) == log(1 + exp(-X)), evaluated stably.
    return np.exp(-np.logaddexp(0, -X))

### 2. Training update rule (gradient descent)

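`X` is a matrix whose rows each hold the feature vector of one sample, augmented with an all-ones column for the bias \
`Y` is a column vector holding the label of each training sample, one per row of `X` \
`W` is a column vector holding one weight (parameter) per column of `X`; the weight paired with the all-ones column is the bias

Each iteration applies the update $W \leftarrow W + \frac{\alpha}{m} X^\top \left(Y - \sigma(XW)\right)$, where $m$ is the number of training samples, $\alpha$ is the learning rate and $\sigma$ is the sigmoid function.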

In [3182]:
def gradient_descent (m, W, X, Y, iterations, alpha):
    """Run a fixed number of full-batch gradient steps for logistic regression.

    Parameters
    ----------
    m : number
        Divisor used to average the gradient over the samples.
    W : array
        Initial weights; must be compatible with ``np.matmul(X, W)``.
    X : array
        Design matrix, one sample per row.
    Y : array
        Labels, with the same shape as ``sigmoid(np.matmul(X, W))``.
    iterations : int
        Number of update steps to perform.
    alpha : number
        Step size, constant over all iterations.

    Returns
    -------
    array
        The weights after ``iterations`` updates (``W`` itself is returned
        unchanged when ``iterations`` is 0).
    """
    # The transpose does not change between iterations, so take it once.
    X_T = np.transpose(X)
    for _ in range (iterations):
        h_W_X = sigmoid (np.matmul(X, W))
        # X^T (Y - h) / m is the negative gradient of the mean cross-entropy loss,
        # so adding alpha times it moves the weights downhill.
        D_J_W = 1/m * np.matmul(X_T, (Y - h_W_X))
        W = W + alpha * D_J_W
    return W

### 3. Prediction using predicted weights (parameters)

In [3183]:
def predict_proba_Y (X, W):
    """Return the sigmoid of the linear scores ``np.matmul(X, W)``, one value per row of ``X``."""
    return sigmoid(np.matmul(X, W))

def predict_Y (X, W):
    """Return hard 0/1 predictions: 1 wherever the predicted probability is at least 0.5."""
    probabilities = predict_proba_Y(X, W)
    is_positive = probabilities >= 0.5
    return is_positive.astype(int)

The functions above are combined into the class `PyLogReg`, which can be used almost as a drop-in replacement for scikit-learn's `LogisticRegression`. Its `fit()` and `predict()` methods have signatures similar to their `LogisticRegression` counterparts: `fit()` receives `X_df` (a pandas DataFrame of the dataset's features) and `y_srs` (a pandas Series of the training labels), while `predict()` and `predict_proba()` receive only `X_df`, plus an optional name for the returned Series.

In [3184]:
class PyLogReg:
    """Binary logistic-regression classifier trained with full-batch gradient descent.

    Parameters
    ----------
    alpha : float
        Step size passed to ``gradient_descent``.
    max_iter : int
        Number of gradient-descent iterations.

    Attributes
    ----------
    W : numpy array of shape (n_features + 1, 1), or None before ``fit``
        Learned weights; the first entry is the bias.
    """

    def __init__(self, alpha=1.0, max_iter=100):
        self.alpha = alpha
        self.max_iter = max_iter
        self.W = None  # set by fit()

    @staticmethod
    def _design_matrix(X_df):
        """Return the values of `X_df` as a float array with a leading all-ones (bias) column."""
        X = X_df.to_numpy(dtype=float)
        return np.hstack ((np.ones((X.shape[0], 1)), X))

    def _check_fitted(self):
        """Raise a clear error when a prediction is requested before `fit` has run."""
        if self.W is None:
            raise ValueError("This PyLogReg instance is not fitted yet; call fit() before predicting.")

    def fit (self, X_df, y_srs):
        """Learn the weights from features `X_df` (DataFrame) and labels `y_srs` (Series).

        Weights start at zero and are stored in ``self.W``.

        Raises
        ------
        ValueError
            If `X_df` and `y_srs` do not have the same number of rows.
        """
        X = self._design_matrix(X_df)
        Y = y_srs.to_numpy(dtype=float).reshape(-1, 1)
        m, n = X.shape
        if Y.shape[0] != m:
            # Without this check a length-1 label vector would silently broadcast over all rows.
            raise ValueError(f"X_df has {m} rows but y_srs has {Y.shape[0]} labels")
        W = np.zeros((n,1))
        self.W = gradient_descent (m, W, X, Y, self.max_iter, self.alpha)

    def predict (self, X_df, name="predictions"):
        """Return 0/1 predictions for `X_df` as a Series called `name` with a default integer index."""
        self._check_fitted()
        Y = predict_Y (self._design_matrix(X_df), self.W)
        return pd.Series(Y.flatten(), name=name)

    def predict_proba (self, X_df, name="predictions_proba"):
        """Return predicted probabilities for `X_df` as a Series called `name` with a default integer index."""
        self._check_fitted()
        Y_hat = predict_proba_Y (self._design_matrix(X_df), self.W)
        return pd.Series(Y_hat.flatten(), name=name)

## Splitting the dataset

In [3185]:
from sklearn.model_selection import train_test_split

X = Features
y = Labels[target_label]

# First hold out 20% of all rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
# ... then split 20% of the remaining rows off as a validation set, using a different seed
# derived from `random_state`. Overall this gives roughly 64% train / 16% validation / 20% test.
X_train, X_vald, y_train, y_vald = train_test_split(X_train, y_train, test_size=0.2, random_state=random_state*7+39)

## Training and testing on LR (`PyLogReg`)

In [3186]:
from sklearn.metrics import accuracy_score

# Baseline: a single PyLogReg trained on the training split and scored on the test split.
clf = PyLogReg (alpha=alpha, max_iter=max_iter)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of PyLogReg classifier: {accuracy:.6f}")

# Show the test features next to the predicted labels. `encoder` was last fitted on the
# target column, so inverse_transform turns the 0/1 predictions back into the original labels.
test_result = X_test.copy()
test_result[target_label] = encoder.inverse_transform(y_pred)
test_result

Accuracy of PyLogReg classifier: 0.800995


Unnamed: 0_level_0,Contract_Month-to-month,tenure,OnlineSecurity_No,TechSupport_No,InternetService_Fiber optic,Contract_Two year,PaymentMethod_Electronic check,OnlineBackup_No,DeviceProtection_No,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
9402-ROUMJ,False,0.929577,False,False,True,False,False,True,False,No
8380-MQINP,False,0.760563,False,False,False,True,False,False,False,No
1207-BLKSA,False,0.718310,False,False,False,True,False,False,False,No
7339-POGZN,True,0.042254,True,True,False,False,True,True,True,Yes
4884-LEVMQ,False,0.535211,False,False,False,True,False,False,False,No
...,...,...,...,...,...,...,...,...,...,...
5317-FLPJF,False,0.915493,False,False,False,True,False,True,False,No
4749-VFKVB,True,0.000000,True,True,False,False,False,True,True,No
6637-KYRCV,True,0.056338,True,True,False,False,True,False,True,Yes
6016-NXBNJ,False,0.478873,False,False,False,False,False,False,False,No


## Bagging of training set

In [3187]:
# One entry per bootstrap round: the fitted classifier and its hard / probabilistic test predictions.
clf_bag, y_pred_bag, y_pred_proba_bag = [], [], []

n_train = len(X_train)
for i in range(n_bootstrap):
    # Draw ONE set of row positions (with replacement, same size as the training split) and
    # apply it to both the features and the labels. This makes their alignment explicit,
    # rather than relying on two separate .sample() calls picking the same rows for the same seed.
    rng_i = np.random.RandomState(random_state+i*3)
    rows_i = rng_i.choice(n_train, size=n_train, replace=True)
    X_train_i = X_train.iloc[rows_i]
    y_train_i = y_train.iloc[rows_i]

    clf_i = PyLogReg (alpha=alpha, max_iter=max_iter)
    clf_i.fit(X_train_i, y_train_i)

    # Every base classifier is evaluated on the same, untouched test split.
    y_pred_i = clf_i.predict(X_test, f"y_{i}")
    y_pred_proba_i = clf_i.predict_proba(X_test, f"y_proba_{i}")

    clf_bag.append (clf_i)

    y_pred_bag.append (y_pred_i)
    y_pred_proba_bag.append (y_pred_proba_i)

## Ensembling methods

### 1. Voting ensemble

In [3188]:
# Hard vote: one column per base classifier, then the most frequent prediction in each row.
y_vote_matrix = pd.DataFrame({pred.name: pred for pred in y_pred_bag})
row_modes = y_vote_matrix.mode(axis=1)
y_vote_matrix['y_vote'] = row_modes.iloc[:, 0]
y_vote = y_vote_matrix['y_vote']

# Ensemble score: the median of the base classifiers' predicted probabilities in each row.
y_vote_proba_matrix = pd.DataFrame({proba.name: proba for proba in y_pred_proba_bag})
row_medians = y_vote_proba_matrix.median(axis=1)
y_vote_proba_matrix['y_vote_proba'] = row_medians
y_vote_proba = y_vote_proba_matrix['y_vote_proba']

y_vote_matrix

Unnamed: 0,y_0,y_1,y_2,y_3,y_4,y_5,y_6,y_7,y_8,y_vote
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1402,0,0,0,0,0,0,0,0,0,0
1403,0,0,0,0,0,0,0,0,0,0
1404,1,0,0,0,1,1,0,0,1,0
1405,0,0,0,0,0,0,0,0,0,0


In [3189]:
# Score the majority-vote predictions against the held-out test labels.
accuracy = accuracy_score(y_test, y_vote)
print(f"Accuracy of majority of bagged PyLogReg classifiers: {accuracy:.6f}")

Accuracy of majority of bagged PyLogReg classifiers: 0.804549


### 2. Stacking ensemble (with validation)

First, the bagged LR predictors are run on the validation set.

In [3190]:
# Hard predictions of every bagged classifier on the validation split, one named Series each.
y_vald_pred_bag = [
    clf_bag[i].predict (X_vald, f"y_vald_{i}")
    for i in range(n_bootstrap)
]

# One column per base classifier, one row per validation sample.
y_stack_matrix = pd.DataFrame({pred.name: pred for pred in y_vald_pred_bag})
y_stack_matrix

Unnamed: 0,y_vald_0,y_vald_1,y_vald_2,y_vald_3,y_vald_4,y_vald_5,y_vald_6,y_vald_7,y_vald_8
0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1
3,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
1120,0,0,0,0,0,0,0,0,0
1121,0,0,0,0,0,0,0,0,0
1122,0,0,0,0,0,0,0,0,0
1123,0,0,0,0,0,0,0,0,0


The predictions on the validation set are appended as extra feature columns to the validation feature set, producing the augmented feature matrix on which the meta-classifier is trained.

In [3191]:
# Both sides get a fresh 0..n-1 index so that rows are paired by position: X_vald still carries
# the original row labels, while the prediction Series come back with a default integer index.
X_stack_aug_vald = pd.concat([X_vald.reset_index(drop=True), y_stack_matrix.reset_index(drop=True)], axis=1)
X_stack_aug_vald

Unnamed: 0,Contract_Month-to-month,tenure,OnlineSecurity_No,TechSupport_No,InternetService_Fiber optic,Contract_Two year,PaymentMethod_Electronic check,OnlineBackup_No,DeviceProtection_No,y_vald_0,y_vald_1,y_vald_2,y_vald_3,y_vald_4,y_vald_5,y_vald_6,y_vald_7,y_vald_8
0,False,0.802817,False,False,False,False,False,True,True,0,0,0,0,0,0,0,0,0
1,False,0.746479,False,True,False,True,False,False,True,0,0,0,0,0,0,0,0,0
2,True,0.154930,False,True,True,False,True,True,True,1,1,1,1,1,1,1,1,1
3,False,0.746479,False,False,False,False,False,False,True,0,0,0,0,0,0,0,0,0
4,True,0.042254,False,False,True,False,False,False,True,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,True,0.295775,True,False,False,False,True,False,True,0,0,0,0,0,0,0,0,0
1121,False,1.000000,True,False,True,True,True,False,False,0,0,0,0,0,0,0,0,0
1122,False,0.985915,True,False,True,True,True,False,True,0,0,0,0,0,0,0,0,0
1123,False,0.352113,False,False,False,False,False,False,False,0,0,0,0,0,0,0,0,0


Meta-classifier (an instance of `PyLogReg`) will be trained on the augmented validation set.

In [3192]:
# The meta-classifier has its own step size and iteration budget, and learns from the
# validation features plus the base classifiers' validation predictions.
metaclf = PyLogReg (alpha=meta_alpha, max_iter=meta_max_iter)
metaclf.fit (X_stack_aug_vald, y_vald)

The original testing set will be augmented with base classifier (bagged LR) predictions

In [3193]:
# Base-classifier predictions on the test split, without the majority-vote column,
# appended position by position to the test features.
base_test_preds = y_vote_matrix.drop(columns=['y_vote']).reset_index(drop=True)
X_stack_aug_test = pd.concat([X_test.reset_index(drop=True), base_test_preds], axis=1)
X_stack_aug_test

Unnamed: 0,Contract_Month-to-month,tenure,OnlineSecurity_No,TechSupport_No,InternetService_Fiber optic,Contract_Two year,PaymentMethod_Electronic check,OnlineBackup_No,DeviceProtection_No,y_0,y_1,y_2,y_3,y_4,y_5,y_6,y_7,y_8
0,False,0.929577,False,False,True,False,False,True,False,0,0,0,0,0,0,0,0,0
1,False,0.760563,False,False,False,True,False,False,False,0,0,0,0,0,0,0,0,0
2,False,0.718310,False,False,False,True,False,False,False,0,0,0,0,0,0,0,0,0
3,True,0.042254,True,True,False,False,True,True,True,1,1,1,1,1,1,1,1,1
4,False,0.535211,False,False,False,True,False,False,False,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1402,False,0.915493,False,False,False,True,False,True,False,0,0,0,0,0,0,0,0,0
1403,True,0.000000,True,True,False,False,False,True,True,0,0,0,0,0,0,0,0,0
1404,True,0.056338,True,True,False,False,True,False,True,1,0,0,0,1,1,0,0,1
1405,False,0.478873,False,False,False,False,False,False,False,0,0,0,0,0,0,0,0,0


Lastly, the meta-classifier will be run on the augmented testing set.

In [3194]:
# Hard predictions and probabilities of the meta-classifier on the augmented test set;
# the probabilities are kept for the threshold-free metrics computed later.
y_stack = metaclf.predict(X_stack_aug_test)
y_stack_proba = metaclf.predict_proba(X_stack_aug_test)

accuracy = accuracy_score(y_test, y_stack)
print(f"Accuracy of stacking with meta-classifier: {accuracy:.6f}")

Accuracy of stacking with meta-classifier: 0.805259


## Performance Evaluation

#### 1. Preparing performance metrics

In [3195]:
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, precision_score, f1_score, roc_auc_score, precision_recall_curve, auc

def specificity (y_test, y_pred):
    """Return the true-negative rate TN / (TN + FP) taken from the binary confusion matrix."""
    true_neg, false_pos, _false_neg, _true_pos = confusion_matrix(y_test, y_pred).ravel()
    actual_negatives = true_neg + false_pos
    return true_neg / actual_negatives

def aupr (y_test, y_pred_proba):
    """Return the area under the precision-recall curve, with recall on the x-axis."""
    precisions, recalls, _thresholds = precision_recall_curve(y_test, y_pred_proba)
    return auc (recalls, precisions)

# Registry of the reported metrics: display name -> callable taking (true labels, predictions).
# Insertion order determines the column order of the results table.
performance_metrics = {
    'Accuracy' : accuracy_score,
    'Sensitivity' : recall_score,
    'Specificity': specificity,
    'Precision': precision_score,
    'F1-score': f1_score,
    'AUROC' : roc_auc_score,
    'AUPR' : aupr,
}

# Metrics that are fed predicted probabilities instead of hard 0/1 predictions.
need_proba = ['AUROC', 'AUPR']

def all_perf_metrics (y_test, y_pred, y_pred_proba):
    """Evaluate every registered metric and return the results as ``{metric name: value}``.

    Metrics listed in ``need_proba`` are computed from `y_pred_proba`; all
    others are computed from the hard predictions `y_pred`.
    """
    return {
        name: metric(y_test, y_pred_proba if name in need_proba else y_pred)
        for name, metric in performance_metrics.items()
    }

#### 2. Compiling metrics for average of bagged LR

In [3196]:
# Empty results table: one column per metric, rows labelled by method name.
metric_results_comp = pd.DataFrame (columns=performance_metrics.keys(), index=pd.Index([], name='Method'))

# One temporary row per bagged classifier ...
for i in range(len(y_pred_bag)):
    metric_results_comp.loc[f'LR_{i}'] = all_perf_metrics (y_test, y_pred_bag[i], y_pred_proba_bag[i])
# ... summarised as "mean ± std" text. The right-hand side is evaluated before the 'LR' row
# exists, so the statistics cover the individual LR_i rows only.
metric_results_comp.loc['LR'] = [f"{m:.6f} ± {s:.4f}" for m, s in zip(metric_results_comp.mean(axis=0), metric_results_comp.std(axis=0))]
# Keep just the summary row; the per-classifier rows are dropped.
metric_results_comp = metric_results_comp.loc[['LR']]


#### 3. Compiling metrics for ensembling methods

In [3197]:
# Add one row per ensembling method; each uses its own hard predictions and probability scores.
metric_results_comp.loc['Voting ensemble'] = all_perf_metrics (y_test, y_vote, y_vote_proba)
metric_results_comp.loc['Stacking ensemble'] = all_perf_metrics (y_test, y_stack, y_stack_proba)

metric_results_comp

Unnamed: 0_level_0,Accuracy,Sensitivity,Specificity,Precision,F1-score,AUROC,AUPR
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LR,0.801943 ± 0.0021,0.544959 ± 0.0102,0.892628 ± 0.0045,0.641830 ± 0.0069,0.589357 ± 0.0053,0.847542 ± 0.0017,0.660907 ± 0.0043
Voting ensemble,0.804549,0.544959,0.896154,0.649351,0.592593,0.848201,0.662903
Stacking ensemble,0.805259,0.544959,0.897115,0.651466,0.593472,0.848647,0.656948
