In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

csv_path = "/data1/home/vedantb/.cache/kagglehub/datasets/blastchar/telco-customer-churn/versions/1/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:

# Encode target variable
# Encode target variable
le = LabelEncoder()
df['Churn'] = le.fit_transform(df['Churn'])

# Drop customerID (not useful for modeling)
df = df.drop('customerID', axis=1)

# Identify all object (categorical) columns except the target
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
if 'Churn' in categorical_cols:
    categorical_cols.remove('Churn')   # Target column

# One-hot encode all categorical columns
df = pd.get_dummies(df, columns=categorical_cols)

# Split features and target
X = df.drop('Churn', axis=1)
y = df['Churn']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [3]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [4]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [5]:
from sklearn.metrics import accuracy_score, roc_auc_score# ...existing code...
from sklearn.metrics import accuracy_score, roc_auc_score

# Logistic Regression predictions
y_train_pred_lr = lr.predict(X_train)
y_test_pred_lr = lr.predict(X_test)
y_train_proba_lr = lr.predict_proba(X_train)[:, 1]
y_test_proba_lr = lr.predict_proba(X_test)[:, 1]

# XGBoost predictions
y_train_pred_xgb = xgb.predict(X_train)
y_test_pred_xgb = xgb.predict(X_test)
y_train_proba_xgb = xgb.predict_proba(X_train)[:, 1]
y_test_proba_xgb = xgb.predict_proba(X_test)[:, 1]

# Logistic Regression scores
print("Logistic Regression:")
print("Train Accuracy:", accuracy_score(y_train, y_train_pred_lr))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred_lr))
print("Train AUC-ROC:", roc_auc_score(y_train, y_train_proba_lr))
print("Test AUC-ROC:", roc_auc_score(y_test, y_test_proba_lr))

# XGBoost scores
print("\nXGBoost Classifier:")
print("Train Accuracy:", accuracy_score(y_train, y_train_pred_xgb))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred_xgb))
print("Train AUC-ROC:", roc_auc_score(y_train, y_train_proba_xgb))
print("Test AUC-ROC:", roc_auc_score(y_test, y_test_proba_xgb))
# ...existing code...


Logistic Regression:
Train Accuracy: 0.8755768548100816
Test Accuracy: 0.8232789212207239
Train AUC-ROC: 0.930967274097125
Test AUC-ROC: 0.860862825675158

XGBoost Classifier:
Train Accuracy: 0.9359247426340078
Test Accuracy: 0.7863733144073811
Train AUC-ROC: 0.9836326062346377
Test AUC-ROC: 0.8273895783949403


In [6]:
import numpy as np

class LogisticRegressionScratch:
    def __init__(self, lr=0.01, n_iter=1000):
        self.lr = lr
        self.n_iter = n_iter

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
        self.theta = np.zeros(X.shape[1])
        for _ in range(self.n_iter):
            z = np.dot(X, self.theta)
            h = self.sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.lr * gradient

    def predict_proba(self, X):
        X = np.c_[np.ones(X.shape[0]), X]
        return self.sigmoid(np.dot(X, self.theta))

    def predict(self, X):
        return self.predict_proba(X) >= 0.5


In [7]:
import numpy as np

class DecisionStump:
    def fit(self, X, y):
        m, n = X.shape
        self.feature = 0
        self.threshold = 0
        self.left_value = 0
        self.right_value = 0
        min_error = float('inf')
        for feature in range(n):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left = y[X[:, feature] <= threshold]
                right = y[X[:, feature] > threshold]
                left_pred = np.mean(left) if len(left) > 0 else 0
                right_pred = np.mean(right) if len(right) > 0 else 0
                preds = np.where(X[:, feature] <= threshold, left_pred, right_pred)
                error = np.mean((y - preds) ** 2)
                if error < min_error:
                    min_error = error
                    self.feature = feature
                    self.threshold = threshold
                    self.left_value = left_pred
                    self.right_value = right_pred

    def predict(self, X):
        return np.where(X[:, self.feature] <= self.threshold, self.left_value, self.right_value)

class SimpleGradientBoosting:
    def __init__(self, n_estimators=10, learning_rate=0.1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.stumps = []

    def fit(self, X, y):
        y_pred = np.zeros_like(y, dtype=float)
        for _ in range(self.n_estimators):
            residual = y - y_pred
            stump = DecisionStump()
            stump.fit(X, residual)
            update = stump.predict(X)
            y_pred += self.learning_rate * update
            self.stumps.append(stump)

    def predict(self, X):
        y_pred = np.zeros(X.shape[0])
        for stump in self.stumps:
            y_pred += self.learning_rate * stump.predict(X)
        return (y_pred > 0.5).astype(int)

    def predict_proba(self, X):
        y_pred = np.zeros(X.shape[0])
        for stump in self.stumps:
            y_pred += self.learning_rate * stump.predict(X)
        return y_pred

In [8]:
# ...existing code...

import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score
import pandas as pd

# Ensure correct dtype for scratch models
X_train_np = X_train.values.astype(float)
X_test_np = X_test.values.astype(float)
y_train_np = y_train.values.astype(float)
y_test_np = y_test.values.astype(float)

# Logistic Regression from scratch
lr_scratch = LogisticRegressionScratch(lr=0.01, n_iter=1000)
lr_scratch.fit(X_train_np, y_train_np)
y_train_pred_lr_s = lr_scratch.predict(X_train_np).astype(int)
y_test_pred_lr_s = lr_scratch.predict(X_test_np).astype(int)
y_train_proba_lr_s = lr_scratch.predict_proba(X_train_np)
y_test_proba_lr_s = lr_scratch.predict_proba(X_test_np)

# Simple Gradient Boosting from scratch
gb_scratch = SimpleGradientBoosting(n_estimators=10, learning_rate=0.1)
gb_scratch.fit(X_train_np, y_train_np)
y_train_pred_gb_s = gb_scratch.predict(X_train_np)
y_test_pred_gb_s = gb_scratch.predict(X_test_np)
y_train_proba_gb_s = gb_scratch.predict_proba(X_train_np)
y_test_proba_gb_s = gb_scratch.predict_proba(X_test_np)

# Collect results
results = [
    ["Model", "Train Accuracy", "Test Accuracy", "Train AUC-ROC", "Test AUC-ROC"],
    ["LogisticRegressionScratch",
     accuracy_score(y_train_np, y_train_pred_lr_s),
     accuracy_score(y_test_np, y_test_pred_lr_s),
     roc_auc_score(y_train_np, y_train_proba_lr_s),
     roc_auc_score(y_test_np, y_test_proba_lr_s)],
    ["SimpleGradientBoosting",
     accuracy_score(y_train_np, y_train_pred_gb_s),
     accuracy_score(y_test_np, y_test_pred_gb_s),
     roc_auc_score(y_train_np, y_train_proba_gb_s),
     roc_auc_score(y_test_np, y_test_proba_gb_s)]
]

# Print as table
df_results = pd.DataFrame(results[1:], columns=results[0])
print(df_results)

                       Model  Train Accuracy  Test Accuracy  Train AUC-ROC  \
0  LogisticRegressionScratch        0.562123       0.577005       0.810497   
1     SimpleGradientBoosting        0.734469       0.735273       0.823712   

   Test AUC-ROC  
0      0.839156  
1      0.838364  


In [9]:
# ...existing code...

from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'max_depth': [3, 4, 5, 6],
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2]
}

xgb_base = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

# Best estimator
xgb_tuned = grid_search.best_estimator_

# Evaluate tuned model
y_train_pred_xgb_tuned = xgb_tuned.predict(X_train)
y_test_pred_xgb_tuned = xgb_tuned.predict(X_test)
y_train_proba_xgb_tuned = xgb_tuned.predict_proba(X_train)[:, 1]
y_test_proba_xgb_tuned = xgb_tuned.predict_proba(X_test)[:, 1]

# Update results table
results.append([
    "XGBoost (Tuned)",
    accuracy_score(y_train, y_train_pred_xgb_tuned),
    accuracy_score(y_test, y_test_pred_xgb_tuned),
    roc_auc_score(y_train, y_train_proba_xgb_tuned),
    roc_auc_score(y_test, y_test_proba_xgb_tuned)
])

# Print updated table
df_results = pd.DataFrame(results[1:], columns=results[0])
print(df_results)
# ...existing code...

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


                       Model  Train Accuracy  Test Accuracy  Train AUC-ROC  \
0  LogisticRegressionScratch        0.562123       0.577005       0.810497   
1     SimpleGradientBoosting        0.734469       0.735273       0.823712   
2            XGBoost (Tuned)        0.812034       0.814053       0.866327   

   Test AUC-ROC  
0      0.839156  
1      0.838364  
2      0.861701  
