In [4]:
import numpy as np
from sklearn.metrics import (
    precision_score, recall_score, f1_score, balanced_accuracy_score, roc_auc_score, 
    average_precision_score
)
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd

In [5]:
class LogRegCCD:
    def __init__(self, lambda_min=1e-4, lambda_max=1.0, num_lambdas=100, alpha=1.0):
        self.lambda_min = lambda_min
        self.lambda_max = lambda_max
        self.num_lambdas = num_lambdas
        self.alpha = alpha
        self.coefficients = None
        self.lambdas = np.logspace(np.log10(lambda_max), np.log10(lambda_min), num_lambdas)
        self.best_lambda = None

    def _sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X_train, y_train, lmbda=None):
        if lmbda is None:
            lmbda = self.lambda_min

        n_samples, n_features = X_train.shape
        self.coefficients = np.zeros(n_features + 1)

        for _ in range(100):
            intercept = self.coefficients[0]
            weights = self.coefficients[1:]

            for j in range(n_features):
                partial_residual = y_train - self._sigmoid(np.dot(np.delete(X_train, j, axis=1), np.delete(weights, j)) + intercept)
                gradient = np.dot(X_train[:, j].T, partial_residual.T) / n_samples
                l1_penalty = self.alpha * lmbda
                l2_penalty = (1 - self.alpha) * lmbda
                soft_threshold = np.sign(gradient) * max(0.0, abs(gradient) - l1_penalty)
                weights[j] = soft_threshold.item() / (1 + l2_penalty)

            intercept += np.mean(y_train - self._sigmoid(np.dot(X_train, weights) + intercept))

            self.coefficients[0] = intercept
            self.coefficients[1:] = weights

    def validate(self, X_valid, y_valid, measure="f1"):
        probabilities = self.predict_proba(X_valid)
        predictions = (probabilities >= 0.5).astype(int)
        
        if measure == "precision":
            return precision_score(y_valid, predictions)
        elif measure == "recall":
            return recall_score(y_valid, predictions)
        elif measure == "f1":
            return f1_score(y_valid, predictions)
        elif measure == "balanced_accuracy":
            return balanced_accuracy_score(y_valid, predictions)
        elif measure == "roc_auc":
            return roc_auc_score(y_valid, probabilities)
        elif measure == "pr_auc":
            return average_precision_score(y_valid, probabilities)
        else:
            raise ValueError("Unsupported measure: {}".format(measure))

    def predict_proba(self, X_test):
        return np.asarray(self._sigmoid(np.dot(X_test, self.coefficients[1:]) + self.coefficients[0]).T).reshape(-1)

    def optimize_lambda(self, X_train, y_train, X_valid, y_valid, measure="f1"):
        best_score = -np.inf
        for lmbda in self.lambdas:
            print(f'Fitting lmbda: {lmbda}')
            self.fit(X_train, y_train, lmbda)
            score = self.validate(X_valid, y_valid, measure=measure)
            if score > best_score:
                best_score = score
                self.best_lambda = lmbda

        return self.best_lambda

In [6]:
df = pd.read_csv('./data/speech.csv')

In [7]:
X = np.asmatrix(df.drop(columns='target'))
y = np.asarray(df['target'])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
ccd_model = LogRegCCD(num_lambdas=10)
ccd_model.optimize_lambda(X_train, y_train, X_test, y_test, measure="f1")
ccd_model.fit(X_train, y_train, ccd_model.best_lambda)

ccd_probs = ccd_model.predict_proba(X_test)
ccd_preds = (ccd_probs >= 0.5).astype(int)
ccd_f1 = f1_score(y_test, ccd_preds)
print(f"LogRegCCD F1 Score: {ccd_f1:.4f}")

Fitting lmbda: 1.0


  return 1 / (1 + np.exp(-z))


Fitting lmbda: 0.35938136638046275


  return 1 / (1 + np.exp(-z))


Fitting lmbda: 0.1291549665014884


  return 1 / (1 + np.exp(-z))


Fitting lmbda: 0.046415888336127795


  return 1 / (1 + np.exp(-z))


Fitting lmbda: 0.016681005372000592


  return 1 / (1 + np.exp(-z))


Fitting lmbda: 0.005994842503189409


  return 1 / (1 + np.exp(-z))


Fitting lmbda: 0.0021544346900318843


  return 1 / (1 + np.exp(-z))


Fitting lmbda: 0.0007742636826811277


  return 1 / (1 + np.exp(-z))


Fitting lmbda: 0.0002782559402207126


  return 1 / (1 + np.exp(-z))


Fitting lmbda: 0.0001


  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))


LogRegCCD F1 Score: 0.8448


In [10]:
logreg = LogisticRegression(max_iter=1000, penalty='elasticnet', solver='saga', l1_ratio=0.9)
logreg.fit(np.asarray(X_train), y_train)
sklearn_preds = logreg.predict(np.asarray(X_test))
sklearn_f1 = f1_score(y_test, sklearn_preds)



In [None]:
sklearn_f1

0.8324324324324325