# Psuedo $R^2$ for logistic regression, no tears

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numpy.random import binomial, normal
from scipy.stats import bernoulli, binom

np.random.seed(37)
sns.set(color_codes=True)

n = 10000
X = np.hstack([
    np.array([1 for _ in range(n)]).reshape(n, 1), 
    normal(0.0, 1.0, n).reshape(n, 1), 
    normal(0.0, 1.0, n).reshape(n, 1)
])
z = np.dot(X, np.array([1.0, 2.0, 3.0])) + normal(0.0, 1.0, n)
p = 1.0 / (1.0 + np.exp(-z))
y = binom.rvs(1, p)

In [2]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(fit_intercept=False)
lr.fit(X, y)

w = np.array(lr.coef_).transpose()
y_pred = lr.predict_proba(X)[:, 1]

print(lr.coef_)

[[0.89307796 1.71431569 2.59083718]]


$R^2 = 1 - \frac{\sum (y_i - \pi_i)^2}{\sum (y_i - \bar{y})^2}$

* $y_i$ is the i-th outcome label (e.g. 1 or 0)
* $\pi_i$ is the i-th predicted outcome probability
* $\bar{y}$ is the expected value of the observed outcomes $y = [y_1, \ldots, y_n]$

In [3]:
def efron_rsquare(y, y_pred):
    n = float(len(y))
    t1 = np.sum(np.power(y - y_pred, 2.0))
    t2 = np.sum(np.power((y - (np.sum(y) / n)), 2.0))
    return 1.0 - (t1 / t2)

In [4]:
efron_rsquare(y, y_pred)

0.5513984238650347

$R^2 = 1 - \frac{\ln \hat{L}_{full}}{\ln \hat{L}_{null}}$

* $\hat{L}_{full}$ is the estimated likelihood of the full model
* $\hat{L}_{null}$ is the estimated likelihood of the null model (model with only intercept)

In [7]:
def log_likelihood(w, X, y):
    score = np.dot(X, w).reshape(1, X.shape[0])
    return np.sum(-np.log(1 + np.exp(score))) + np.sum(y * score)

def null_log_likelihood(w, X, y):
    z = np.array([w if i == 0 else 0.0 for i, w in enumerate(w.reshape(1, X.shape[1])[0])]).reshape(X.shape[1], 1)
    score = np.dot(X, z).reshape(1, X.shape[0])
    return np.sum(-np.log(1 + np.exp(score))) + np.sum(y * score)

def mcfadden_rsquare(w, X, y):
    return 1.0 - (log_likelihood(w, X, y) / null_log_likelihood(w, X, y))

def mcfadden_adjusted_rsquare(w, X, y):
    k = float(X.shape[1])
    return 1.0 - ((log_likelihood(w, X, y) - k) / null_log_likelihood(w, X, y))

In [8]:
mcfadden_rsquare(w, X, y)

0.5173802601449244

$R^2 = 1 - \frac{\ln \hat{L}_{full} - K}{\ln \hat{L}_{null}}$

* $\hat{L}_{full}$ is the estimated likelihood of the full model
* $\hat{L}_{null}$ is the estimated likelihood of the null model (model with only intercept)
* $K$ is the number of parameters (e.g. number of covariates associated with non-zero coefficients)

In [9]:
mcfadden_adjusted_rsquare(w, X, y)

0.516956038921871

In [45]:
def get_num_correct(y, y_pred, t=0.5):
    y_correct = np.array([0.0 if p < t else 1.0 for p in y_pred])
    return sum([1.0 for p, p_pred in zip(y, y_correct) if p == p_pred])

def count_rsquare(y, y_pred, t=0.5):
    n = float(len(y))
    num_correct = get_num_correct(y, y_pred, t)
    return num_correct / n

In [46]:
count_rsquare(y, y_pred)

0.8469

In [49]:
def get_count_most_freq_outcome(y):
    num_0 = 0
    num_1 = 0
    for p in y:
        if p == 1.0:
            num_1 += 1
        else:
            num_0 += 1
    return float(max(num_0, num_1))

def count_adjusted_rsquare(y, y_pred, t=0.5):
    correct = get_num_correct(y, y_pred, t)
    total = float(len(y))
    n = get_count_most_freq_outcome(y)
    return (correct - n) / (total - n)

In [51]:
count_adjusted_rsquare(y, y_pred)

0.6243866535819431

# References

* [FAQ: What are pseudo r-squareds?](https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faq-what-are-pseudo-r-squareds/)
* [Logistic Regression](http://www.stat.cmu.edu/~cshalizi/uADA/12/lectures/ch12.pdf)
* [Measures of fit for logistic regression](https://support.sas.com/resources/papers/proceedings14/1485-2014.pdf)
* [A comparison of logistic pseudo $R^2$ indices](http://www.glmj.org/archives/articles/Smith_v39n2.pdf)

# Take a Look!

Take a look at []().