# Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from sklearn.linear_model import LogisticRegression

# Generate synthetic data
np.random.seed(0)
n_samples = 100
X = np.random.rand(n_samples, 3)  # 3 predictors
y = np.random.binomial(1, p=0.5, size=n_samples)  # Binary target variable

In [2]:
# Add a constant (intercept term) to the model
X_with_const = sm.add_constant(X)

# Fit the logistic regression model using statsmodels
model_sm = sm.Logit(y, X_with_const)
result_sm = model_sm.fit(disp=False)

In [9]:
# Extract the coefficients
coefficients_sm = result_sm.params

# Calculate the standard errors from the covariance matrix
cov_matrix_sm = result_sm.cov_params()
standard_errors_sm = np.sqrt(np.diag(cov_matrix_sm))

# Calculate Z-scores
z_scores_sm = coefficients_sm / standard_errors_sm

# Calculate p-values
p_values_sm = 2 * (1 - stats.norm.cdf(np.abs(z_scores_sm)))

# Create a DataFrame to store the results
results_df_sm = pd.DataFrame({
    'Coefficient': coefficients_sm,
    'Standard Error': standard_errors_sm,
    'Z-score': z_scores_sm,
    'P-value': p_values_sm
})

print("Calculated Results Using Statsmodels:")
display(results_df_sm)

Calculated Results Using Statsmodels:


Unnamed: 0,Coefficient,Standard Error,Z-score,P-value
0,-0.429624,0.63248,-0.679268,0.496968
1,0.003549,0.688721,0.005152,0.995889
2,0.608779,0.696332,0.874265,0.381974
3,-0.135473,0.730872,-0.185357,0.852949


In [10]:
# Compare with statsmodels summary output
summary_df_sm = result_sm.summary2().tables[1]
summary_df_sm = summary_df_sm[['Coef.', 'Std.Err.', 'z', 'P>|z|']]

# Rename columns for comparison
summary_df_sm.columns = ['Coefficient', 'Standard Error', 'Z-score', 'P-value']

print("Statsmodels Summary Results:")
display(summary_df_sm)

Statsmodels Summary Results:


Unnamed: 0,Coefficient,Standard Error,Z-score,P-value
const,-0.429624,0.63248,-0.679268,0.496968
x1,0.003549,0.688721,0.005152,0.995889
x2,0.608779,0.696332,0.874265,0.381974
x3,-0.135473,0.730872,-0.185357,0.852949


In [11]:
# Fit the logistic regression model using scikit-learn
model_sk = LogisticRegression(solver='lbfgs')
model_sk.fit(X, y)

In [12]:
# Extract the coefficients
coefficients_sk = np.concatenate([[model_sk.intercept_[0]], model_sk.coef_.flatten()])

# Add a constant (intercept term) to the design matrix
X_with_const_sk = np.hstack([np.ones((X.shape[0], 1)), X])

# Calculate the predicted probabilities
pred_probs_sk = model_sk.predict_proba(X)[:, 1]

# Calculate the diagonal matrix of weights
W = np.diag(pred_probs_sk * (1 - pred_probs_sk))

# Calculate the variance-covariance matrix
cov_matrix_sk = np.linalg.inv(X_with_const_sk.T @ W @ X_with_const_sk)

# Calculate the standard errors
standard_errors_sk = np.sqrt(np.diag(cov_matrix_sk))

# Calculate Z-scores
z_scores_sk = coefficients_sk / standard_errors_sk

# Calculate p-values
p_values_sk = 2 * (1 - stats.norm.cdf(np.abs(z_scores_sk)))

# Create a DataFrame to store the results
results_df_sk = pd.DataFrame({
    'Coefficient': coefficients_sk,
    'Standard Error': standard_errors_sk,
    'Z-score': z_scores_sk,
    'P-value': p_values_sk
})

print("Calculated Results Using Sklearn:")
display(results_df_sk)

Calculated Results Using Sklearn:


Unnamed: 0,Coefficient,Standard Error,Z-score,P-value
0,-0.350217,0.630858,-0.555143,0.578797
1,-5.5e-05,0.687176,-8.1e-05,0.999936
2,0.413576,0.693596,0.596278,0.55099
3,-0.10045,0.729564,-0.137685,0.890489


In [13]:
# Compare the two DataFrames
comparison_df = results_df_sm.join(results_df_sk, lsuffix='_sm', rsuffix='_sk')

print("Comparison of Results from Statsmodels and Sklearn:")
display(comparison_df)

Comparison of Results from Statsmodels and Sklearn:


Unnamed: 0,Coefficient_sm,Standard Error_sm,Z-score_sm,P-value_sm,Coefficient_sk,Standard Error_sk,Z-score_sk,P-value_sk
0,-0.429624,0.63248,-0.679268,0.496968,-0.350217,0.630858,-0.555143,0.578797
1,0.003549,0.688721,0.005152,0.995889,-5.5e-05,0.687176,-8.1e-05,0.999936
2,0.608779,0.696332,0.874265,0.381974,0.413576,0.693596,0.596278,0.55099
3,-0.135473,0.730872,-0.185357,0.852949,-0.10045,0.729564,-0.137685,0.890489


# Linear Regression

In [14]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from sklearn.linear_model import LinearRegression

# Generate synthetic data
np.random.seed(0)
n_samples = 100
X = np.random.rand(n_samples, 3)  # 3 predictors
y = 3 + 2*X[:, 0] + -1*X[:, 1] + 0.5*X[:, 2] + np.random.randn(n_samples)  # Continuous target variable

In [15]:
# Add a constant (intercept term) to the model
X_with_const = sm.add_constant(X)

# Fit the linear regression model using statsmodels
model_sm = sm.OLS(y, X_with_const)
result_sm = model_sm.fit()

In [17]:
# Extract the coefficients
coefficients_sm = result_sm.params

# Calculate the standard errors from the covariance matrix
cov_matrix_sm = result_sm.cov_params()
standard_errors_sm = np.sqrt(np.diag(cov_matrix_sm))

# Calculate t-scores
t_scores_sm = coefficients_sm / standard_errors_sm

# Calculate p-values
p_values_sm = 2 * (1 - stats.t.cdf(np.abs(t_scores_sm), df=n_samples - X_with_const.shape[1]))

# Create a DataFrame to store the results
results_df_sm = pd.DataFrame({
    'Coefficient': coefficients_sm,
    'Standard Error': standard_errors_sm,
    'T-score': t_scores_sm,
    'P-value': p_values_sm
})

print("Calculated Results Using Statsmodels:")
display(results_df_sm)

Calculated Results Using Statsmodels:


Unnamed: 0,Coefficient,Standard Error,T-score,P-value
0,2.950214,0.297073,9.930943,2.220446e-16
1,1.591325,0.323915,4.912787,3.688347e-06
2,-1.020043,0.326421,-3.124935,0.002352465
3,0.753231,0.344119,2.188869,0.03103116


In [18]:
# Compare with statsmodels summary output
summary_df_sm = result_sm.summary2().tables[1]
summary_df_sm = summary_df_sm[['Coef.', 'Std.Err.', 't', 'P>|t|']]

# Rename columns for comparison
summary_df_sm.columns = ['Coefficient', 'Standard Error', 'T-score', 'P-value']

print("Statsmodels Summary Results:")
display(summary_df_sm)

Statsmodels Summary Results:


Unnamed: 0,Coefficient,Standard Error,T-score,P-value
const,2.950214,0.297073,9.930943,2.092662e-16
x1,1.591325,0.323915,4.912787,3.688347e-06
x2,-1.020043,0.326421,-3.124935,0.002352465
x3,0.753231,0.344119,2.188869,0.03103116


In [19]:
# Fit the linear regression model using scikit-learn
model_sk = LinearRegression()
model_sk.fit(X, y)

In [20]:
# Extract the coefficients
coefficients_sk = np.concatenate([[model_sk.intercept_], model_sk.coef_])

# Calculate the residuals
y_pred_sk = model_sk.predict(X)
residuals_sk = y - y_pred_sk

# Calculate the residual sum of squares
rss_sk = np.sum(residuals_sk**2)

# Calculate the variance-covariance matrix
X_with_const_sk = np.hstack([np.ones((X.shape[0], 1)), X])
cov_matrix_sk = np.linalg.inv(X_with_const_sk.T @ X_with_const_sk) * (rss_sk / (n_samples - X_with_const_sk.shape[1]))

# Calculate the standard errors
standard_errors_sk = np.sqrt(np.diag(cov_matrix_sk))

# Calculate t-scores
t_scores_sk = coefficients_sk / standard_errors_sk

# Calculate p-values
p_values_sk = 2 * (1 - stats.t.cdf(np.abs(t_scores_sk), df=n_samples - X_with_const_sk.shape[1]))

# Create a DataFrame to store the results
results_df_sk = pd.DataFrame({
    'Coefficient': coefficients_sk,
    'Standard Error': standard_errors_sk,
    'T-score': t_scores_sk,
    'P-value': p_values_sk
})

print("Calculated Results Using Sklearn:")
display(results_df_sk)

Calculated Results Using Sklearn:


Unnamed: 0,Coefficient,Standard Error,T-score,P-value
0,2.950214,0.297073,9.930943,2.220446e-16
1,1.591325,0.323915,4.912787,3.688347e-06
2,-1.020043,0.326421,-3.124935,0.002352465
3,0.753231,0.344119,2.188869,0.03103116


In [22]:
# Compare the two DataFrames
comparison_df = results_df_sm.join(results_df_sk, lsuffix='_sm', rsuffix='_sk')

print("Comparison of Results from Statsmodels and Sklearn:")
display(comparison_df)

Comparison of Results from Statsmodels and Sklearn:


Unnamed: 0,Coefficient_sm,Standard Error_sm,T-score_sm,P-value_sm,Coefficient_sk,Standard Error_sk,T-score_sk,P-value_sk
0,2.950214,0.297073,9.930943,2.220446e-16,2.950214,0.297073,9.930943,2.220446e-16
1,1.591325,0.323915,4.912787,3.688347e-06,1.591325,0.323915,4.912787,3.688347e-06
2,-1.020043,0.326421,-3.124935,0.002352465,-1.020043,0.326421,-3.124935,0.002352465
3,0.753231,0.344119,2.188869,0.03103116,0.753231,0.344119,2.188869,0.03103116


# Poisson Regression

In [23]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from sklearn.linear_model import PoissonRegressor

# Generate synthetic data
np.random.seed(0)
n_samples = 100
X = np.random.rand(n_samples, 3)  # 3 predictors
y = np.random.poisson(lam=np.exp(1 + 0.5*X[:, 0] - 0.2*X[:, 1] + 0.3*X[:, 2]))  # Poisson-distributed target variable

In [24]:
# Add a constant (intercept term) to the model
X_with_const = sm.add_constant(X)

# Fit the Poisson regression model using statsmodels
model_sm = sm.GLM(y, X_with_const, family=sm.families.Poisson())
result_sm = model_sm.fit()

In [25]:
# Extract the coefficients
coefficients_sm = result_sm.params

# Calculate the standard errors from the covariance matrix
cov_matrix_sm = result_sm.cov_params()
standard_errors_sm = np.sqrt(np.diag(cov_matrix_sm))

# Calculate Z-scores
z_scores_sm = coefficients_sm / standard_errors_sm

# Calculate p-values
p_values_sm = 2 * (1 - stats.norm.cdf(np.abs(z_scores_sm)))

# Create a DataFrame to store the results
results_df_sm = pd.DataFrame({
    'Coefficient': coefficients_sm,
    'Standard Error': standard_errors_sm,
    'Z-score': z_scores_sm,
    'P-value': p_values_sm
})

print("Calculated Results Using Statsmodels:")
display(results_df_sm)

Calculated Results Using Statsmodels:


Unnamed: 0,Coefficient,Standard Error,Z-score,P-value
0,1.219651,0.165116,7.386625,1.505462e-13
1,0.720303,0.179761,4.007009,6.149243e-05
2,-0.50449,0.180362,-2.797097,0.005156402
3,-0.093011,0.186463,-0.49882,0.617906


In [26]:
# Compare with statsmodels summary output
summary_df_sm = result_sm.summary2().tables[1]
summary_df_sm = summary_df_sm[['Coef.', 'Std.Err.', 'z', 'P>|z|']]

# Rename columns for comparison
summary_df_sm.columns = ['Coefficient', 'Standard Error', 'Z-score', 'P-value']

print("Statsmodels Summary Results:")
display(summary_df_sm)

Statsmodels Summary Results:


Unnamed: 0,Coefficient,Standard Error,Z-score,P-value
const,1.219651,0.165116,7.386625,1.506028e-13
x1,0.720303,0.179761,4.007009,6.149243e-05
x2,-0.50449,0.180362,-2.797097,0.005156402
x3,-0.093011,0.186463,-0.49882,0.617906


In [27]:
# Fit the Poisson regression model using scikit-learn
model_sk = PoissonRegressor(alpha=0)
model_sk.fit(X, y)

In [28]:
# Extract the coefficients
coefficients_sk = np.concatenate([[model_sk.intercept_], model_sk.coef_])

# Add a constant (intercept term) to the design matrix
X_with_const_sk = np.hstack([np.ones((X.shape[0], 1)), X])

# Calculate the predicted means
pred_means_sk = model_sk.predict(X)

# Calculate the diagonal matrix of weights
W = np.diag(pred_means_sk)

# Calculate the variance-covariance matrix
cov_matrix_sk = np.linalg.inv(X_with_const_sk.T @ W @ X_with_const_sk)

# Calculate the standard errors
standard_errors_sk = np.sqrt(np.diag(cov_matrix_sk))

# Calculate Z-scores
z_scores_sk = coefficients_sk / standard_errors_sk

# Calculate p-values
p_values_sk = 2 * (1 - stats.norm.cdf(np.abs(z_scores_sk)))

# Create a DataFrame to store the results
results_df_sk = pd.DataFrame({
    'Coefficient': coefficients_sk,
    'Standard Error': standard_errors_sk,
    'Z-score': z_scores_sk,
    'P-value': p_values_sk
})

print("Calculated Results Using Sklearn:")
display(results_df_sk)

Calculated Results Using Sklearn:


Unnamed: 0,Coefficient,Standard Error,Z-score,P-value
0,1.219651,0.165117,7.386588,1.505462e-13
1,0.72033,0.179762,4.00714,6.145848e-05
2,-0.50451,0.180363,-2.797191,0.005154908
3,-0.093039,0.186464,-0.498964,0.6178044


In [30]:
# Compare the two DataFrames
comparison_df = results_df_sm.join(results_df_sk, lsuffix='_sm', rsuffix='_sk')

print("\nComparison of Results from Statsmodels and Sklearn:")
display(comparison_df)


Comparison of Results from Statsmodels and Sklearn:


Unnamed: 0,Coefficient_sm,Standard Error_sm,Z-score_sm,P-value_sm,Coefficient_sk,Standard Error_sk,Z-score_sk,P-value_sk
0,1.219651,0.165116,7.386625,1.505462e-13,1.219651,0.165117,7.386588,1.505462e-13
1,0.720303,0.179761,4.007009,6.149243e-05,0.72033,0.179762,4.00714,6.145848e-05
2,-0.50449,0.180362,-2.797097,0.005156402,-0.50451,0.180363,-2.797191,0.005154908
3,-0.093011,0.186463,-0.49882,0.617906,-0.093039,0.186464,-0.498964,0.6178044


# Custom Functions

## Scikit-Learn

In [39]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LogisticRegression, LinearRegression, PoissonRegressor

def create_results_dataframe(params, standard_errors, scores, p_values, lower_ci, upper_ci, param_names):
    # Create and return a DataFrame with the parameters and statistics
    results_df = pd.DataFrame({
        'Parameters': param_names,
        'Estimate': params,
        'Standard Error': standard_errors,
        'Score': scores,
        'P-value': p_values,
        'Lower CI': lower_ci,
        'Upper CI': upper_ci
    })
    return results_df

def calculate_logistic_statistics(model, X, params, format_pvals, param_names):
    # Add a constant (intercept term) to the design matrix
    n = X.shape[0]
    X_with_const = np.hstack([np.ones((n, 1)), X])

    # Calculate predicted probabilities
    pred_probs = model.predict_proba(X)[:, 1]

    # Calculate the diagonal matrix of weights
    pred_var = pred_probs * (1 - pred_probs)
    W = np.diag(pred_var)

    # Calculate the variance-covariance matrix
    XTWX = X_with_const.T @ W @ X_with_const
    cov_matrix = np.linalg.inv(XTWX)

    # Calculate the standard errors
    standard_errors = np.sqrt(np.diag(cov_matrix))

    # Calculate z-scores
    z_scores = params / standard_errors

    # Calculate p-values
    p_values = 2 * (1 - stats.norm.cdf(np.abs(z_scores)))

    # Format p-values
    if format_pvals:
        p_values = pd.Series(p_values).apply(lambda x: f"{x:.6f}" if x >= 0.0001 else f"{x:.6e}")

    # Calculate confidence intervals
    lower_ci = params - 1.96 * standard_errors
    upper_ci = params + 1.96 * standard_errors

    # Create and return a DataFrame with the results
    created_results_df = create_results_dataframe(params, standard_errors, z_scores, p_values, lower_ci, upper_ci, param_names)
    return created_results_df

def calculate_poisson_statistics(model, X, params, format_pvals, param_names):
    # Add a constant (intercept term) to the design matrix
    n = X.shape[0]
    X_with_const = np.hstack([np.ones((n, 1)), X])

    # Calculate predicted means
    pred_means = model.predict(X)

    # Calculate the diagonal matrix of weights
    W = np.diag(pred_means)

    # Calculate the variance-covariance matrix
    XTWX = X_with_const.T @ W @ X_with_const
    cov_matrix = np.linalg.inv(XTWX)

    # Calculate the standard errors
    standard_errors = np.sqrt(np.diag(cov_matrix))

    # Calculate z-scores
    z_scores = params / standard_errors

    # Calculate p-values
    p_values = 2 * (1 - stats.norm.cdf(np.abs(z_scores)))

    # Format p-values
    if format_pvals:
        p_values = pd.Series(p_values).apply(lambda x: f"{x:.6f}" if x >= 0.0001 else f"{x:.6e}")

    # Calculate confidence intervals
    lower_ci = params - 1.96 * standard_errors
    upper_ci = params + 1.96 * standard_errors

    # Create and return a DataFrame with the results
    created_results_df = create_results_dataframe(params, standard_errors, z_scores, p_values, lower_ci, upper_ci, param_names)
    return created_results_df

def calculate_linear_statistics(model, X, params, y, format_pvals, param_names):
    # Add a constant (intercept term) to the design matrix
    n = X.shape[0]
    X_with_const = np.hstack([np.ones((n, 1)), X])

    # Calculate residuals
    y_pred = model.predict(X)
    residuals = y - y_pred

    # Calculate the residual sum of squares
    rss = np.sum(residuals**2)

    # Calculate the inverse of X^T * X
    XTX = X_with_const.T @ X_with_const
    XTX_inv = np.linalg.inv(XTX)

    # Calculate the variance-covariance matrix
    p = X.shape[1]
    sigma_squared = rss / (n - p - 1)
    cov_matrix = XTX_inv * sigma_squared

    # Calculate the standard errors
    standard_errors = np.sqrt(np.diag(cov_matrix))

    # Calculate t-scores
    t_scores = params / standard_errors

    # Calculate p-values
    p_values = 2 * (1 - stats.t.cdf(np.abs(t_scores), df=n - p - 1))

    # Format p-values
    if format_pvals:
        p_values = pd.Series(p_values).apply(lambda x: f"{x:.6f}" if x >= 0.0001 else f"{x:.6e}")

    # Calculate confidence intervals
    critical_value = stats.t.ppf(0.975, df=n - p - 1)
    lower_ci = params - critical_value * standard_errors
    upper_ci = params + critical_value * standard_errors

    # Create and return the results DataFrame
    created_results_df = create_results_dataframe(params, standard_errors, t_scores, p_values, lower_ci, upper_ci, param_names)
    return created_results_df

In [40]:
def get_regression_statistics_sk(model, X, y, format_pvals=False):
    """
    Calculate regression statistics including params, standard errors, t-scores/z-scores, and p-values.

    Parameters:
    model (sklearn estimator): The already-fitted scikit-learn estimator.
    X (ndarray): The input feature matrix.
    y (ndarray): The target vector.
    format_pvals (bool): Whether to format p-values. Standard notation will be used if greater than 0.0001.

    Returns:
    pd.DataFrame: A DataFrame with params, standard errors, t-scores/z-scores, p-values, and confidence intervals.
    """

    # Get parameter names
    n_vars = X.shape[1]
    param_names = ['Intercept'] + [f'X{i}' for i in range(1, n_vars + 1)]

    # Extract parameters and the create model-specific DataFrame
    if isinstance(model, LogisticRegression):
        intercept = [model.intercept_[0]]
        coefs = model.coef_.flatten()
        params = np.concatenate([intercept, coefs])
        logistic_stats_df = calculate_logistic_statistics(model, X, params, format_pvals, param_names)
        return logistic_stats_df
    elif isinstance(model, PoissonRegressor):
        intercept = [model.intercept_]
        coefs = model.coef_.flatten()
        params = np.concatenate([intercept, coefs])
        poisson_stats_df = calculate_poisson_statistics(model, X, params, format_pvals, param_names)
        return poisson_stats_df
    elif isinstance(model, LinearRegression):
        intercept = [model.intercept_]
        coefs = model.coef_.flatten()
        params = np.concatenate([intercept, coefs])
        linear_stats_df = calculate_linear_statistics(model, X, params, y, format_pvals, param_names)
        return linear_stats_df
    else:
        raise ValueError("Invalid model type. Supported models are LogisticRegression, LinearRegression, and PoissonRegressor.")

In [41]:
# Generate synthetic data
np.random.seed(0)
n_samples = 100
X = np.random.rand(n_samples, 3)  # 3 predictors
y_binary = np.random.binomial(1, p=0.5, size=n_samples)  # Binary target variable
y_continuous = 3 + 2*X[:, 0] - 1*X[:, 1] + 0.5*X[:, 2] + np.random.randn(n_samples)  # Continuous target variable
y_poisson = np.random.poisson(lam=np.exp(1 + 0.5*X[:, 0] - 0.2*X[:, 1] + 0.3*X[:, 2]))  # Poisson-distributed target variable

# Set display options to control output format
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.width', 1000)  # Increase the width to prevent wrapping
pd.set_option('display.max_colwidth', None)  # Display full column width

In [42]:
# Logistic Regression
logit_model_sk = LogisticRegression(solver='lbfgs').fit(X, y_binary)
logit_results_sk = get_regression_statistics_sk(logit_model_sk, X, y_binary)
print("Logistic Regression Results:")
display(logit_results_sk)

# Linear Regression
linreg_model_sk = LinearRegression().fit(X, y_continuous)
linreg_results_sk = get_regression_statistics_sk(linreg_model_sk, X, y_continuous, format_pvals=True)
print("\nLinear Regression Results:")
display(linreg_results_sk)

# Poisson Regression
poisson_model_sk = PoissonRegressor(alpha=0).fit(X, y_poisson)
poisson_results_sk = get_regression_statistics_sk(poisson_model_sk, X, y_poisson)
print("\nPoisson Regression Results:")
display(poisson_results_sk)

# Reset display options to default after displaying the comparisons
pd.reset_option('display.max_columns')
pd.reset_option('display.width')
pd.reset_option('display.max_colwidth')

Logistic Regression Results:


Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,-0.350217,0.630858,-0.555143,0.578797,-1.586699,0.886265
1,X1,-5.5e-05,0.687176,-8.1e-05,0.999936,-1.34692,1.34681
2,X2,0.413576,0.693596,0.596278,0.55099,-0.945873,1.773025
3,X3,-0.10045,0.729564,-0.137685,0.890489,-1.530396,1.329496



Linear Regression Results:


Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,2.644157,0.284745,9.286064,5.107026e-15,2.078944,3.209371
1,X1,2.48567,0.310473,8.006082,2.758238e-12,1.869387,3.101954
2,X2,-1.424658,0.312875,-4.553447,1.548603e-05,-2.045709,-0.803607
3,X3,0.767931,0.329838,2.328206,0.022,0.113208,1.422655



Poisson Regression Results:


Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,0.983968,0.166977,5.892845,3.796032e-09,0.656694,1.311242
1,X1,0.636828,0.18033,3.53146,0.0004132729,0.283381,0.990275
2,X2,-0.267671,0.17942,-1.491872,0.1357328,-0.619333,0.083991
3,X3,0.222189,0.186623,1.190577,0.2338197,-0.143592,0.58797


## Statsmodels

In [43]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Logit
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Poisson, Gaussian

def get_regression_statistics_sm(model, X, y, format_pvals=False):
    """
    Calculate regression statistics including params, standard errors, t-scores/z-scores, p-values, and confidence intervals.

    Parameters:
    model (statsmodels results object): The already-fitted statsmodels model.
    X (ndarray): The input feature matrix.
    y (ndarray): The target vector.
    format_pvals (bool): Whether to format p-values. Standard notation will be used if greater than 0.0001.

    Returns:
    pd.DataFrame: A DataFrame with params, standard errors, t-scores/z-scores, p-values, and confidence intervals.
    """

    # Get parameter names
    n_vars = X.shape[1]
    param_names = ['Intercept'] + [f'X{i}' for i in range(1, n_vars + 1)]

    # Extract params, standard errors, and p-values
    params = model.params
    standard_errors = model.bse
    p_values = model.pvalues

    # Format p-values
    if format_pvals==True:
        p_values = pd.Series(p_values).apply(lambda x: f"{x:.6f}" if x >= 0.0001 else f"{x:.6e}")

    # Calculate t-scores/z-scores
    scores = params / standard_errors

    # Calculate confidence intervals
    conf = model.conf_int()
    lower_ci = conf[:, 0]
    upper_ci = conf[:, 1]

    # Create and return the results DataFrame
    created_results_df = create_results_dataframe(params, standard_errors, scores, p_values, lower_ci, upper_ci, param_names)
    return created_results_df

In [44]:
# Logistic Regression
logit_model_sm = Logit(y_binary, X_with_const).fit(disp=0)
logit_results_sm = get_regression_statistics_sm(logit_model_sm, X, y_binary)
print("Logistic Regression Results:")
display(logit_results_sm)

# Linear Regression
linreg_model_sm = sm.OLS(y_continuous, X_with_const).fit()
linreg_results_sm = get_regression_statistics_sm(linreg_model_sm, X, y_continuous, format_pvals=True)
print("\nLinear Regression Results:")
display(linreg_results_sm)

# Poisson Regression
poisson_model_sm = GLM(y_poisson, X_with_const, family=Poisson()).fit()
poisson_results_sm = get_regression_statistics_sm(poisson_model_sm, X, y_poisson)
print("\nPoisson Regression Results:")
display(poisson_results_sm)

# Reset display options to default after displaying the comparisons
pd.reset_option('display.max_columns')
pd.reset_option('display.width')
pd.reset_option('display.max_colwidth')

Logistic Regression Results:


Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,-0.429624,0.63248,-0.679268,0.496968,-1.669263,0.810015
1,X1,0.003549,0.688721,0.005152,0.995889,-1.346319,1.353417
2,X2,0.608779,0.696332,0.874265,0.381974,-0.756008,1.973566
3,X3,-0.135473,0.730872,-0.185357,0.852949,-1.567956,1.297011



Linear Regression Results:


Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,2.644157,0.284745,9.286064,5.08746e-15,2.078944,3.209371
1,X1,2.48567,0.310473,8.006082,2.758161e-12,1.869387,3.101954
2,X2,-1.424658,0.312875,-4.553447,1.548603e-05,-2.045709,-0.803607
3,X3,0.767931,0.329838,2.328206,0.022,0.113208,1.422655



Poisson Regression Results:


Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,0.983877,0.166979,5.892217,3.810474e-09,0.656604,1.31115
1,X1,0.636776,0.180333,3.531115,0.000413812,0.28333,0.990222
2,X2,-0.2678,0.179422,-1.492575,0.1355484,-0.61946,0.08386
3,X3,0.222472,0.186627,1.19207,0.2332337,-0.14331,0.588254


## Comparison

In [45]:
# Set display options to control output format
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.width', 1000)  # Increase the width to prevent wrapping
pd.set_option('display.max_colwidth', None)  # Display full column width

# Merging the results for comparison
logit_comparison = pd.concat([logit_results_sk.add_suffix('_sk'), logit_results_sm.add_suffix('_sm')], axis=1)
linreg_comparison = pd.concat([linreg_results_sk.add_suffix('_sk'), linreg_results_sm.add_suffix('_sm')], axis=1)
poisson_comparison = pd.concat([poisson_results_sk.add_suffix('_sk'), poisson_results_sm.add_suffix('_sm')], axis=1)

# Display the comparison results
print("Logistic Regression Comparison:")
display(logit_comparison)

print("\nLinear Regression Comparison:")
display(linreg_comparison)

print("\nPoisson Regression Comparison:")
display(poisson_comparison)

# Reset display options to default after displaying the comparisons
pd.reset_option('display.max_columns')
pd.reset_option('display.width')
pd.reset_option('display.max_colwidth')

Logistic Regression Comparison:


Unnamed: 0,Parameters_sk,Estimate_sk,Standard Error_sk,Score_sk,P-value_sk,Lower CI_sk,Upper CI_sk,Parameters_sm,Estimate_sm,Standard Error_sm,Score_sm,P-value_sm,Lower CI_sm,Upper CI_sm
0,Intercept,-0.350217,0.630858,-0.555143,0.578797,-1.586699,0.886265,Intercept,-0.429624,0.63248,-0.679268,0.496968,-1.669263,0.810015
1,X1,-5.5e-05,0.687176,-8.1e-05,0.999936,-1.34692,1.34681,X1,0.003549,0.688721,0.005152,0.995889,-1.346319,1.353417
2,X2,0.413576,0.693596,0.596278,0.55099,-0.945873,1.773025,X2,0.608779,0.696332,0.874265,0.381974,-0.756008,1.973566
3,X3,-0.10045,0.729564,-0.137685,0.890489,-1.530396,1.329496,X3,-0.135473,0.730872,-0.185357,0.852949,-1.567956,1.297011



Linear Regression Comparison:


Unnamed: 0,Parameters_sk,Estimate_sk,Standard Error_sk,Score_sk,P-value_sk,Lower CI_sk,Upper CI_sk,Parameters_sm,Estimate_sm,Standard Error_sm,Score_sm,P-value_sm,Lower CI_sm,Upper CI_sm
0,Intercept,2.644157,0.284745,9.286064,5.107026e-15,2.078944,3.209371,Intercept,2.644157,0.284745,9.286064,5.08746e-15,2.078944,3.209371
1,X1,2.48567,0.310473,8.006082,2.758238e-12,1.869387,3.101954,X1,2.48567,0.310473,8.006082,2.758161e-12,1.869387,3.101954
2,X2,-1.424658,0.312875,-4.553447,1.548603e-05,-2.045709,-0.803607,X2,-1.424658,0.312875,-4.553447,1.548603e-05,-2.045709,-0.803607
3,X3,0.767931,0.329838,2.328206,0.022,0.113208,1.422655,X3,0.767931,0.329838,2.328206,0.022,0.113208,1.422655



Poisson Regression Comparison:


Unnamed: 0,Parameters_sk,Estimate_sk,Standard Error_sk,Score_sk,P-value_sk,Lower CI_sk,Upper CI_sk,Parameters_sm,Estimate_sm,Standard Error_sm,Score_sm,P-value_sm,Lower CI_sm,Upper CI_sm
0,Intercept,0.983968,0.166977,5.892845,3.796032e-09,0.656694,1.311242,Intercept,0.983877,0.166979,5.892217,3.810474e-09,0.656604,1.31115
1,X1,0.636828,0.18033,3.53146,0.0004132729,0.283381,0.990275,X1,0.636776,0.180333,3.531115,0.000413812,0.28333,0.990222
2,X2,-0.267671,0.17942,-1.491872,0.1357328,-0.619333,0.083991,X2,-0.2678,0.179422,-1.492575,0.1355484,-0.61946,0.08386
3,X3,0.222189,0.186623,1.190577,0.2338197,-0.143592,0.58797,X3,0.222472,0.186627,1.19207,0.2332337,-0.14331,0.588254


In [47]:
# Function to print the DataFrame in the desired format
def print_split_dataframe(df, suffix_sk='_sk', suffix_sm='_sm'):
    sk_columns = [col for col in df.columns if col.endswith(suffix_sk)]
    sm_columns = [col for col in df.columns if col.endswith(suffix_sm)]

    # Print sk columns
    display(df[sk_columns].rename(columns=lambda x: x.replace(suffix_sk, '')))
    print()
    # Print sm columns
    display(df[sm_columns].rename(columns=lambda x: x.replace(suffix_sm, '')))

# Set display options to control output format
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.width', 1000)  # Increase the width to prevent wrapping
pd.set_option('display.max_colwidth', None)  # Display full column width

# Display the comparison results in the desired format
print("Logistic Regression Comparison:")
print_split_dataframe(logit_comparison)

print("\nLinear Regression Comparison:")
print_split_dataframe(linreg_comparison)

print("\nPoisson Regression Comparison:")
print_split_dataframe(poisson_comparison)

# Reset display options to default after displaying the comparisons
pd.reset_option('display.max_columns')
pd.reset_option('display.width')
pd.reset_option('display.max_colwidth')

Logistic Regression Comparison:


Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,-0.350217,0.630858,-0.555143,0.578797,-1.586699,0.886265
1,X1,-5.5e-05,0.687176,-8.1e-05,0.999936,-1.34692,1.34681
2,X2,0.413576,0.693596,0.596278,0.55099,-0.945873,1.773025
3,X3,-0.10045,0.729564,-0.137685,0.890489,-1.530396,1.329496





Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,-0.429624,0.63248,-0.679268,0.496968,-1.669263,0.810015
1,X1,0.003549,0.688721,0.005152,0.995889,-1.346319,1.353417
2,X2,0.608779,0.696332,0.874265,0.381974,-0.756008,1.973566
3,X3,-0.135473,0.730872,-0.185357,0.852949,-1.567956,1.297011



Linear Regression Comparison:


Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,2.644157,0.284745,9.286064,5.107026e-15,2.078944,3.209371
1,X1,2.48567,0.310473,8.006082,2.758238e-12,1.869387,3.101954
2,X2,-1.424658,0.312875,-4.553447,1.548603e-05,-2.045709,-0.803607
3,X3,0.767931,0.329838,2.328206,0.022,0.113208,1.422655





Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,2.644157,0.284745,9.286064,5.08746e-15,2.078944,3.209371
1,X1,2.48567,0.310473,8.006082,2.758161e-12,1.869387,3.101954
2,X2,-1.424658,0.312875,-4.553447,1.548603e-05,-2.045709,-0.803607
3,X3,0.767931,0.329838,2.328206,0.022,0.113208,1.422655



Poisson Regression Comparison:


Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,0.983968,0.166977,5.892845,3.796032e-09,0.656694,1.311242
1,X1,0.636828,0.18033,3.53146,0.0004132729,0.283381,0.990275
2,X2,-0.267671,0.17942,-1.491872,0.1357328,-0.619333,0.083991
3,X3,0.222189,0.186623,1.190577,0.2338197,-0.143592,0.58797





Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,0.983877,0.166979,5.892217,3.810474e-09,0.656604,1.31115
1,X1,0.636776,0.180333,3.531115,0.000413812,0.28333,0.990222
2,X2,-0.2678,0.179422,-1.492575,0.1355484,-0.61946,0.08386
3,X3,0.222472,0.186627,1.19207,0.2332337,-0.14331,0.588254


In [48]:
from IPython.display import display_html

# Function to display DataFrames side by side
def display_side_by_side(dfs, captions, space=30):
    html_str = ''
    for df, caption in zip(dfs, captions):
        df_styler = df.style.set_table_attributes(f"style='display:inline;margin-right:{space}px;'").set_caption(caption)
        html_str += df_styler._repr_html_()
    display_html(html_str, raw=True)

# Merging the results for comparison
logit_comparison = pd.concat([logit_results_sk.add_suffix('_sk'), logit_results_sm.add_suffix('_sm')], axis=1)
linreg_comparison = pd.concat([linreg_results_sk.add_suffix('_sk'), linreg_results_sm.add_suffix('_sm')], axis=1)
poisson_comparison = pd.concat([poisson_results_sk.add_suffix('_sk'), poisson_results_sm.add_suffix('_sm')], axis=1)

# Split the comparisons into sk and sm DataFrames
logit_comparison_sk = logit_comparison[[col for col in logit_comparison.columns if col.endswith('_sk')]].rename(columns=lambda x: x.replace('_sk', ''))
logit_comparison_sm = logit_comparison[[col for col in logit_comparison.columns if col.endswith('_sm')]].rename(columns=lambda x: x.replace('_sm', ''))

linreg_comparison_sk = linreg_comparison[[col for col in linreg_comparison.columns if col.endswith('_sk')]].rename(columns=lambda x: x.replace('_sk', ''))
linreg_comparison_sm = linreg_comparison[[col for col in linreg_comparison.columns if col.endswith('_sm')]].rename(columns=lambda x: x.replace('_sm', ''))

poisson_comparison_sk = poisson_comparison[[col for col in poisson_comparison.columns if col.endswith('_sk')]].rename(columns=lambda x: x.replace('_sk', ''))
poisson_comparison_sm = poisson_comparison[[col for col in poisson_comparison.columns if col.endswith('_sm')]].rename(columns=lambda x: x.replace('_sm', ''))

# Display the comparison results side by side
print("Logistic Regression Comparison:\n")
display_side_by_side([logit_comparison_sk, logit_comparison_sm], ['Sklearn Results', 'Statsmodels Results'])

print("\n\nLinear Regression Comparison:\n")
display_side_by_side([linreg_comparison_sk, linreg_comparison_sm], ['Sklearn Results', 'Statsmodels Results'])

print("\n\nPoisson Regression Comparison:\n")
display_side_by_side([poisson_comparison_sk, poisson_comparison_sm], ['Sklearn Results', 'Statsmodels Results'])

Logistic Regression Comparison:



Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,-0.350217,0.630858,-0.555143,0.578797,-1.586699,0.886265
1,X1,-5.5e-05,0.687176,-8.1e-05,0.999936,-1.34692,1.34681
2,X2,0.413576,0.693596,0.596278,0.55099,-0.945873,1.773025
3,X3,-0.10045,0.729564,-0.137685,0.890489,-1.530396,1.329496

Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,-0.429624,0.63248,-0.679268,0.496968,-1.669263,0.810015
1,X1,0.003549,0.688721,0.005152,0.995889,-1.346319,1.353417
2,X2,0.608779,0.696332,0.874265,0.381974,-0.756008,1.973566
3,X3,-0.135473,0.730872,-0.185357,0.852949,-1.567956,1.297011




Linear Regression Comparison:



Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,2.644157,0.284745,9.286064,5.107026e-15,2.078944,3.209371
1,X1,2.48567,0.310473,8.006082,2.758238e-12,1.869387,3.101954
2,X2,-1.424658,0.312875,-4.553447,1.548603e-05,-2.045709,-0.803607
3,X3,0.767931,0.329838,2.328206,0.022,0.113208,1.422655

Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,2.644157,0.284745,9.286064,5.08746e-15,2.078944,3.209371
1,X1,2.48567,0.310473,8.006082,2.758161e-12,1.869387,3.101954
2,X2,-1.424658,0.312875,-4.553447,1.548603e-05,-2.045709,-0.803607
3,X3,0.767931,0.329838,2.328206,0.022,0.113208,1.422655




Poisson Regression Comparison:



Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,0.983968,0.166977,5.892845,0.0,0.656694,1.311242
1,X1,0.636828,0.18033,3.53146,0.000413,0.283381,0.990275
2,X2,-0.267671,0.17942,-1.491872,0.135733,-0.619333,0.083991
3,X3,0.222189,0.186623,1.190577,0.23382,-0.143592,0.58797

Unnamed: 0,Parameters,Estimate,Standard Error,Score,P-value,Lower CI,Upper CI
0,Intercept,0.983877,0.166979,5.892217,0.0,0.656604,1.31115
1,X1,0.636776,0.180333,3.531115,0.000414,0.28333,0.990222
2,X2,-0.2678,0.179422,-1.492575,0.135548,-0.61946,0.08386
3,X3,0.222472,0.186627,1.19207,0.233234,-0.14331,0.588254
