# EDFB - Digital Finance & Banking - Linear Probability Model and Logistic Regression


---


The following script provides examples of how to model binary outcomes using both linear probability models and logistic regression in Python. We'll start with the linear probability model to understand its limitations, then move to logistic regression as a more appropriate approach for binary dependent variables. The dataset "banking.csv" is read directly from the EDFB GitHub repository, so an internet connection is required to run the script.




In [None]:
# Import libraries
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import os
import io
%matplotlib inline

In [None]:
# Fix NumPy's global random seed so that the notebook's output is stable (reproducible) across runs
np.random.seed(42)

In [None]:
# Location of the raw banking.csv file on GitHub.
# Only the URL is defined here; the data itself is downloaded when the CSV is read.
banking_url = "https://raw.githubusercontent.com/umatter/EDFB/main/data/banking.csv"
print("Fetching banking.csv from GitHub...")

In [None]:
# Read the CSV located at banking_url into a pandas DataFrame
dataset = pd.read_csv(banking_url)

In [None]:
# Show the dimensions of the dataset as (number of rows, number of columns)
dataset.shape

In [None]:
# Show the data type of each column (used below to separate numerical from categorical variables)
dataset.dtypes

In [None]:
# Split the predictors (every column except the target 'y') into
# numerical columns and categorical (object-typed) columns
features = dataset.drop(columns=['y'])
num_var = features.select_dtypes(include=[np.number]).columns
cat_var = features.select_dtypes(include=object).columns

In [None]:
# Display the names of the numerical predictors
num_var

In [None]:
# Display the names of the categorical predictors
cat_var

In [None]:
# Check NAs: for each column, report whether it contains at least one missing value
dataset.isna().any()

In [None]:
# Get basic summary statistics for the numerical variables
dataset.describe()

In [None]:
# Check dispersion with box plot
from sklearn import preprocessing
def box_plot(df, standardize=True):
    """Draw one horizontal box plot per column of ``df`` in a single figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns to plot. They must be numeric when ``standardize`` is true,
        because they are passed to ``StandardScaler``.
    standardize : bool, default True
        If true, every column is standardized before plotting so that
        variables measured on different scales share one readable axis.
        The explanatory title is only shown in that case.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(20, 10))

    if standardize:
        # standardize columns for better visualization
        df = pd.DataFrame(preprocessing.StandardScaler().fit_transform(df.values),
                          columns=df.columns)

    # Long format: one row per (variable, value) pair. Naming the two columns
    # explicitly keeps this independent of the index/column names of ``df``.
    long_df = df.melt(var_name='variable', value_name='value')

    ax = sns.boxplot(x='value', y='variable', data=long_df, orient='h')
    ax.tick_params(labelsize=10)
    ax.set_xlabel('')
    ax.set_ylabel('')
    if standardize:
        # Only claim standardization when it was actually applied
        ax.set_title('Note that variables are standardized\nfor better visualization', fontsize=20)
    plt.show()


# Box plots of all numerical predictors, standardized so they share a common scale
box_plot(dataset[num_var], standardize=True)

In [None]:
# We remove duration, pdays, age, campaign and previous,
# then refresh the list of numerical predictors so it no longer contains the dropped columns
dataset=dataset.drop(columns=['duration', 'pdays', 'age', 'campaign', 'previous'])
num_var= dataset.drop(columns=['y']).select_dtypes([np.number]).columns

In [None]:
# Check distribution for target variable
# sns.catplot is a figure-level function that opens its own figure, so a preceding
# plt.figure(...) only produces an extra empty figure and its figsize is ignored.
# Drawing the axes-level countplot on an explicitly sized axes gives the intended 10x10 plot.
target_fig, target_ax = plt.subplots(figsize=(10, 10))
sns.countplot(x='y', data=dataset, ax=target_ax)  # number of observations per class
plt.show()

In [None]:
# The dataset is very unbalanced, so we keep only part of the y=0 observations:
# twice as many as there are y=1 observations. This is called "undersampling".
from sklearn.model_selection import train_test_split

# We keep all y=1
data_1 = dataset[dataset['y'] == 1]
print(data_1.shape)

# We draw a random sample of y=0 rows whose size is double the size of data_1.
# train_test_split is used here only as a seeded random sampler: the "test" part is the
# sample we keep, the "train" part is what is left over. No stratification is applied.
# Passing test_size as an integer requests exactly that many rows, which avoids the
# rounding that a floating-point fraction can introduce.
all_data_0 = dataset[dataset['y'] == 0]
n_majority_to_keep = 2 * data_1.shape[0]

data_0_big, data_0_small = train_test_split(all_data_0, test_size=n_majority_to_keep,
                                            random_state=0, shuffle=True)
print(data_0_big.shape) # remaining from the dataset
print(data_0_small.shape)


In [None]:
# Merge the two samples: stack all y=1 rows on top of the sampled y=0 rows
# (axis=0 concatenates by row) and renumber the rows from 0
dataset = pd.concat([data_1, data_0_small], axis=0, ignore_index=True)
print(dataset.shape)

In [None]:
# Check distribution for target variable after downsampling
# sns.catplot is a figure-level function that opens its own figure, so a preceding
# plt.figure(...) only produces an extra empty figure and its figsize is ignored.
# Drawing the axes-level countplot on an explicitly sized axes gives the intended 10x10 plot.
target_fig, target_ax = plt.subplots(figsize=(10, 10))
sns.countplot(x='y', data=dataset, ax=target_ax)
plt.show()

In [None]:
# Plot the distribution of each numerical variable separately for the two values of the target (0 or 1)

fig = plt.figure(figsize=(15,30))

# Split the observations by target value
y_1 = dataset.loc[dataset['y'] == 1] #.loc - access group of values using labels.
y_0 = dataset.loc[dataset['y'] == 0]

# Two panels per row
n_rows = math.ceil(len(num_var) / 2)

for plot_count, var in enumerate(num_var, start=1):
    # overlay the histogram (with density estimate) of the variable for y=1 and y=0
    ax = fig.add_subplot(n_rows, 2, plot_count)
    sns.histplot(y_1[var], label='1', ax=ax, alpha=0.7, kde=True)
    sns.histplot(y_0[var], label='0', ax=ax, alpha=0.7, kde=True)
    ax.set_title('Distribution of ' + var, fontsize=20)
    ax.tick_params(labelsize=15)
    ax.set_xlabel('')
    ax.legend(fontsize=16)

# Same finishing steps as the categorical plots: avoid overlapping panels and render the figure
plt.tight_layout()
plt.show()


In [None]:
# For every categorical variable, show the share of y=0 / y=1 within each of its levels

fig = plt.figure(figsize=(15,30))
n_rows = math.ceil(len(cat_var) / 2)

for plot_count, var in enumerate(cat_var, start=1):
    ax = fig.add_subplot(n_rows, 2, plot_count)

    # counts per (level, target) pair, reshaped to one row per level and one column per target value
    pair_counts = dataset.groupby([var, 'y']).size().reset_index()
    counts_by_level = pair_counts.pivot(columns='y', index=var, values=0)

    # turn the counts into within-level shares and draw them as stacked horizontal bars
    level_totals = counts_by_level.sum(axis=1)
    shares = counts_by_level.div(level_totals, axis=0)
    shares.plot(kind='barh', stacked=True, ax=ax)

    n_levels = len(dataset[var].unique())
    ax.set_title(f'Target variable distribution for each\nlevel ({n_levels}) of {var}', fontsize=20)
    ax.tick_params(labelsize=15)
    ax.set_ylabel('')
    ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=16)

plt.tight_layout()
plt.show()

In [None]:
# Preview the first rows of the balanced dataset
dataset.head()

In [None]:
# Create dummy variables & standardize the dataset
# drop_first=True keeps k-1 dummies out of k categorical levels by removing the first level.
# get_dummies returns a new DataFrame, so the input does not need to be copied first.
dataset_dummy = pd.get_dummies(dataset, dummy_na=False, drop_first=True)

# Write the standardized values back as a plain array, i.e. by row position.
# Wrapping them in a new DataFrame would align on the index instead, and would only give
# the right result when `dataset` happens to carry a default 0..n-1 index.
# NOTE(review): the scaler is fitted on the full dataset before the train/test split,
# so test-set information enters the scaling - confirm this is acceptable here.
dataset_dummy[num_var] = preprocessing.StandardScaler().fit_transform(dataset[num_var].values)

In [None]:
# Preview the first rows after dummy encoding and standardization
dataset_dummy.head()

In [None]:
# Check the correlations between variables:
# pairwise correlation matrix over all columns of the dummy-encoded dataset
corrmat = dataset_dummy.corr()


In [None]:
# Correlation matrix as (variable, variable) -> correlation pairs
# Keep only the strictly lower triangle (k=-1 excludes the diagonal); every other entry becomes NaN
# so that each pair of variables appears once and self-correlations are left out
strict_lower = np.tril(np.ones(corrmat.shape, dtype=bool), k=-1)
corrmat = corrmat.where(strict_lower)
corrmat_list = corrmat.unstack().to_frame(name='correlation')

# Rank the pairs by absolute correlation (NaN entries go last) and show the ten strongest
corrmat_list['abs_corr'] = corrmat_list['correlation'].abs()
corrmat_list = corrmat_list.sort_values(by='abs_corr', ascending=False, na_position='last')
corrmat_list.drop(columns=['abs_corr']).head(10)


In [None]:
# Plot correlation heatmap
# (entries of corrmat that are NaN are left blank by the heatmap)
plt.figure(figsize=(20,20))
sns.heatmap(corrmat, cmap ="YlGnBu", linewidths = 0.1)
plt.show()

In [None]:
# Drop highly correlated columns

# Keep the balanced, not-yet-encoded dataset under its own name before `dataset` is replaced
dataset_original = dataset.copy()

# From here on `dataset` refers to the dummy-encoded, standardized data without the dropped columns
col_to_drop = [
    'emp_var_rate',
    'cons_price_idx',
    'euribor3m',
    'nr_employed',
    'loan_unknown',
    'housing_unknown',
]
dataset = dataset_dummy.drop(labels=col_to_drop, axis='columns')

# Restrict the list of numerical variables to those that survived the drop
num_var = dataset.columns.intersection(num_var)

In [None]:
# Ready to train and test our models!
# X holds every predictor column, y the target as a NumPy array
X = dataset.drop(labels='y', axis='columns')
y = dataset['y'].to_numpy()
for model_input in (X, y):
    print(model_input.shape)

In [None]:
# Split train and test set: 20% of the rows go to the test set,
# stratified on the target so both parts keep the same share of y=1
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=dataset['y'],
)
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

## 1. Linear Probability Model (LPM)

Before diving into logistic regression, let's start with the simpler Linear Probability Model. The LPM treats the binary dependent variable as if it were continuous and applies ordinary least squares (OLS) regression.

**Model specification:** y = β₀ + β₁X₁ + β₂X₂ + ... + βₖXₖ + ε, which implies P(y=1|X) = E[y|X] = β₀ + β₁X₁ + β₂X₂ + ... + βₖXₖ

While conceptually simple, the LPM has several important limitations that we'll explore.

In [None]:
# Fit Linear Probability Model using OLS
from sklearn.linear_model import LinearRegression

# The LPM is an ordinary linear regression of the 0/1 target on the predictors
lpm_model = LinearRegression().fit(X_train, y_train)

# Fitted values on both samples; these are read as predicted probabilities
y_train_pred_lpm = lpm_model.predict(X_train)
y_test_pred_lpm = lpm_model.predict(X_test)

# Goodness of fit on the training and on the test sample
lpm_r2_train = lpm_model.score(X_train, y_train)
lpm_r2_test = lpm_model.score(X_test, y_test)
print(f"LPM Training R²: {lpm_r2_train:.4f}")
print(f"LPM Test R²: {lpm_r2_test:.4f}")

In [None]:
# Examine LPM predictions and identify problems:
# report their range and how many of them are not valid probabilities
lpm_predictions = {"Training": y_train_pred_lpm, "Test": y_test_pred_lpm}

print("Linear Probability Model - Prediction Statistics:")
for split_name, preds in lpm_predictions.items():
    print(f"{split_name} set predictions - Min: {preds.min():.4f}, Max: {preds.max():.4f}")

print("\nPredictions outside [0,1] range:")
for split_name, preds in lpm_predictions.items():
    # number of predictions below 0 or above 1
    n_invalid = np.sum((preds < 0) | (preds > 1))
    print(f"{split_name}: {n_invalid} out of {len(preds)} ({100*n_invalid/len(preds):.1f}%)")

In [None]:
# Visualize LPM predictions vs actual values
fig, (hist_ax, scatter_ax) = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

# Left panel: histogram of the predicted probabilities, with the valid range [0, 1] marked in red
hist_ax.hist(y_test_pred_lpm, bins=30, alpha=0.7, edgecolor='black')
for bound in (0, 1):
    hist_ax.axvline(x=bound, color='red', linestyle='--', label=f'Probability = {bound}')
hist_ax.set(xlabel='Predicted Probability',
            ylabel='Frequency',
            title='Distribution of LPM Predicted Probabilities')
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

# Right panel: predicted probability against the actual 0/1 outcome
scatter_ax.scatter(y_test_pred_lpm, y_test, alpha=0.6)
scatter_ax.plot([0, 1], [0, 1], 'r--', label='Perfect prediction')
scatter_ax.set(xlabel='Predicted Probability (LPM)',
               ylabel='Actual Value',
               title='LPM: Predicted vs Actual Values')
scatter_ax.legend()
scatter_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# Turn the LPM's predicted probabilities into class labels:
# 1 when the prediction reaches the 0.5 threshold, 0 otherwise
y_test_pred_lpm_binary = np.where(y_test_pred_lpm >= 0.5, 1, 0)

# Share of test observations classified correctly
lpm_accuracy = accuracy_score(y_test, y_test_pred_lpm_binary)
print(f"LPM Classification Accuracy: {lpm_accuracy:.4f}")

## Problems with the Linear Probability Model

The Linear Probability Model has several fundamental issues when dealing with binary dependent variables:

### 1. **Predicted probabilities outside [0,1] range**
- As we saw above, LPM can predict negative probabilities or probabilities greater than 1
- This violates the basic definition of probability

### 2. **Heteroskedasticity**
- The error variance is not constant: Var(ε|X) = P(X)[1-P(X)]
- This violates the OLS assumption of homoskedasticity
- Standard errors are biased, affecting hypothesis testing

### 3. **Linear relationship assumption**
- LPM assumes a linear relationship between X and P(y=1|X)
- In reality, the effect of explanatory variables on probability is often non-linear
- Marginal effects are constant across all values of X (unrealistic)

### 4. **Distributional assumptions**
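- Exact finite-sample OLS inference assumes normally distributed errors
- With binary outcomes, the error given X can take only two values, 1 − P(X) (with probability P(X)) and −P(X) (with probability 1 − P(X)), so it cannot be normally distributed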

### 5. **Efficiency concerns**
- Due to heteroskedasticity, OLS estimators are not efficient
- Maximum likelihood estimation (as in logistic regression) is more efficient

**Solution:** Use logistic regression, which addresses these issues by:
- Ensuring predicted probabilities stay within [0,1]
- Using the logistic function to model non-linear relationships
- Employing maximum likelihood estimation
- Properly handling the binary nature of the dependent variable

## 2. Logistic Regression

Now let's implement logistic regression, which addresses the limitations of the Linear Probability Model.

**Model specification:** 
- P(y=1|X) = 1 / (1 + e^(-(β₀ + β₁X₁ + β₂X₂ + ... + βₖXₖ)))
- This ensures probabilities remain between 0 and 1
- The relationship between X and P(y=1|X) is non-linear and S-shaped

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression estimated with the L-BFGS solver; random_state is fixed at 0.
# NOTE(review): scikit-learn's LogisticRegression applies regularization by default - confirm this
# is intended when the results are compared with textbook maximum likelihood estimates.
logit_model = LogisticRegression(solver='lbfgs', random_state=0) # solver (https://towardsdatascience.com/dont-sweat-the-solver-stuff-aea7cddc3451)
logit_model.fit(X_train, y_train) # training the algorithm

In [None]:
# Logistic regression on the test set: predicted class and predicted probability of y=1
y_test_predicted_logit = logit_model.predict(X_test)
y_test_predicted_prob_logit = logit_model.predict_proba(X_test)[:, 1]

# Put the true labels next to the LPM and logistic regression outputs, one row per test observation
comparison_columns = {
    'True': y_test,
    'LPM_prob': y_test_pred_lpm,
    'LPM_pred': y_test_pred_lpm_binary,
    'Logit_prob': y_test_predicted_prob_logit,
    'Logit_pred': y_test_predicted_logit,
}
comparison_df = pd.DataFrame({name: np.ravel(values) for name, values in comparison_columns.items()})
display(comparison_df.head(20))

# Test-set accuracy of both models
lpm_test_accuracy = accuracy_score(y_test, y_test_pred_lpm_binary)
logit_test_accuracy = accuracy_score(y_test, y_test_predicted_logit)
print("\nModel Comparison:")
print(f"LPM Accuracy: {lpm_test_accuracy:.4f}")
print(f"Logistic Regression Accuracy: {logit_test_accuracy:.4f}")

In [None]:
# Evaluate confusion matrix for Logistic Regression on the test set
# (rows correspond to the true labels, columns to the predicted labels)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_test_predicted_logit)

In [None]:
# Evaluate confusion matrix

from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(y_true, y_pred,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """Print the confusion matrix and draw it as an annotated heatmap.

    Parameters
    ----------
    y_true : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels.
    normalize : bool, default False
        If true, every row is divided by its total, so each cell shows the
        share of that true class assigned to each predicted class.
    title : str, optional
        Plot title. When empty or None, a default title that reflects
        ``normalize`` is used.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap of the heatmap.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Only use the labels that appear in the data, and pass the same labels to
    # confusion_matrix so the tick labels always match the rows/columns
    labels = unique_labels(y_true, y_pred)
    classes = [str(label) for label in labels]

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    if normalize:
        # A label that occurs only among the predictions has an all-zero row;
        # leave that row at 0 instead of dividing by zero
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)

    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell with its value; use white text on dark cells for readability.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


# Print NumPy arrays with two decimal places
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix for Logistic Regression (test set)
plot_confusion_matrix(y_test, y_test_predicted_logit, title='Logistic Regression Confusion Matrix')
plt.show()

In [None]:
# Evaluate precision, recall, F1-score on the test set
# A macro-average will compute the metric independently for each class and then take the average (hence treating all classes equally),
# whereas a micro-average will aggregate the contributions of all classes to compute the average metric.
from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_predicted_logit))

In [None]:
# Evaluate ROC curve

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# The area under the ROC curve must be computed from the predicted probabilities, i.e. from the
# same scores the curve is built on. Hard 0/1 predictions would reduce the curve to a single
# threshold and report an area that does not match the plotted curve.
logit_roc_auc = roc_auc_score(y_test, y_test_predicted_prob_logit)
fpr, tpr, thresholds = roc_curve(y_test, y_test_predicted_prob_logit)


plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # diagonal = classifier with no discriminating power
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()