In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_absolute_error
from sklearn.neural_network import MLPRegressor
import lightgbm as lgb
import xgboost as xgb

In [4]:
loan = pd.read_csv("data/loan_pricing_dealscan.csv", low_memory=False)
msf = pd.read_csv("data/MSF_1996_2023.csv", low_memory=False)

In [None]:
import subprocess
from tqdm import tqdm

cols_needed = [
    "gvkey", "datadate", "fyear",
    "at", "lt", "revt", "ni", "oancf", "act", "lct",
    "ppent", "ceq", "dltt", "ch", "invt", "rect", "cogs"
]


total_lines = int(subprocess.check_output(
    ["wc", "-l", "data/COMPUSTAT_funda_annual.csv"]
).split()[0])

chunksize = 200_000

chunks = []
for chunk in tqdm(
    pd.read_csv(
        "data/COMPUSTAT_funda_annual.csv",
        usecols=cols_needed,
        chunksize=chunksize,
        low_memory=False
    ),
    total=total_lines // chunksize
):
    chunks.append(chunk)

df = pd.concat(chunks, ignore_index=True)
df.to_pickle("data/COMPUSTAT_funda_annual.pkl")
comp = pd.read_pickle("data/COMPUSTAT_funda_annual.pkl")

In [6]:
loan["datadate"] = pd.to_datetime(loan["datadate"], errors="coerce")
loan["facilitystartdate"] = pd.to_datetime(
    loan["facilitystartdate"], errors="coerce")
comp["datadate"] = pd.to_datetime(comp["datadate"], errors="coerce")

In [7]:
loan["year"] = loan["datadate"].dt.year
comp["year"] = comp["datadate"].dt.year

In [8]:
tmp = loan.merge(comp, on=["gvkey","year"], how="left", indicator=True)
print(tmp["_merge"].value_counts())

_merge
both          170007
left_only       8045
right_only         0
Name: count, dtype: int64


In [9]:
g = comp.groupby(["gvkey","year"]).size()
print("comp rows per firm-year:", g.describe())

comp rows per firm-year: count    494636.000000
mean          1.090333
std           0.287307
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           4.000000
dtype: float64


In [10]:
df = loan.merge(comp, on=["gvkey", "year"], how="left")

df["secured"] = df["secured"].map({"Yes": 1, "No": 0})
df["seniority"] = df["seniority"].replace({
    "Senior Secured": 3,
    "Senior": 2,
    "Senior Unsecured": 1,
    "Subordinated": 0
})

In [11]:
print("loan rows:", len(loan))
print("comp rows:", len(comp))

loan rows: 160041
comp rows: 539318


In [12]:
def clean_crsp_ret(x):
    if isinstance(x, str):
        x = x.strip()

    if x in ["", ".", "NA", None]:
        return np.nan

    try:
        return float(x)
    except:
        pass

    delist_map = {
        "A": -0.30, "B": -0.30, "C": -0.30,
        "D": -1.00, "E": -1.00, "F": -1.00, "G": -1.00
    }
    return delist_map.get(x, np.nan)


def rolling_beta(ret, mkt, window=36):
    betas = np.full(len(ret), np.nan)
    for i in range(window, len(ret)):
        y = ret[i-window:i]
        x = mkt[i-window:i]
        valid = np.isfinite(y) & np.isfinite(x)
        if valid.sum() > 20:
            cov = np.cov(y[valid], x[valid])[0, 1]
            var = np.var(x[valid])
            if var > 0:
                betas[i] = cov / var
    return betas


def preprocess_msf(path):
    print("Loading MSF file...")
    msf = pd.read_csv(path, low_memory=False)

    print("Converting dates...")
    msf["date"] = pd.to_datetime(msf["date"], errors="coerce")
    msf = msf.sort_values(["PERMNO", "date"])

    print("Cleaning CRSP returns (RET, vwretd)...")
    msf["RET"] = msf["RET"].apply(clean_crsp_ret).astype(float)
    msf["vwretd"] = msf["vwretd"].apply(clean_crsp_ret).astype(float)

    msf["ret_excess"] = msf["RET"]
    msf["mkt_excess"] = msf["vwretd"]

    print("Computing rolling 36-month CAPM beta...")
    betalist = []

    for permno, grp in msf.groupby("PERMNO"):
        grp = grp.sort_values("date")
        grp["beta"] = rolling_beta(
            grp["ret_excess"].values,
            grp["mkt_excess"].values
        )
        betalist.append(grp)

    msf = pd.concat(betalist)

    print("Computing systematic & idiosyncratic volatility...")
    msf["mkt_vol"] = (
        msf.groupby("PERMNO")["mkt_excess"]
        .transform(lambda x: x.rolling(36).std())
    )

    msf["sys_vol"] = msf["beta"] * msf["mkt_vol"]

    msf["residual"] = msf["ret_excess"] - msf["beta"] * msf["mkt_excess"]

    msf["idio_vol"] = (
        msf.groupby("PERMNO")["residual"]
        .transform(lambda x: x.rolling(36).std())
    )

    print("Aggregating to annual level...")
    msf["year"] = msf["date"].dt.year

    msf_agg = msf.groupby(["PERMNO", "year"]).agg({
        "RET": "mean",
        "beta": "mean",
        "sys_vol": "mean",
        "idio_vol": "mean"
    }).reset_index()

    print("Finished MSF preprocessing!")
    return msf_agg

In [13]:
msf_agg = preprocess_msf("data/MSF_1996_2023.csv")
df = df.merge(msf_agg, on=["PERMNO", "year"], how="left")

Loading MSF file...
Converting dates...
Cleaning CRSP returns (RET, vwretd)...
Computing rolling 36-month CAPM beta...
Computing systematic & idiosyncratic volatility...
Aggregating to annual level...
Finished MSF preprocessing!


In [14]:
print("msf_agg rows:", len(msf_agg))
print("df rows AFTER loan-comp merge:", len(loan.merge(comp, on=["gvkey","year"], how="left")))
print("df rows FINAL (after msf merge):", len(df))

msf_agg rows: 145761
df rows AFTER loan-comp merge: 178052
df rows FINAL (after msf merge): 178052


In [15]:
print("Comp duplicates:", comp.duplicated(subset=["gvkey", "year"]).sum())
print("MSF duplicates:", msf_agg.duplicated(subset=["PERMNO", "year"]).sum())
print("Loan duplicates:", loan.duplicated(subset=["gvkey", "year"]).sum())

Comp duplicates: 44682
MSF duplicates: 0
Loan duplicates: 136066


In [16]:
def industry_bucket(sic):
    try:
        sic = int(sic)
        if 1 <= sic <= 999:
            return "Agriculture"
        if 1000 <= sic <= 1499:
            return "Mining"
        if 1500 <= sic <= 1799:
            return "Construction"
        if 2000 <= sic <= 3999:
            return "Manufacturing"
        if 4000 <= sic <= 4999:
            return "Transportation"
        if 5000 <= sic <= 5199:
            return "Wholesale"
        if 5200 <= sic <= 5999:
            return "Retail"
        if 6000 <= sic <= 6799:
            return "Finance"
        if 7000 <= sic <= 8999:
            return "Services"
        if 9000 <= sic <= 9999:
            return "Public"
        return "Unknown"
    except:
        return "Unknown"


df["industry"] = df["sic"].apply(industry_bucket)
df = pd.get_dummies(df, columns=["industry"], drop_first=True)

In [17]:
import pandas_datareader.data as fred
from datetime import datetime

start = datetime(1996, 1, 1)
end = datetime(2023, 12, 31)

macros = {"FEDFUNDS": "FEDFUNDS", "GDP": "GDP", "UNRATE": "UNRATE"}
macro_df = pd.DataFrame()

for col, series in macros.items():
    macro_df[col] = fred.DataReader(series, "fred", start, end)

macro_df.index = pd.to_datetime(macro_df.index)
macro_df["year"] = macro_df.index.year

macro_agg = macro_df.groupby("year").agg({
    "FEDFUNDS": "mean",
    "GDP": "mean",
    "UNRATE": "mean"
}).reset_index()

macro_agg.to_csv("data/macro_fred_auto.csv", index=False)

In [18]:
usrec = fred.DataReader("USREC", "fred", start, end)
usrec.index = pd.to_datetime(usrec.index)
usrec["year"] = usrec.index.year

usrec_agg = usrec.groupby("year").mean().reset_index()

In [19]:
macro = pd.read_csv("data/macro_fred_auto.csv")
macro = macro.merge(usrec_agg, on="year", how="left")

df = df.merge(macro, on="year", how="left")

In [20]:
loan_features = ["facilityamt", "maturity", "secured", "seniority", "averagelife"]

ratio_features = [
    "at", "lt", "revt", "ni", "oancf", "act", "lct", "ppent",
    "ceq", "dltt", "ch", "invt", "rect", "cogs"
]

market_features = ["beta", "idio_vol", "sys_vol"]
macro_features = ["FEDFUNDS", "GDP", "UNRATE", "USREC"]

all_features = loan_features + ratio_features + market_features + macro_features

target_main = "allindrawn"
target_aux = "allinundrawn"

In [21]:
df = df[["year"] + all_features + [target_main, target_aux]]

In [22]:
full = df.dropna()

X = full[all_features].values
y_main = full[target_main].values
y_aux = full[target_aux].values

In [23]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [24]:
train_idx = full["year"] <= 2017
test_idx = full["year"] >= 2018

X_train = X_scaled[train_idx]
X_test = X_scaled[test_idx]

y_main_train = y_main[train_idx]
y_main_test = y_main[test_idx]

y_aux_train = y_aux[train_idx]
y_aux_test = y_aux[test_idx]

In [25]:
hidden_options = [32, 64, 128]
lr_options = [1e-4, 1e-3, 1e-2]
activations = ["relu", "tanh", "logistic"]

best_model = None
best_mse = 1e18
best_params = None

for h in hidden_options:
    for lr in lr_options:
        for act in activations:
            model = MLPRegressor(
                hidden_layer_sizes=(h,),
                activation=act,
                learning_rate_init=lr,
                max_iter=500,
                random_state=0
            )
            model.fit(X_train, y_main_train)
            pred = model.predict(X_test)
            mse = mean_squared_error(y_main_test, pred)

            if mse < best_mse:
                best_mse = mse
                best_params = (h, lr, act)
                best_model = model

print("Best MLP Params:", best_params)
print("Best Test MSE:", best_mse)



Best MLP Params: (32, 0.01, 'logistic')
Best Test MSE: 2204.34063478547




In [26]:
import tensorflow as tf
from tensorflow.keras import layers, models
tf.keras.backend.clear_session()

In [27]:
h, lr, _ = best_params

input_dim = X_train.shape[1]

inp = layers.Input(shape=(input_dim,))
h1 = layers.Dense(h, activation="relu")(inp)

out_main = layers.Dense(1, name="main_output")(h1)
out_aux = layers.Dense(1, name="aux_output")(h1)

In [28]:
model2 = models.Model(inputs=inp, outputs=[out_main, out_aux])

In [29]:
val_size = int(0.2 * X_train.shape[0])

X_val_tf = X_train[-val_size:]
X_train_tf = X_train[:-val_size]

y_main_val = y_main_train[-val_size:]
y_main_tf = y_main_train[:-val_size]

y_aux_val = y_aux_train[-val_size:]
y_aux_tf = y_aux_train[:-val_size]

In [30]:
X_train_tf = np.asarray(X_train_tf, dtype=np.float32)
X_val_tf = np.asarray(X_val_tf, dtype=np.float32)

y_main_tf = np.asarray(y_main_tf, dtype=np.float32)
y_aux_tf = np.asarray(y_aux_tf, dtype=np.float32)

y_main_val = np.asarray(y_main_val, dtype=np.float32)
y_aux_val = np.asarray(y_aux_val, dtype=np.float32)

In [31]:
print("Train size:", X_train_tf.shape)
print("Val size:", X_val_tf.shape)
print("Test size:", X_test.shape)

Train size: (603, 26)
Val size: (150, 26)
Test size: (469, 26)


In [32]:
train_ds = tf.data.Dataset.from_tensor_slices(
    (X_train_tf, (y_main_tf, y_aux_tf))
).batch(32)

val_ds = tf.data.Dataset.from_tensor_slices(
    (X_val_tf, (y_main_val, y_aux_val))
).batch(32)

In [33]:
print("X_train_tf shape:", X_train_tf.shape)
print("y_main_tf shape:", y_main_tf.shape)
print("y_aux_tf shape:", y_aux_tf.shape)

print("Unique lengths:",
      len(X_train_tf),
      len(y_main_tf),
      len(y_aux_tf))

X_train_tf shape: (603, 26)
y_main_tf shape: (603,)
y_aux_tf shape: (603,)
Unique lengths: 603 603 603


In [34]:
import math
print("TF TRAIN batches:", math.ceil(len(X_train_tf) / 32))
print("TF VAL batches:", math.ceil(len(X_val_tf) / 32))

TF TRAIN batches: 19
TF VAL batches: 5


In [35]:
model2.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
    loss={"main_output": "mse", "aux_output": "mse"},
    loss_weights={"main_output": 1.0, "aux_output": 0.3},
    run_eagerly=True
)

In [36]:
hist = model2.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
    verbose=0
)

In [37]:
pred_main, pred_aux = model2.predict(X_test)

print("Aux MLP Main MSE:", mean_squared_error(y_main_test, pred_main))
print("Aux MLP Aux MSE:", mean_squared_error(y_aux_test, pred_aux))

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Aux MLP Main MSE: 5500.273787436264
Aux MLP Aux MSE: 255.0631983125803


In [38]:
results = {}

In [39]:

lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_main_train)
pred = lasso.predict(X_test)
results["LASSO"] = mean_squared_error(y_main_test, pred)

  model = cd_fast.enet_coordinate_descent(


In [40]:
xg = xgb.XGBRegressor(
    n_estimators=300, learning_rate=0.05, max_depth=4
)
xg.fit(X_train, y_main_train)
pred = xg.predict(X_test)
results["XGBoost"] = mean_squared_error(y_main_test, pred)
results["XGBoost_pred"] = pred

In [41]:
train_ds = lgb.Dataset(X_train, y_main_train)
params = {"objective": "regression", "learning_rate": 0.05}
lgb_model = lgb.train(params, train_ds, 300)
pred = lgb_model.predict(X_test)
results["LightGBM"] = mean_squared_error(y_main_test, pred)
results["LightGBM_pred"] = pred

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000348 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2854
[LightGBM] [Info] Number of data points in the train set: 753, number of used features: 25
[LightGBM] [Info] Start training from score 185.717131


In [42]:
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_main_train)
pred = knn.predict(X_test)
results["KNN"] = mean_squared_error(y_main_test, pred)
results["KNN_pred"] = pred

In [43]:
pred = best_model.predict(X_test)
results["MLP Baseline"] = mean_squared_error(y_main_test, pred)
results["MLP Baseline_pred"] = pred

In [44]:
results["MLP+Aux"] = mean_squared_error(y_main_test, pred_main)
results["MLP+Aux_pred"] = pred_main

In [56]:
print("\nModel Comparison MSE:\n")
for k, v in results.items():
    if isinstance(v, (int, float, np.floating)):
      print(f"{k:15s}: {v:.4f}")



Model Comparison MSE:

LASSO          : 245867.7859
XGBoost        : 4689.1856
LightGBM       : 2377.3736
KNN            : 2870.6434
MLP Baseline   : 2204.3406
MLP+Aux        : 5500.2738


In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from io import BytesIO
import base64

def plot_pred_vs_actual_img(y_true, y_pred, title):
    plt.figure(figsize=(7,6))
    sns.scatterplot(x=y_true, y=y_pred, alpha=0.5, s=30)
    plt.plot([y_true.min(), y_true.max()],
             [y_true.min(), y_true.max()],
             color="red", linewidth=2)
    plt.title(title)
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    return fig_to_base64_and_close()

def plot_residuals_img(y_true, y_pred, title):
    residuals = y_true - y_pred
    plt.figure(figsize=(7,6))
    sns.histplot(residuals, kde=True)
    plt.title(f"{title} Residual Distribution")
    plt.xlabel("Residual")
    return fig_to_base64_and_close()

def fig_to_base64_and_close():
    buf = BytesIO()
    plt.savefig(buf, format="png", bbox_inches="tight")
    buf.seek(0)
    img = base64.b64encode(buf.getvalue()).decode()
    plt.close()
    return img

def to_1d(x):
    x = np.array(x)
    return x.ravel()

In [58]:
plots = {}

pairs = {
    "MLP Baseline": (y_main_test, pred),
    "MLP+Aux": (y_main_test, pred_main),
    "XGBoost": (y_main_test, results["XGBoost_pred"]),
    "LightGBM": (y_main_test, results["LightGBM_pred"]),
    "KNN": (y_main_test, results["KNN_pred"])
}

for name, (yt, yp) in pairs.items():
    plots[name + "_pred"] = plot_pred_vs_actual_img(
        np.array(yt).ravel(),
        np.array(yp).ravel(),
        f"{name}: Prediction vs Actual"
    )

for name, (yt, yp) in pairs.items():
    plots[name + "_resid"] = plot_residuals_img(
        np.array(yt).ravel(),
        np.array(yp).ravel(),
        name
    )

xgb.plot_importance(xg, max_num_features=15, height=0.5)
plt.title("XGBoost Feature Importance")
plots["xgb_importance"] = fig_to_base64_and_close()

lgb.plot_importance(lgb_model, max_num_features=15)
plt.title("LightGBM Feature Importance")
plots["lgb_importance"] = fig_to_base64_and_close()

plt.figure(figsize=(14, 10))
corr = full[all_features].corr()
sns.heatmap(corr, cmap="coolwarm", center=0)
plt.title("Feature Correlation Heatmap")
plots["corr_heatmap"] = fig_to_base64_and_close()

In [59]:
html = """
<html>
<head>
<title>Loan Pricing ML Report</title>
<style>

body {
    font-family: Arial, sans-serif;
    margin: 20px 60px;
    background: #fafbfc;
    color: #333;
}

h1 {
    font-size: 40px;
    font-weight: 700;
    text-align: center;
    margin-bottom: 40px;
    color: #1d3fa6;
}

h2 {
    margin-top: 50px;
    font-size: 28px;
    color: #1d3fa6;
}

h3 {
    margin-top: 30px;
    font-size: 20px;
    color: #444;
}

.section {
    margin-top: 40px;
}

.table-container {
    margin-top: 20px;
    width: 60%;
    margin-left: auto;
    margin-right: auto;
}

table {
    border-collapse: collapse;
    width: 100%;
    background: white;
    font-size: 16px;
    border-radius: 10px;
    overflow: hidden;
    box-shadow: 0 0 10px rgba(0,0,0,0.1);
}

table th {
    background: #1d3fa6;
    color: white;
    padding: 12px;
}

table td {
    padding: 12px;
    border-bottom: 1px solid #eee;
}

img {
    width: 700px;
    display: block;
    margin: 25px auto;
    padding: 6px;
    background: white;
    border-radius: 8px;
    border: 1px solid #ccc;
}

</style>
</head>
<body>

<h1>Loan Pricing Machine Learning Report</h1>

<h2>Model Performance (Test MSE)</h2>

<div class='table-container'>
<table>
<tr><th>Model</th><th>MSE</th></tr>
"""

for k, v in results.items():
    if not k.endswith("_pred"):
        html += f"<tr><td>{k}</td><td>{v:.4f}</td></tr>"

html += "</table></div>"

html += "<h2>Prediction vs Actual</h2>"
for k, img in plots.items():
    if k.endswith("_pred"):
        label = k.replace("_pred", "")
        html += f"<h3>{label}</h3>"
        html += f"<img src='data:image/png;base64,{img}'/>"

html += "<h2>Residual Distributions</h2>"
for k, img in plots.items():
    if k.endswith("_resid"):
        label = k.replace("_resid", "")
        html += f"<h3>{label}</h3>"
        html += f"<img src='data:image/png;base64,{img}'/>"

html += "<h2>Feature Importance</h2>"
html += "<h3>XGBoost</h3>"
html += f"<img src='data:image/png;base64,{plots['xgb_importance']}'/>"
html += "<h3>LightGBM</h3>"
html += f"<img src='data:image/png;base64,{plots['lgb_importance']}'/>"

html += "<h2>Correlation Heatmap</h2>"
html += f"<img src='data:image/png;base64,{plots['corr_heatmap']}'/>"

html += "</body></html>"

with open("loan_pricing_report.html", "w") as f:
    f.write(html)

print("Saved loan_pricing_report.html")

Saved loan_pricing_report.html
