In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

# 1. Generate a synthetic regression dataset (global seed fixed for repeatability)
np.random.seed(0)
n_samples = 300
p = 5           # feature dimension

# Design matrix: i.i.d. standard-normal entries, shape (n_samples, p)
X = np.random.standard_normal((n_samples, p))

# Response = linear signal + Gaussian noise scaled by 0.3
true_coef = np.linspace(0.5, 2.5, p)      # p evenly spaced values from 0.5 to 2.5
noise     = 0.3 * np.random.standard_normal(n_samples)
Y         = np.dot(X, true_coef) + noise

# 2. Hold out 40% of the rows as the testing set
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.4, random_state=42
)

# 3. Train a linear model and a depth-limited tree on the training rows
#    (fit() returns the estimator itself, so construction and fitting chain)
model1 = LinearRegression().fit(X_train, Y_train)
model2 = DecisionTreeRegressor(max_depth=5, random_state=1).fit(X_train, Y_train)

# 4. Score the held-out rows with both models
Y1_pred = model1.predict(X_test)
Y2_pred = model2.predict(X_test)

# 5. Pseudo-label y_plus: per row, keep whichever prediction is closer to the
#    true test response (ties go to model1 because of the <=)
abs_err1 = np.abs(Y1_pred - Y_test)
abs_err2 = np.abs(Y2_pred - Y_test)
mask     = abs_err1 <= abs_err2
Y_plus   = np.where(mask, Y1_pred, Y2_pred)

# 6. Collect features, true response, both predictions and y_plus in one frame
columns = [f"x{i+1}" for i in range(p)] + ["y", "y1", "y2", "y_plus"]
# column_stack turns each 1-D vector into a single column next to X_test
df_test = pd.DataFrame(
    np.column_stack([X_test, Y_test, Y1_pred, Y2_pred, Y_plus]),
    columns=columns,
)

print("Test data snapshot:")
print(df_test.head())

In [None]:
# Estimate β̂ on one third of df_test, then evaluate the squared-loss score
# terms Xi (Xiᵀβ̂ - Yi) on the remaining two thirds: e1 on D2, f1 on D3.

# 1. Assume df_test exists with columns x1…xp, y, y1, y2, y_plus.
#    Work on a copy so df_test itself is never modified.
df = df_test.copy()

# 2. Identify feature columns (names starting with "x") and set p
feat_cols = [c for c in df.columns if c.startswith("x")]
p         = len(feat_cols)

# 3. Split into D1, D2, D3 (each ~1/3 of df_test):
#    first peel off 1/3 as D1, then halve the remaining 2/3
D1, temp = train_test_split(df, test_size=2/3, random_state=0)
D2, D3   = train_test_split(temp, test_size=0.5, random_state=0)

# 4. Estimate β̂ via OLS on D1
#    NOTE: y1/y2/y3 below are the true responses ("y" column) on D1/D2/D3,
#    not the model-prediction columns "y1"/"y2" of the frame.
X1 = D1[feat_cols].values    # shape (n1 × p)
y1 = D1["y"].values          # shape (n1,)
# Fit WITHOUT an intercept: every residual below is Xiᵀβ̂ - Yi, so an
# intercept fitted here would be silently dropped and β̂ would not be the
# least-squares solution of the model those residuals refer to.
ols = LinearRegression(fit_intercept=False)
ols.fit(X1, y1)
beta_hat = ols.coef_         # shape (p,)

# 5. Compute e1 on D2:
X2   = D2[feat_cols].values
y2   = D2["y"].values
res2 = X2.dot(beta_hat) - y2   # residuals Xiᵀβ̂ - Yi, shape (n2,)
# e1 is (1/n2) ∑ Xi Xiᵀ ⋅ (Xiᵀβ̂ - Yi).
# Scaling row i of X2 by its residual and multiplying by X2 gives the same
# sum of weighted outer products in a single matrix product.
e1 = (X2 * res2[:, None]).T @ X2 / len(X2)   # shape (p × p)

# 6. Compute f1 on D3:
X3   = D3[feat_cols].values
y3   = D3["y"].values
res3 = X3.dot(beta_hat) - y3
# f1_i = Xi * (Xiᵀβ̂ - Yi); broadcasting scales each row by its residual
f1 = X3 * res3[:, None]        # shape (|D3| × p)

# 7. Output
print("β̂ vector =", beta_hat)
print("\ne1 matrix (p×p) =\n", e1)
print("\nFirst 5 rows of f1 on D3:\n", f1[:5])