In [3]:
import numpy as np
import pandas as pd

# Sample size and a seeded generator: every draw below comes from this single
# stream, so a fresh run always produces the same dataset.
n = 10_000
rng = np.random.default_rng(42)

# The columns are drawn in list order (two normal, then two uniform). The order
# matters: each draw advances the generator, so reordering changes the values.
column_draws = [
    ("normal_1", rng.normal(loc=0, scale=1, size=n)),         # N(0, 1)
    ("normal_2", rng.normal(loc=5, scale=2, size=n)),         # N(5, 2^2)
    ("uniform_1", rng.uniform(low=0.0, high=1.0, size=n)),    # U[0, 1)
    ("uniform_2", rng.uniform(low=-3.0, high=3.0, size=n)),   # U[-3, 3)
]
df = pd.DataFrame(dict(column_draws))

# First rows, then per-column summary statistics
for preview in (df.head(), df.describe()):
    print(preview)

# Persist the four generated columns (without the index) to the working directory
df.to_csv("random_dataset.csv", index=False)


   normal_1  normal_2  uniform_1  uniform_2
0  0.304717  5.352554   0.362763   2.302144
1 -1.039984  6.798247   0.942406   2.944811
2  0.750451  2.055459   0.255202  -0.083615
3  0.940565  4.429404   0.834961  -0.239142
4 -1.951035  6.658307   0.896856  -1.018130
           normal_1      normal_2     uniform_1     uniform_2
count  10000.000000  10000.000000  10000.000000  10000.000000
mean      -0.010250      5.040703      0.502009      0.004288
std        1.006336      2.005878      0.288007      1.735452
min       -4.389115     -2.775609      0.000047     -2.999835
25%       -0.677280      3.674732      0.253509     -1.516076
50%       -0.013168      5.018178      0.501470      0.026644
75%        0.649971      6.390794      0.754817      1.502683
max        4.025824     13.302483      0.999982      2.999455


In [5]:
# True coefficients of the data-generating process
a1, a2, a3, a4, a5 = 1.2, -0.7, 0.5, 1.8, 0.6

# Zero-mean Gaussian noise with a small standard deviation.
# NOTE: drawn from the shared `rng`, so the values depend on that generator's
# state at the moment this cell runs.
noise_std = 0.2
epsilon = rng.normal(loc=0.0, scale=noise_std, size=n)

# y = a1*x1 + a2*x2 + a3*x3 + a4*x4 + a5*(x1^2) + epsilon
x1, x2, x3, x4 = (df[name] for name in ("normal_1", "normal_2", "uniform_1", "uniform_2"))
linear_part = a1 * x1 + a2 * x2 + a3 * x3 + a4 * x4
df["y"] = linear_part + a5 * (x1 ** 2) + epsilon  # a5 * x1**2 is the squared term

# Quick check: first rows, then summary statistics for the features and y
summary_cols = ["normal_1", "normal_2", "uniform_1", "uniform_2", "y"]
print(df.head())
print(df.describe()[summary_cols])


   normal_1  normal_2  uniform_1  uniform_2         y
0  0.304717  5.352554   0.362763   2.302144  0.742517
1 -1.039984  6.798247   0.942406   2.944811  0.350154
2  0.750451  2.055459   0.255202  -0.083615 -0.124345
3  0.940565  4.429404   0.834961  -0.239142 -1.446170
4 -1.951035  6.658307   0.896856  -1.018130 -6.026546
           normal_1      normal_2     uniform_1     uniform_2             y
count  10000.000000  10000.000000  10000.000000  10000.000000  10000.000000
mean      -0.010250      5.040703      0.502009      0.004288     -2.678764
std        1.006336      2.005878      0.288007      1.735452      3.762917
min       -4.389115     -2.775609      0.000047     -2.999835    -13.022128
25%       -0.677280      3.674732      0.253509     -1.516076     -5.556518
50%       -0.013168      5.018178      0.501470      0.026644     -2.685474
75%        0.649971      6.390794      0.754817      1.502683      0.077140
max        4.025824     13.302483      0.999982      2.999455     14

In [15]:
import numpy as np

# Reproducible 70/30 train/test split.
# The shuffle uses its own generator (`split_rng`) so that this cell does not
# rebind the notebook-wide `rng` that generated the features and the noise.
# Seed 42 yields the same permutation as before.
split_rng = np.random.default_rng(42)
idx = np.arange(len(df))
split_rng.shuffle(idx)  # in-place permutation of the row positions

cut = int(0.7 * len(df))  # number of training rows (fraction is truncated)
train_idx, test_idx = idx[:cut], idx[cut:]

# Positional selection, then a fresh 0..k-1 index on each part
train_df = df.iloc[train_idx].reset_index(drop=True)
test_df  = df.iloc[test_idx].reset_index(drop=True)

# The two parts must partition the data: every row lands in exactly one split
assert len(train_df) + len(test_df) == len(df)

print(train_df.shape, test_df.shape)


(7000, 5) (3000, 5)


In [17]:
import numpy as np
import pandas as pd

# --- Assumes `df` already exists with columns:
# normal_1, normal_2, uniform_1, uniform_2, y

# 1) 70/30 split (reproducible)
# The shuffle uses its own generator (`split_rng`) so that this cell does not
# rebind the notebook-wide `rng` that generated the features and the noise.
# Seed 42 yields the same permutation as before.
features = ["normal_1", "normal_2", "uniform_1", "uniform_2"]
split_rng = np.random.default_rng(42)
idx = np.arange(len(df))
split_rng.shuffle(idx)  # in-place permutation of the row positions
cut = int(0.7 * len(df))  # number of training rows (fraction is truncated)
train_idx, test_idx = idx[:cut], idx[cut:]

train_df = df.iloc[train_idx].reset_index(drop=True)
test_df  = df.iloc[test_idx].reset_index(drop=True)

X_train = train_df[features].to_numpy()
y_train = train_df["y"].to_numpy()
X_test  = test_df[features].to_numpy()
y_test  = test_df["y"].to_numpy()

# 2) Add intercept column (a leading column of ones)
X_train_aug = np.c_[np.ones(X_train.shape[0]), X_train]
X_test_aug  = np.c_[np.ones(X_test.shape[0]),  X_test]

# 3) OLS via least squares (more stable than explicit inverse)
# beta = argmin ||X_train_aug * beta - y_train||_2
# NOTE: y was generated with an additional a5 * normal_1**2 term that is not
# among `features`, so this purely linear fit is misspecified; expect the
# intercept and the MSE to reflect that omitted term as well as the noise.
beta, residuals, rank, s = np.linalg.lstsq(X_train_aug, y_train, rcond=None)

# beta[0] belongs to the column of ones; the rest follow the order of `features`
intercept = beta[0]
coefs = pd.Series(beta[1:], index=features)

print("Intercept:", intercept)
print("Coefficients:")
print(coefs)

# 4) Predictions & MSE (the test split is only used for evaluation)
yhat_train = X_train_aug @ beta
yhat_test  = X_test_aug  @ beta

mse_train = np.mean((y_train - yhat_train)**2)
mse_test  = np.mean((y_test - yhat_test)**2)

print(f"\nMSE (train): {mse_train:.6f}")
print(f"MSE (test):  {mse_test:.6f}")


Intercept: 0.5769065132960417
Coefficients:
normal_1     1.181553
normal_2    -0.695526
uniform_1    0.519359
uniform_2    1.806909
dtype: float64

MSE (train): 0.814215
MSE (test):  0.754439


In [19]:
import numpy as np
import pandas as pd

# Bootstrap settings
seed = 123     # seed for the resampling generator
n_boot = 10    # number of bootstrap replicates

# Frame to resample from: all data here; use `train_df` instead to bootstrap
# only the training split, depending on your workflow.
base_df = df

def make_bootstrap_samples(dataframe, n_boot=10, seed=None):
    """Draw bootstrap resamples of ``dataframe``.

    Each resample has as many rows as ``dataframe``, chosen uniformly at
    random with replacement, and carries a fresh 0..n-1 index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose rows are resampled.
    n_boot : int, default 10
        Number of resamples to draw.
    seed : optional
        Passed to ``np.random.default_rng``; a fixed value makes the
        resamples reproducible.

    Returns
    -------
    list of pandas.DataFrame
        ``n_boot`` resampled frames, in the order they were drawn.
    """
    generator = np.random.default_rng(seed)
    n_rows = len(dataframe)

    def resample():
        # Row positions drawn with replacement from [0, n_rows)
        picks = generator.integers(0, n_rows, size=n_rows)
        return dataframe.iloc[picks].reset_index(drop=True)

    return [resample() for _ in range(n_boot)]

# Draw the bootstrap replicates from the chosen base frame
boot_samples = make_bootstrap_samples(base_df, seed=seed, n_boot=n_boot)

# Peek at the first replicate
first_replicate = boot_samples[0]
print(first_replicate.head())

   normal_1  normal_2  uniform_1  uniform_2         y
0 -0.053783  5.429782   0.809397   2.762215  1.511815
1  1.059119  2.209087   0.226584   0.052329  0.500495
2  0.172031  2.133818   0.941006   2.475254  3.791499
3 -2.394260  4.193929   0.985119   2.191138  2.088677
4 -1.206998  4.882123   0.945837  -2.762352 -8.434375


Part 6

In [21]:
# Response column and the regressors used for every bootstrap fit
target = "y"
features = [
    "normal_1",
    "normal_2",
    "uniform_1",
    "uniform_2",
]

def fit_ols_return_beta(df_sample, features, target="y"):
    """Fit ordinary least squares with an intercept on one sample.

    Parameters
    ----------
    df_sample : pandas.DataFrame
        Frame containing the feature columns and the target column.
    features : list of str
        Names of the regressor columns, in the order their coefficients
        should appear in the result.
    target : str, default "y"
        Name of the response column.

    Returns
    -------
    numpy.ndarray
        ``[intercept, coef_1, ..., coef_k]`` with one coefficient per entry of
        ``features``.
    """
    regressors = df_sample[features].to_numpy()
    response = df_sample[target].to_numpy()
    # A leading column of ones makes the first coefficient the intercept
    design = np.column_stack([np.ones(len(regressors)), regressors])
    # lstsq returns (solution, residuals, rank, singular values); only the
    # least-squares solution is needed here
    return np.linalg.lstsq(design, response, rcond=None)[0]

# Fit OLS on every bootstrap replicate and collect one row of estimates each.
# The per-replicate estimate is named `boot_beta` so that this loop does not
# overwrite a module-level `beta` (the train-split OLS estimate) as a side effect.
rows = []
for i, bdf in enumerate(boot_samples, start=1):
    boot_beta = fit_ols_return_beta(bdf, features, target)
    # boot_beta = [intercept, then one coefficient per entry of `features`]
    row = {"replicate": i, **dict(zip(["intercept", *features], boot_beta))}
    rows.append(row)

# One row per replicate; columns: replicate, intercept, then the features
coef_boot = pd.DataFrame(rows)
print(coef_boot)

   replicate  intercept  normal_1  normal_2  uniform_1  uniform_2
0          1   0.589363  1.164715 -0.700149   0.513322   1.800083
1          2   0.530093  1.164982 -0.685513   0.500998   1.811133
2          3   0.529094  1.211673 -0.688826   0.542098   1.800773
3          4   0.586900  1.200677 -0.702407   0.537246   1.802607
4          5   0.532613  1.180780 -0.692151   0.539696   1.798151
5          6   0.545060  1.189225 -0.690864   0.511819   1.808773
6          7   0.562282  1.157778 -0.693906   0.528728   1.793894
7          8   0.571696  1.183093 -0.690911   0.469246   1.800420
8          9   0.584451  1.197121 -0.695892   0.484817   1.801039
9         10   0.551229  1.196622 -0.693327   0.518166   1.798855


Part 7

In [23]:
import pandas as pd

# Every column of coef_boot except the replicate counter holds an estimated
# parameter (e.g. 'intercept', 'normal_1', 'normal_2', 'uniform_1', 'uniform_2').
is_param = coef_boot.columns != "replicate"
params = coef_boot.columns[is_param].tolist()

# Mean and standard deviation of each parameter across the replicates
# (rows: mean/std, columns: parameters) ...
per_param = coef_boot[params].agg(["mean", "std"])

# ... reshaped to one row per parameter with columns parameter / mean / std
boot_stats = per_param.T.rename_axis("parameter").reset_index()

print(boot_stats)

   parameter      mean       std
0  intercept  0.558278  0.024027
1   normal_1  1.184666  0.017735
2   normal_2 -0.693395  0.005061
3  uniform_1  0.514614  0.024151
4  uniform_2  1.801573  0.005021


Part 8

All coefficients are consistent with the bootstrap: four lie within 1 SD of the bootstrap mean (the fifth, uniform_2, is ~1.06 SD away), and every estimate falls inside its 95% confidence interval. The bootstrap standard deviations are small, especially for normal_2 and uniform_2, which implies precise estimates.