In [14]:
import pandas as pd
import statsmodels.api as sm

csv_path = "data-table-B5.csv"  # file name supplied by the problem statement
df = pd.read_csv(csv_path)

# The problem asks for 20 randomly chosen rows. A fixed random_state makes the
# draw repeatable, and sort_index() puts the sampled rows back in file order.
sample_df = df.sample(n=20, random_state=12345678).sort_index()

# The index labels are shifted by one before printing so they are reported
# as 1-based row numbers.
selected_rows_1based = (sample_df.index + 1).tolist()
print("Selected rows (1-based):", selected_rows_1based)
print(sample_df)

Selected rows (1-based): [1, 2, 4, 6, 7, 8, 9, 10, 11, 13, 14, 15, 19, 21, 22, 23, 24, 25, 26, 27]
        y    x1   x2     x3     x4       x5       x6    x7
0   36.98   5.1  400  51.37   4.24  1484.83  2227.25  2.06
1   13.74  26.4  400  72.33  30.87   289.94   434.90  1.33
3    8.53  46.4  400  79.15  44.61   164.76   247.14  0.62
5   26.59  12.6  450  89.90  41.26   605.06   907.59  0.76
6   19.07  18.9  450  91.48  41.88   405.37   608.05  1.71
7    5.96  30.2  450  98.60  70.79   253.70   380.55  3.93
8   15.52  53.8  450  98.05  66.82   142.27   213.40  1.97
9   56.61   5.6  400  55.69   8.92  1362.24  2043.36  5.08
10  26.72  15.1  400  66.29  17.98   507.65   761.48  0.60
12   6.99  48.4  400  74.74  33.94   158.05   237.08  0.63
13  45.93   5.8  425  63.71  11.95   130.66  1961.49  2.04
14  43.09  11.2  425  67.14  14.73   682.59  1023.89  1.57
18  26.14  16.7  450  83.88  26.33   458.42   687.62  8.82
20  11.63  24.9  450  79.77  25.66   307.08   460.62  1.72
21   9.59  39.5 

In [15]:
# Full model: regress y on x6 and x7, with an intercept column added by add_constant.
y = sample_df['y']
X_full = sm.add_constant(sample_df[['x6', 'x7']])
model_full = sm.OLS(y, X_full).fit()

# Parts a-d of the textbook problem are read off this summary table.
print(model_full.summary())

# Overall fit: R^2, adjusted R^2, and the F-test for significance of regression.
R2_full, R2adj_full = model_full.rsquared, model_full.rsquared_adj
F_full, p_full = model_full.fvalue, model_full.f_pvalue
print(f"\nFull model: R^2 = {R2_full:.4f}, Adjusted R^2 = {R2adj_full:.4f}")
print(f"F({int(model_full.df_model)}, {int(model_full.df_resid)}) = {F_full:.4f}, p = {p_full:.4e}")

# Individual t-tests: one line per slope coefficient.
print("\nt-tests for coefficients:")
for var in ['x6', 'x7']:
    est, se = model_full.params[var], model_full.bse[var]
    tval, pval = model_full.tvalues[var], model_full.pvalues[var]
    print(f"{var}: estimate = {est:.4f}, SE = {se:.4f}, t = {tval:.3f}, p = {pval:.4e}")

# 95% confidence intervals; conf_int() returns the lower bound in column 0
# and the upper bound in column 1, indexed by coefficient name.
ci_full = model_full.conf_int(alpha=0.05)
print("\n95% CI for beta_6 and beta_7:")
print(f"beta_6: [{ci_full.loc['x6', 0]:.4f}, {ci_full.loc['x6', 1]:.4f}]")
print(f"beta_7: [{ci_full.loc['x7', 0]:.4f}, {ci_full.loc['x7', 1]:.4f}]")


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.788
Model:                            OLS   Adj. R-squared:                  0.763
Method:                 Least Squares   F-statistic:                     31.55
Date:                Fri, 06 Jun 2025   Prob (F-statistic):           1.90e-06
Time:                        10:06:07   Log-Likelihood:                -71.477
No. Observations:                  20   AIC:                             149.0
Df Residuals:                      17   BIC:                             151.9
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.1624      3.965      0.293      0.7

In [16]:
# Simple model: y ~= x_6 only, with an intercept column added by add_constant.
X_x6 = sm.add_constant(sample_df[['x6']])
model_x6 = sm.OLS(y, X_x6).fit()
print(model_x6.summary())

# Part e: fit statistics of the one-predictor model, for comparison with the full model.
R2_x6, R2adj_x6 = model_x6.rsquared, model_x6.rsquared_adj
F_x6, p_x6 = model_x6.fvalue, model_x6.f_pvalue
print(f"Simple model (y ~= x_6): R^2 = {R2_x6:.4f}, Adjusted R^2 = {R2adj_x6:.4f}")
print(f"F({int(model_x6.df_model)}, {int(model_x6.df_resid)}) = {F_x6:.4f}, p = {p_x6:.4e}")

# Part f: 95% CI for beta_6 in the simple model (row 'x6' of the conf_int table;
# label 0 is the lower bound, label 1 the upper bound).
ci_x6 = model_x6.conf_int(alpha=0.05).loc['x6']
print(f"95% CI for beta_6 (simple): [{ci_x6[0]:.4f}, {ci_x6[1]:.4f}]")

# Part g: compare the residual mean squares of the two fitted models.
MS_res_full, MS_res_x6 = model_full.mse_resid, model_x6.mse_resid

print(f"\nMS_res (full model)   = {MS_res_full:.4f}")
print(f"MS_res (simple model) = {MS_res_x6:.4f}")


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.753
Model:                            OLS   Adj. R-squared:                  0.739
Method:                 Least Squares   F-statistic:                     54.81
Date:                Fri, 06 Jun 2025   Prob (F-statistic):           7.26e-07
Time:                        10:06:07   Log-Likelihood:                -73.003
No. Observations:                  20   AIC:                             150.0
Df Residuals:                      18   BIC:                             152.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.3308      3.654      1.185      0.2

Dropping $x_7$ lowers $R^2$ only from $0.7878$ to $0.7528$ (a decrease of about $0.035$), and adjusted $R^2$ falls only from $0.7628$ to $0.7390$ (about $0.024$). In other words, adding $x_7$ to a model that already contains $x_6$ explains only about $3.5$ additional percentage points of the variation in $y$, and $x_7$’s own $t$-test was not significant. Because $x_6$ alone captures the majority of the explained variance and $x_7$’s coefficient is not significant, the simpler model ($y \sim x_6$) is quite reasonable.

The two interval lengths ($0.0127$ vs. $0.0129$) are nearly identical, so dropping $x_7$ from the model does not meaningfully increase the uncertainty about the slope of $x_6$. This indicates that $x_7$ contributes relatively little to explaining $y$ once $x_6$ is in the model: if $x_7$ had a substantial conditional effect, removing it should have widened the confidence interval for $\beta_6$ appreciably more.

Removing $x_7$ raises the residual mean square from about $87.56$ to $96.33$, an increase of roughly $8.77$, or about 10%. Because $x_7$ was not significant, this modest rise in $MS_{\text{res}}$ confirms that $x_7$’s conditional contribution is minor. $x_6$ alone explains most of the variation in $y$, so the simpler one-predictor model is statistically defensible and more parsimonious.