In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the dataset and take a first look at how Y varies with X.
df = pd.read_csv('data/dataset_5.csv')

fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(df['X'], df['Y'], alpha=0.7)
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_title('Scatter plot of Y vs. X')
plt.show()

In [None]:
import statsmodels.api as sm

# Build the design matrix: the regressor X plus an explicit intercept column
# (later cells read the fitted coefficients as 'const' and 'X').
X_sm = sm.add_constant(df['X'])

# Ordinary least squares of Y on the design matrix; `model` holds the
# fitted results that the following cells plot and diagnose.
ols_spec = sm.OLS(endog=df['Y'], exog=X_sm)
model = ols_spec.fit()
print(model.summary())

In [None]:
# Evaluate the fitted line on an evenly spaced grid spanning the observed
# range of X (x_vals / y_vals are reused by the residual plot below).
x_vals = np.linspace(df['X'].min(), df['X'].max(), 100)
intercept_hat = model.params['const']
slope_hat = model.params['X']
y_vals = intercept_hat + slope_hat * x_vals

# Raw data with the OLS line drawn on top.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(df['X'], df['Y'], alpha=0.7, label='Data')
ax.plot(x_vals, y_vals, color='red', lw=2, label='OLS fit')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_title('Scatter of Y vs. X with OLS Regression Line')
ax.legend()
plt.show()

In [None]:
# Two side-by-side residual diagnostics for the fitted OLS model.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax_fit, ax_resid = axes

# Panel 1: residuals as vertical distances between each observation and
# the fitted line.
ax_fit.scatter(df['X'], df['Y'], alpha=0.7, label='Data')
ax_fit.plot(x_vals, y_vals, color='red', lw=2, label='OLS fit')
# vlines accepts array-likes, so every residual segment is drawn in a single
# call instead of one call (and one artist) per data point.
ax_fit.vlines(df['X'], df['Y'], model.fittedvalues, color='gray', alpha=0.5)
ax_fit.set_title('Residuals as Vertical Distance')
ax_fit.set_xlabel('X')
ax_fit.set_ylabel('Y')
ax_fit.legend()

# Panel 2: classic residuals vs. fitted plot; the horizontal line at 0 is
# the reference the residuals should scatter around.
ax_resid.scatter(model.fittedvalues, model.resid, alpha=0.7)
ax_resid.axhline(0, color='red', lw=1)
ax_resid.set_title('Residuals vs. Fitted')
ax_resid.set_xlabel('Fitted values')
ax_resid.set_ylabel('Residuals')

plt.tight_layout()
plt.show()


In [None]:
# Q-Q plot of the OLS residuals, drawn with a 45-degree reference line
# (fit=True / line='45' are passed straight through to statsmodels' qqplot).
fig, ax = plt.subplots(figsize=(6, 4))
sm.qqplot(model.resid, fit=True, line='45', alpha=0.7, ax=ax)
plt.show()


In [None]:
# Case-resampling bootstrap for the OLS intercept and slope:
# refit the regression on rows drawn with replacement and take percentile
# intervals of the refitted coefficients.
n_boot = 1000                          # number of bootstrap replicates
n_obs = len(df)                        # rows per resample = size of the original data
rng = np.random.default_rng(42)        # fixed seed so the intervals are reproducible
boot_params = np.zeros((n_boot, 2))    # columns: [intercept, slope]

for i in range(n_boot):
    # Each resample must contain n_obs rows (not n_boot): the replicate count
    # and the sample size are unrelated, and resampling the wrong number of
    # rows distorts the spread of the bootstrap estimates.
    # Positions (not index labels) are sampled so the draw stays one-row-per-pick
    # even if the index is not unique.
    idx = rng.integers(0, n_obs, size=n_obs)
    sample = df.iloc[idx]
    Xb  = sm.add_constant(sample['X'])
    yb  = sample['Y']
    res = sm.OLS(yb, Xb).fit()
    boot_params[i] = res.params.values

# Percentile 95% intervals from the bootstrap distribution of each coefficient.
ci_int = np.percentile(boot_params[:,0], [2.5, 97.5])
ci_slope = np.percentile(boot_params[:,1], [2.5, 97.5])

print(f"95% CI for intercept: {ci_int}")
print(f"95% CI for slope:     {ci_slope}")


In [None]:
# Report whether the slope's 95% interval strictly straddles zero.
zero_inside = ci_slope[0] < 0 < ci_slope[1]
verdict = (
    "95% CI for slope includes 0. No association concluded."
    if zero_inside
    else "95% CI for slope does not include 0. There is an association."
)
print(verdict)