In [2]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the measurement table (per the file name, the ANSUR II male public release)
# into a DataFrame. The CSV is decoded explicitly as ISO-8859-1.
data_path = "../data/ANSUR II MALE Public.csv"
data = pd.read_csv(
    filepath_or_buffer=data_path,
    encoding='ISO-8859-1',
)

In [5]:
# Pull out the two series analysed in the rest of the notebook
# (units as indicated by the column names: inches and pounds).
height, weight = data['Heightin'], data['Weightlbs']

In [23]:
# Sample means of height and weight
mean_height, mean_weight = height.mean(), weight.mean()

# Report each mean on its own line
for label, value in (
    ("Mean Height(in):", mean_height),
    ("Mean Weight(lbs):", mean_weight),
):
    print(label, value)

Mean Height(in): 70.03576678098972
Mean Weight(lbs): 188.0048995590397


In [27]:
# Variances of height and weight, using Series.var() with its default settings
var_height, var_weight = height.var(), weight.var()

# Report each variance on its own line
for label, value in (
    ("Variance Height(in):", var_height),
    ("Variance Weight(lbs):", var_weight),
):
    print(label, value)

Variance Height(in): 8.687277150202279
Variance Weight(lbs): 896.5155359002252


In [13]:
# Pearson correlation between height and weight
# ("pearson" is the Series.corr default, spelled out here for the reader)
correlation = height.corr(weight, method="pearson")
print("Correlation coefficient:", correlation)

Correlation coefficient: 0.45291076161399163


In [15]:
# OLS slope from the identity beta = r * sqrt(Var(weight) / Var(height))
variance_ratio = var_weight / var_height
beta = correlation * variance_ratio ** 0.5
# Intercept: the fitted line passes through the point of means
alpha = mean_weight - beta * mean_height
print(f"OLS Estimator: \nSlope (beta) = {beta}, \nIntercept (alpha) = {alpha}")

OLS Estimator: 
Slope (beta) = 4.600973284362776, 
Intercept (alpha) = -134.22779235015594


In [16]:
# Fitted values from the OLS line: y_hat = beta * height + alpha
y_hat = beta * height + alpha
# Residuals: observed weight minus fitted weight
residuals = weight - y_hat
# Sample covariance between the regressor (height) and the residuals.
# Series.cov normalises by (n - 1) by default, the same convention as the
# Series.var() calls used for the variances in this notebook; averaging the
# centred products with .mean() would divide by n instead. For an OLS fit the
# value should be zero up to floating-point rounding.
covariance = height.cov(residuals)
# Output the covariance
print(f"Sample covariance between height and residuals: {covariance}")

Sample covariance between height and residuals: -2.6012617940222777e-14


The result is effectively zero (very close to machine precision), confirming that the residuals are uncorrelated with height, as expected in OLS regression.

In [18]:
# Variances of the three series that enter the variance decomposition
var_y = weight.var()             # actual weight
var_y_hat = y_hat.var()          # predicted weight (OLS fitted values)
var_residuals = residuals.var()  # residuals

# Report the three variances
for label, value in (
    ("Sample variance of actual weight:", var_y),
    ("Variance of OLS estimator (predicted weight):", var_y_hat),
    ("Variance of residuals:", var_residuals),
):
    print(label, value)

Sample variance of actual weight: 896.5155359002252
Variance of OLS estimator (predicted weight): 183.900580484835
Variance of residuals: 712.6149554153903


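The relationship is the variance decomposition $\mathrm{Var}(Y) \approx \mathrm{Var}(\hat{Y}) + \mathrm{Var}(\text{residual})$: here 183.90 + 712.61 ≈ 896.52. It holds because the residuals are uncorrelated with height, and therefore with the fitted values, which are a linear function of height.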

In [20]:
# Coefficient of determination (R^2): ratio of the variance of the fitted
# values to the variance of the observed weight
R_squared = var_y_hat / var_y
# Square of the sample correlation coefficient (r^2), for comparison with R^2
corr_squared = pow(correlation, 2)
# Print both quantities so they can be compared directly
print("Sample coefficient of determination (R^2):", R_squared)
print("Squared sample correlation coefficient:", corr_squared)

Sample coefficient of determination (R^2): 0.20512815798576592
Squared sample correlation coefficient: 0.20512815798576595
