In [122]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [123]:
# Load the Framingham data and keep only the columns this analysis uses.
df = pd.read_csv(r'framingham_heart_disease.csv')
df = df.loc[:, ['male', 'age', 'BMI', 'heartRate', 'sysBP']]
original_size = len(df)

# Discard every row that has at least one missing value.
df = df.dropna()
no_null_size = len(df)

# Constant column of ones, placed first, named 'intercept'.
df.insert(0, 'intercept', 1)

retained_pct = no_null_size / original_size * 100
print(f'Data size after dropping Null values is'
      f' {retained_pct:0.3}% of the original data size')

Data size after dropping Null values is 99.5% of the original data size


In [124]:
def generate_sample(df, size, seed=555, max_attempts=10_000):
    """Draw ``size`` random rows of ``df`` that are roughly balanced by sex.

    Samples are redrawn until both sexes are present and the female/male
    counts differ by at most 10% of ``size``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'male'`` column coded 0/1 and a ``'sysBP'`` column.
    size : int
        Number of rows to draw (without replacement).
    seed : int, default 555
        Seed applied to NumPy's global RNG before sampling. ``DataFrame.sample``
        draws from that global RNG when no ``random_state`` is passed, so the
        result is reproducible. Note this resets the global RNG state.
    max_attempts : int, default 10_000
        Upper bound on redraws, so an unsatisfiable balance requirement
        fails loudly instead of looping forever.

    Returns
    -------
    pandas.DataFrame
        The accepted sample, re-indexed 0..size-1.

    Raises
    ------
    RuntimeError
        If no balanced sample is found within ``max_attempts`` draws.
    """
    np.random.seed(seed)
    tolerance = size * 0.1
    for _ in range(max_attempts):
        sample = df.sample(size, ignore_index=True)
        # Count per sex directly: an absent sex yields 0 here, whereas looking
        # it up in a groupby result raises KeyError because the group is missing.
        male_count = sample.loc[sample['male'] == 1, 'sysBP'].count()
        female_count = sample.loc[sample['male'] == 0, 'sysBP'].count()
        both_present = female_count != 0 and male_count != 0
        if both_present and abs(female_count - male_count) <= tolerance:
            return sample
    raise RuntimeError(
        f'no sex-balanced sample of size {size} found in {max_attempts} attempts'
    )

# Regressors (including the constant column) and the response column name.
X_variables = ['intercept', 'age', 'BMI', 'heartRate']
y = 'sysBP'

# Draw the 200-row working sample and split it into design matrix / response.
sample = generate_sample(df, 200)
X_sample, y_sample = sample[X_variables], sample[y]

In [163]:
import scipy.stats as stats
# 0.975 quantile of the standard normal: the critical value of a two-sided 95% interval.
z_alpha = stats.norm.ppf(0.975)
#Q1.a
def calculate_beta(X, y, decimals=4):
    """Estimate linear-regression coefficients by least squares.

    Solves the normal equations ``(X'X) beta = X'y``.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray, shape (n, p)
        Design matrix; include a column of ones to fit an intercept.
    y : pandas.Series or numpy.ndarray, shape (n,)
        Response values.
    decimals : int or None, default 4
        Number of decimals the estimate is rounded to; ``None`` returns the
        unrounded solution.

    Returns
    -------
    numpy.ndarray, shape (p,)
        Estimated coefficients, in the column order of ``X``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X'X`` is singular.
    """
    # Form the products on the inputs as given, then convert to plain float
    # arrays so the function works for both pandas and ndarray inputs.
    gram = np.asarray(X.T @ X, dtype=float)
    moment = np.asarray(X.T @ y, dtype=float)
    # Solving the linear system avoids building the explicit inverse.
    beta = np.linalg.solve(gram, moment)
    if decimals is not None:
        beta = np.round(beta, decimals)
    return beta

def beta_CI(X, y, beta_sample, x_variables, confidence=0.95):
    """Normal-approximation confidence intervals for regression coefficients.

    For coefficient ``i`` the interval is ``beta_i -/+ z * SE_i`` with
    ``SE_i = sqrt(sigma2_hat * [(X'X)^-1]_ii)`` and
    ``sigma2_hat = e'e / (n - p)``, where ``e`` are the residuals.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray, shape (n, p)
        Design matrix.
    y : pandas.Series or numpy.ndarray, shape (n,)
        Response values, in the same row order as ``X``.
    beta_sample : array-like, shape (p,)
        Coefficient estimates the intervals are centred on.
    x_variables : sequence
        One entry per coefficient; only its length is used, to decide how
        many intervals are produced.
    confidence : float, default 0.95
        Two-sided confidence level, strictly between 0 and 1.

    Returns
    -------
    dict[int, numpy.ndarray]
        Maps coefficient position to ``[lower, upper]``, rounded to 4 decimals.

    Raises
    ------
    ValueError
        If ``confidence`` is outside (0, 1) or there are no residual degrees
        of freedom (``n <= p``).
    """
    if not 0 < confidence < 1:
        raise ValueError('confidence must lie strictly between 0 and 1')
    n, p = X.shape
    if n <= p:
        raise ValueError('need more observations than coefficients (n > p)')

    design = np.asarray(X, dtype=float)
    beta = np.asarray(beta_sample, dtype=float)
    residuals = np.asarray(y, dtype=float) - design @ beta
    res_var_estimate = (residuals @ residuals) / (n - p)  # sigma-hat squared

    xtx_inv = np.linalg.inv(design.T @ design)
    std_estimates = np.sqrt(res_var_estimate * np.diag(xtx_inv))  # SE of each beta_i

    # Critical value computed locally rather than read from a notebook global.
    z = stats.norm.ppf(0.5 + confidence / 2)
    return {
        i: np.round([beta[i] - z * std_estimates[i],
                     beta[i] + z * std_estimates[i]], 4)
        for i in range(len(x_variables))
    }

In [171]:
# Point estimate on the working sample, then one confidence interval per coefficient.
beta_sample = calculate_beta(X_sample, y_sample)
CI = beta_CI(X_sample, y_sample, beta_sample, X_variables)

print(f'Beta = {beta_sample}')
for i in range(len(beta_sample)):
    print(f"CI for beta_{i} is: {CI[i]}")


Beta = [41.7462  0.9039  0.785   0.3429]
CI for beta_0 is: [15.3282 68.1642]
CI for beta_1 is: [0.6169 1.1909]
CI for beta_2 is: [0.0966 1.4734]
CI for beta_3 is: [0.1438 0.542 ]


In [175]:
# Bootstrap standard errors: refit the regression on B resamples (drawn with
# replacement from the working sample) and use the spread of the refitted
# coefficients as the standard error of each coefficient.
B = 400
sample_size = sample.shape[0]  # resample as many rows as the working sample holds
# NOTE: despite its name, this list collects the coefficient vector of every replicate.
bootstrap_beta_var = []
np.random.seed(555)
for i in range(B):
    idx_i = np.random.choice(np.arange(0, sample_size), sample_size)
    # The drawn values are row positions, so select by position (.iloc), not by label.
    sample_i = sample.iloc[idx_i]
    X_i = sample_i[X_variables]
    y_i = sample_i[y]
    bootstrap_beta = calculate_beta(X_i, y_i)
    bootstrap_beta_var.append(bootstrap_beta)

# Per-coefficient standard deviation across replicates (np.std default, ddof=0).
beta_se = np.std(bootstrap_beta_var, axis=0)

# Normal-approximation intervals centred on the original-sample estimate.
CI = {}
for i in range(X_sample.shape[1]):
    CI[i] = np.round([beta_sample[i] - z_alpha * beta_se[i],
                      beta_sample[i] + z_alpha * beta_se[i]], 4)
    print(f"CI for beta_{i} is: {CI[i]}")

CI for beta_0 is: [20.7928 62.6996]
CI for beta_1 is: [0.6238 1.184 ]
CI for beta_2 is: [0.007 1.563]
CI for beta_3 is: [0.1655 0.5203]
