PART A

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv(r'framingham_heart_disease.csv')
df = df[['male', 'age', 'BMI', 'heartRate', 'sysBP']]
original_size = df.shape[0]
df.dropna(how='any', inplace=True)
no_null_size = df.shape[0]
df.insert(loc=0, column='intercept', value=1)
print(f'Data size after dropping Null values is'
 f' {no_null_size / original_size * 100:0.3}% of the original data size')

Data size after dropping Null values is 99.5% of the original data size


In [20]:
def generate_sample(df, size):
    np.random.seed(555)
    flag = True
    sample = None
    while flag: #will take another sample if there is one sex only
        sample = df.sample(size,ignore_index=True)
        counts_gender = sample[['male', 'heartRate']].groupby('male').count()
        male_count = counts_gender['sysBP'][1]
        female_count = counts_gender['sysBP'][0]
        if female_count != 0 and male_count != 0:
            if abs(female_count - male_count) <= size * 0.1:
                flag = False
    return sample

sample = generate_sample(df, 200)
X_variables = ['intercept','age', 'BMI', 'heartRate']
y_variable = 'sysBP'
X = df[X_variables]
y = df[y_variable]
X_sample = sample[X_variables]
y_sample = sample[y_variable]

In [21]:
import scipy.stats as stats
z_alpha = stats.norm.ppf(0.975)
#Q1.a
def calculate_beta(X, y):
    # Calculate MLE
    C = X.T @ X
    C_inv = np.linalg.inv(C)
    C_inv_X = C_inv @ X.T
    beta = C_inv_X @ y
    return np.round(beta, 4).to_numpy()

def beta_CI(X, y, beta_sample, x_variables):
    n = X.shape[0]
    p = X.shape[1]
    C = np.linalg.inv(X.T @ X)
    e = y - np.dot(X,beta_sample) #residuals
    res_var_estimate = (1 / (n - p)) * (e.T @ e) #sigma-hat squared
    CI = []
    for i, variable in enumerate(x_variables):
        std_estimate = np.sqrt(res_var_estimate * (C[i][i])) #SE of Beta_i
        CI.append(np.round([beta_sample[i] - z_alpha * std_estimate,
                            beta_sample[i] + z_alpha * std_estimate], 4))
    return CI

In [22]:
#Q1.a
beta_sample = calculate_beta(X_sample,y_sample)
regular_CI = beta_CI(X_sample, y_sample, beta_sample,X_variables)
print(f'Beta = {beta_sample}')
for i in range(beta_sample.size):
    print(f"CI for beta_{i} is: {regular_CI[i]}")


Beta = [41.7462  0.9039  0.785   0.3429]
CI for beta_0 is: [15.3282 68.1642]
CI for beta_1 is: [0.6169 1.1909]
CI for beta_2 is: [0.0966 1.4734]
CI for beta_3 is: [0.1438 0.542 ]


In [23]:
#Q1.b
B = 400
bootstrap_beta_hist = []
bootstrap_beta_var = []
np.random.seed(555)
for i in range(B):
    bootstrap_data = sample.sample(frac=1,replace=True) # sampling 200 rows with returns
    X_i = bootstrap_data[X_variables]
    y_i = bootstrap_data[y_variable]
    bootstrap_beta = calculate_beta(X_i,y_i)
    bootstrap_beta_hist.append(bootstrap_beta)
    bootstrap_beta_var.append(bootstrap_beta)
beta_se = np.std(bootstrap_beta_var,axis=0)
normal_CI = []
for i in range(X_sample.shape[1]):
    CI = np.round([beta_sample[i] - z_alpha * beta_se[i],
                        beta_sample[i] + z_alpha * beta_se[i]], 4)
    normal_CI.append(CI)
    print(f"Normal based CI for beta_{i} is: {CI}")

Normal based CI for beta_0 is: [20.7928 62.6996]
Normal based CI for beta_1 is: [0.6238 1.184 ]
Normal based CI for beta_2 is: [0.007 1.563]
Normal based CI for beta_3 is: [0.1655 0.5203]


In [24]:
#Q1.c
bootstrap_beta_hist = np.array(bootstrap_beta_hist)
pivotal_CI = []
for i,(beta_est,beta_bootstrap) in enumerate(zip(beta_sample,bootstrap_beta_hist.T)):
    beta_quantiles = np.quantile(beta_bootstrap,[0.025,0.975])
    CI = np.round([2*beta_est - beta_quantiles[1], 2*beta_est - beta_quantiles[0]],4)
    pivotal_CI.append(CI)
    print(f"Pivotal CI for beta_{i} is: {CI}")

Pivotal CI for beta_0 is: [22.9332 63.2129]
Pivotal CI for beta_1 is: [0.6028 1.1811]
Pivotal CI for beta_2 is: [0.0458 1.5722]
Pivotal CI for beta_3 is: [0.1641 0.4998]


In [25]:
#Q1.d
quantile_CI = []
for i,(beta_est,beta_bootstrap) in enumerate(zip(beta_sample,bootstrap_beta_hist.T)):
    beta_quantiles = np.quantile(beta_bootstrap,[0.025,0.975])
    CI = np.round([beta_quantiles[0], beta_quantiles[1]],4)
    quantile_CI.append(CI)
    print(f"Pivotal CI for beta_{i} is: {CI}")

Pivotal CI for beta_0 is: [20.2795 60.5592]
Pivotal CI for beta_1 is: [0.6267 1.205 ]
Pivotal CI for beta_2 is: [-0.0022  1.5242]
Pivotal CI for beta_3 is: [0.186  0.5217]


In [26]:
#Q2
regular_CI_lengths = [ci[1]-ci[0] for ci in regular_CI]
normal_CI_lengths = [ci[1]-ci[0] for ci in normal_CI]
pivotal_CI_lengths = [ci[1]-ci[0] for ci in pivotal_CI]
quantile_CI_lengths = [ci[1]-ci[0] for ci in quantile_CI]
data = list(zip(regular_CI_lengths,normal_CI_lengths,pivotal_CI_lengths,quantile_CI_lengths))
CI_lengths_comparison = pd.DataFrame(data,index=['regular','normal','pivotal','quantile'],columns=['beta 0','beta 1', 'beta 2', 'beta 3'])

In [27]:
def CI_contains(CI,type,actual_betas):
    for i,ci in enumerate(CI):
        decision = 'contains' if ci[0]<=actual_betas[i]<=ci[1] else "doesn't contain"
        print(f'the {type} CI {list(ci)} {decision} beta_{i} = {actual_betas[i]}')

In [28]:
actual_betas = calculate_beta(X,y)
print("Regular CI")
CI_contains(regular_CI,'regular',actual_betas)
print("\nNormal CI")
CI_contains(normal_CI,'normal',actual_betas)
print("\nPivotal CI")
CI_contains(pivotal_CI,'pivotal',actual_betas)
print("\nQuantile CI")
CI_contains(quantile_CI,'quantile',actual_betas)

Regular CI
the regular CI [15.3282, 68.1642] contains beta_0 = 26.1297
the regular CI [0.6169, 1.1909] contains beta_1 = 0.9203
the regular CI [0.0966, 1.4734] contains beta_2 = 1.4356
the regular CI [0.1438, 0.542] contains beta_3 = 0.3102

Normal CI
the normal CI [20.7928, 62.6996] contains beta_0 = 26.1297
the normal CI [0.6238, 1.184] contains beta_1 = 0.9203
the normal CI [0.007, 1.563] contains beta_2 = 1.4356
the normal CI [0.1655, 0.5203] contains beta_3 = 0.3102

Pivotal CI
the pivotal CI [22.9332, 63.2129] contains beta_0 = 26.1297
the pivotal CI [0.6028, 1.1811] contains beta_1 = 0.9203
the pivotal CI [0.0458, 1.5722] contains beta_2 = 1.4356
the pivotal CI [0.1641, 0.4998] contains beta_3 = 0.3102

Quantile CI
the quantile CI [20.2795, 60.5592] contains beta_0 = 26.1297
the quantile CI [0.6267, 1.205] contains beta_1 = 0.9203
the quantile CI [-0.0022, 1.5242] contains beta_2 = 1.4356
the quantile CI [0.186, 0.5217] contains beta_3 = 0.3102


In [29]:
#Q3 - new sample
df_new = df.merge(sample, how='left', indicator=True)
df_new = df_new[df_new['_merge'] == 'left_only']
new_sample = generate_sample(df_new,100)
new_X_sample = new_sample[X_variables]
new_y_sample = new_sample[y_variable]

In [30]:
#Q3.a - prediction
new_beta_sample = calculate_beta(new_X_sample,new_y_sample)
new_y_pred = np.matmul(new_X_sample, new_beta_sample)

In [31]:
#Q3.b - CI for E[y_new|x_new]
B = 400
prediction_hist = []
for i in range(B):
    bootstrap_data = new_sample.sample(frac=1,replace=True) # sampling 200 rows with returns
    X_i = bootstrap_data[X_variables]
    y_i = bootstrap_data[y_variable]
    bootstrap_beta = calculate_beta(X_i,y_i)
    bootstrap_y_pred = np.matmul(X_i,bootstrap_beta)
    prediction_hist.append(bootstrap_y_pred)

prediction_hist = np.array(prediction_hist)
pred_se = np.std(prediction_hist,axis=0)
pred_mean = np.mean(prediction_hist,axis=0)
normal_CI = []
for i in range(prediction_hist.shape[1]):
    CI = np.round([pred_mean[i] - z_alpha * pred_se[i],
                        pred_mean[i] + z_alpha * pred_se[i]], 4)
    normal_CI.append(CI)


In [32]:
#Q3.c
actual_confidence = np.mean([ int(ci[0]<=pred<=ci[1])
                              for ci,pred in zip(normal_CI,new_y_pred)])
print(f'{actual_confidence}% of the Bootstraped-CIs contain the predicted value of Ynew,\n'
      f'which is {np.abs(actual_confidence-0.95):.3} far from the desired confidence level of 95%')

0.96% of the Bootstraped-CIs contain the predicted value of Ynew,
which is 0.01 far from the desired confidence level of 95%


#Q3.d<br>
We will use the formula from Tirgul 5:<br>
$CI(Y_{new}|x_{new})=\hat{Y}_{new}\ \pm\ Z_{\frac{\alpha}{2}}\sqrt{\hat{\sigma^2_{\epsilon}} \cdot X_{new} \cdot C \cdot X_{new}^T\ + \hat{\sigma^2_{\epsilon}}}$<br>
So, we will calculate $\hat{\sigma^2_{\epsilon}}$ within every bootstrap sample(aka the noise variance) and add it to the variance of the prediction $Var(\hat{Y}_{new})$.<br>
Normal estimation is still valid (as we used it in the CI of $E[\hat{Y}_{new}|\hat{x}_{new}]$) because the "noise" is normally distributed too.

!!!!!!! Part B !!!!!!!

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stat

In [None]:
#we've dropped 'education' feature (irrelevant for our questions)
#we've dropped 'glucose' feature since it had too many NaN values
#we've dropped all NaN containing samples and left with 4088 samples
df = pd.read_csv(r'/Users/avishagnevo/Desktop/ex1_stats2/framingham_heart_disease.csv')
df.drop(['glucose', 'education', 'prevalentHyp', 'prevalentStroke'], inplace=True, axis=1)
df.dropna(how='any', inplace=True)

In [None]:
#### Distribution visualization  ####
#does heart rate measurements effect on 10-year risk of coronary heart disease varies between sex
sns.pairplot(data=df, x_vars=['heartRate'], y_vars=['TenYearCHD'], hue='male', kind='reg',
             markers=['o', 'x'], height=2, aspect=3)
plt.title("Linear Regression\nheart rate effect on 10-year risk of coronary heart disease varies between sex")

In [None]:
flag=True
while flag: #will take another sample if there is one sex only
    sample = df.sample(200)
    counts_gender = sample[['male', 'heartRate']].groupby('male').count()
    counts_chances = sample[['TenYearCHD', 'heartRate']].groupby('TenYearCHD').count()
    #taking care that there are females and males +
    #more than 1/5 of the people that has 10-year risk of coronary heart disease
    if counts_gender['heartRate'][0] != 0 and counts_gender['heartRate'][1] != 0 and counts_chances['heartRate'][1] > 40:
        flag = False
sample_var = sample[['male', 'heartRate']].groupby('male').var()
sample_var1 = sample_var['heartRate'][0] #woman
sample_var2 = sample_var['heartRate'][1] #man

mean_gender = sample[['male', 'heartRate']].groupby('male').mean()
mu1 = mean_gender['heartRate'][0] #woman
mu2 = mean_gender['heartRate'][1] #man
delta = mu1 - mu2

n_gender = sample[['male', 'heartRate']].groupby('male').count()
n1 = n_gender['heartRate'][0] #woman
n2 = n_gender['heartRate'][1] #man

B.3.a:
as the average being an estimator of the expected value-
$\hat{\mu_1} = \overline{X}_{woman}$
$\hat{\mu_2} = \overline{X}_{man}$
Define $X = X_{woman} - X_{man}$,
MLE estimator for the expected value is $\delta = \overline{X} = \hat{E}(X) = \hat{E}(X_{woman} - X_{man}) =  \hat{E}(X_{woman}) - \hat{E}(X_{man}) \rightarrow$ so $\hat{\delta} = \hat{\mu_1} - \hat{\mu_2} = \overline{X}_{woman} - \overline{X}_{man}$

B.3.b:
Define $S^2_p = \frac{(n_{woman} - 1)S^2_{woman} + (n_{man} - 1)S^2_{man}}{n_{woman} + n_{man} -2}$

CI for the expected value of $E(X) = E(X_{woman} - X_{man})$ is $[\hat{\delta} - t_{\frac{\alpha}{2}} S_p\sqrt{\frac{1}{n_{woman}} + \frac{1}{n_{man}}}$ , $\hat{\delta} + t_{1- \frac{\alpha}{2}} S_p\sqrt{\frac{1}{n_{woman}} + \frac{1}{n_{man}}}]$
We'll conduct normality visual check, also we'll conduct F test to check variances equality: $$
H0 : \sigma_{woman} = \sigma_{man} \\
H1 : \sigma_{woman} \neq \sigma_{man} \\
T.S. : F = \frac{S_{man}^2}{S_{woman}^2} \\
RR : F > F_{(n_{man}-1 , n_{woman}-1),0.95}
$$


In [None]:
#CI
t_quantile = 1.96 #1-a
Sp_2 = (((n1 - 1) * sample_var1) + ((n2 - 1) * sample_var2)) / (n1 + n2 - 2)
t_CI = (delta - t_quantile*np.sqrt(Sp_2*(1/n1 + 1/n2)),
             delta + t_quantile*np.sqrt(Sp_2*(1/n1 + 1/n2)))
print(f'CI for the expected values difference is:\n({t_CI[0]:.03} , {t_CI[1]:.03})')

t_test_stat = delta / (np.sqrt(Sp_2*(1/n1 + 1/n2)))
t_RR = np.absolute(t_test_stat) > t_quantile
t_pvalue = 1-stat.t.cdf(7.39,df=4086)+stat.t.cdf(-7.39,df=4086)

# normalization check using histograms
sns.histplot(sample[sample['male']==0]['heartRate'], color="orange", bins=25)
sns.histplot(sample[sample['male']==1]['heartRate'], color="blue", bins=25)
plt.legend(['female', 'male'])
plt.title("Heart rate distribution of male and female groups")

# F testing
F = sample_var1/sample_var2
F_quantile = stat.f.ppf(0.95, n1, n2)
print('F test results:')
print(f'F = {F:.03}, and as the 0.95 quantile of F distibution with ({n1},{n2}) df = {F_quantile:.03} we {"reject the null hypothesis" if F>F_quantile else "do not reject the null hypothesis"}')

B.3.c:

In [None]:
mean_gender_all = df[['male', 'heartRate']].groupby('male').mean()
mu1_all = mean_gender_all['heartRate'][0] #woman
mu2_all = mean_gender_all['heartRate'][1] #man
delta_all = mu1_all - mu2_all
print(f'{"The samples CI includes the expected values difference based on the whole sample" if (t_CI[0] < delta_all < t_CI[1]) else "The samples CI does not include the expected values difference based on the whole sample"} ')

B.3.d:
Wald test :
$$
H0 : \delta = 0 \\
H1 : \delta \neq 0 \\
T.S. : W = \frac{\hat{\delta}-\delta_0}{\hat{se}} \\
R.R. : |W| > z_{\frac{\alpha}{2}}
$$
Permutation test :
$$
H0 : F_{X_{woman}} = F_{X_{man}} \\
H1 : F_{X_{women}} \succ F_{X_{men}} \\
T.S. : T_0 = \overline{X}_{women} - \overline{X}_{men} \\
R.R. : \frac{1}{N \choose n_{woman}} \sum_{i=1}^{N \choose n_{woman}}I\{T_i \geq T_0\} < \alpha
$$


In [None]:
#Wald test
W = delta / np.sqrt(Sp_2)
z_quantile = stat.norm.ppf(0.975)
print(f'{"We reject the null hypothesis" if np.abs(W) > z_quantile else "We do not reject the null hypothesis"} with cl of 0.95')

In [None]:
#permutations test
B=1000
bootstrap =[]
for _ in range(B):
    sample1 = sample.sample(n1, replace=False)
    sample2 = sample.drop(sample1.index)
    bootstrap.append(sample1['heartRate'].mean() - sample2['heartRate'].mean())
alpha = np.sum(np.array([1 if delta<bootstrap[i] else 0 for i in range(len(bootstrap))]))/B
print(f'We got that the expected values difference is the {int((1-alpha)*200)} order statistic out of 200,\n'
      f'for pvalue = {alpha} > 0.05 we do not reject the null hypothesis for cl of 0.95')

B.4.a:
The medians difference is the MLE of a median of $X_{women} - X_{men}$
Note that we assume that the population distribution is Gaussian to a sufficiently accurate approximation, for that some of its parameters are equal, e.g., the mean, median, and mode all have the same value (different definitions, but same value).
Since the MLE of the mean difference of women and mens is the sample's mean difference so it is also the MLE of the median, and since the median of each sample is equal to its mean (for large n), the sample's median difference is the MLE of the sample's median.

In [None]:
median1, median2 = sample[['male', 'heartRate']].groupby('male').quantile()['heartRate']
delta = mu1 - mu2

B.4.b:
Define the 50% quantile (=median) of the general distribution to be M.
CI for the M's position would be $CI=[np - z_{1-\frac{\alpha}{2}} \sqrt{np(1-p)}, np - z_{1-\frac{\alpha}{2}} \sqrt{np(1-p)}]$
Proof : define $I \sim Ber(p)$, under the null hypothesis, $p=\frac{1}{2}$ which means that M is the median,
under the null hypothesis if M's the median then for n big enough, the middle position of the ordered sample should be held by M.
We want to show that the $P_{H0}(\frac{n}{2} \in [np - z_{1-\frac{\alpha}{2}} \sqrt{np(1-p)}, np - z_{1-\frac{\alpha}{2}} \sqrt{np(1-p)}]) \approx 1-\alpha$
$P_{H0}(\frac{n}{2} \in [np - z_{1-\frac{\alpha}{2}} \sqrt{np(1-p)}, np - z_{1-\frac{\alpha}{2}} \sqrt{np(1-p)}])$ =
$P_{H0}(\frac{1}{2} \in [p - z_{1-\frac{\alpha}{2}} \sqrt{\frac{p(1-p)}{n}}, p + z_{1-\frac{\alpha}{2}} \sqrt{\frac{p(1-p)}{n}}])$ =
$P_{H0}(\frac{p-\frac{1}{2}}{\sqrt{\frac{p(1-p)}{n}}} \in [z_{\frac{\alpha}{2}}, z_{1-\frac{\alpha}{2}}]) = 1-\alpha$
Since $p = \overline{I}_n$ from Central Limit Theorem $\frac{\overline{I}_n-\frac{1}{2}}{\sqrt{\frac{p(1-p)}{n}}} \sim N(0,1)$.

In [None]:
#using Mann-Whitney here
diffs = []
for x_women in sample[sample['male']==0]['heartRate']:
    for x_men in sample[sample['male']==1]['heartRate']:
        diffs.append(x_women - x_men)
p = np.sum(np.array([1 if delta>=diffs[i] else 0 for i in range(len(bootstrap))]))/B
CI = [int(p*(n1+n2)-z_quantile*np.sqrt(p*(1-p)*(n1+n2))), int(p*(n1+n2)+z_quantile*np.sqrt(p*(1-p)*(n1+n2)))]
print(f'The CI for the placement of the median in a sample sized 200 is {CI}')

B.4.c:

In [None]:
median_all = df['heartRate'].median()
heartRates_sorted = np.sort(df['heartRate'].to_numpy())
placements = [i for i in range(len(df['heartRate'])) if heartRates_sorted[i] == 75.0]
n_whole = df.shape[0]
CI_whole = [int(CI[0]*n_whole/(n1+n2)), int(CI[1]*n_whole/(n1+n2))]
print(f'The median based on the whole sample is {median_all}')
print(f'The CI for the median position (proportionally to the whole samples size) is {CI_whole}')
print(f'{"The median position is in the samples positions CI" if heartRates_sorted[CI_whole[0]]<=median_all<=heartRates_sorted[CI_whole[1]] else "The median position is not in the samples positions CI"}')

B.4.d:
Permutation test :
$$
H0 : F_{X_{woman}} = F_{X_{man}} \\
H1 : F_{X_{women}} \succ F_{X_{men}} \\
T.S. : T_0 = M_{women} - M_{men} \\
R.R. : \frac{1}{N \choose n_{woman}} \sum_{i=1}^{N \choose n_{woman}}I\{T_i \geq T_0\} < \alpha
$$



In [None]:
bootstrap_median =[]
for _ in range(B):
    sample1 = sample.sample(n1, replace=False)
    sample2 = sample.drop(sample1.index)
    bootstrap_median.append(sample1['heartRate'].quantile() - sample2['heartRate'].quantile())
alpha = np.sum(np.array([1 if delta<bootstrap_median[i] else 0 for i in range(len(bootstrap_median))]))/B
print(f'We got that the expected values difference is the {int((1-alpha)*200)} order statistic out of 200,\n'
      f'for pvalue = {alpha} < 0.05 we reject the null hypothesis for cl of 0.95')

B.4.e:
t-test :
t-test is for expected values (or the difference between them) and not for medians diffrence, hence that we cannot use it.
Wald test:
We could use Wald test (always as the data is gaussian distributed)

In [None]:
#showing that the expected values differences is gaussian distributed
sns.histplot(bootstrap, color="blue", bins=17)
plt.title("Bootstrap Heart rate differences distribution of male and female groups")

Wilcoxon rank-sum test :
$$
H0 : F_{X_{woman}} = F_{X_{man}} \\
H1 : F_{X_{women}} \succ F_{X_{men}} \\
T.S. : T = \frac{W_S - E[W_S]}{\sqrt{(Var(W_S))}} \quad (W_S = \sum_{i=1}^{n_{women}} S_i ,\quad E(W_S) = \frac{n_{women}(N+1)}{2}), \quad Var(W_S) = \frac{n_{women}n_{men}(N+1)}{12}) \\
R.R. : T \geq z_{\alpha} \ or \ -z_{\alpha} \geq T \quad replaces \quad\{s : P(W_S \geq s) \leq \alpha\}
$$
we can use normal approximation for $W_S$ since under H0 the T.S. is normally distributed
When comparing the results of Wilcoxon test and the test based on Resampling, we get the same results basically (which confirms the hypothesis the Wilcoxon T.S. being asymptotically gaussian distributed).

In [None]:
def ties_handling(order , data):
    values, counts = np.unique(data, return_counts=True)
    for i in range(len(counts)):
        start, end = degrees_sum(counts,i)
        if end==202:
            end=200
        if end>200:
            return order
        avg_order = (order[start]+order[end-1])/2
        for j in range(start,end):
            order[j] = avg_order

def degrees_sum(counts, index):
    start = int(np.sum([counts[i] for i in range(index)]))
    return start, start+counts[index]

sample_order = np.argsort(list(sample['heartRate']))+1
order_stats_sorted = ties_handling(list(range(1,sample.shape[0]+1)), heartRates_sorted)
sample['orderStats'] = [x for _,x in sorted(zip(sample_order,heartRates_sorted))]

#Wilcoxon
W_S = sample[['male', 'orderStats']].groupby('male').sum()['orderStats'][0]
W_S_E = n1*(n1+n2+1)/2
W_S_Var = n1*n2*(n1+n2+1)/12
T = (W_S - W_S_E) / np.sqrt(W_S_Var)
print(f'Wilcoxon test:\n'
      f'We reject the null hypothesis for cl 0.95 since T = {T:.03} < {-z_quantile:.03} = z ')

#Resampeling
bootstrap_sum =[]
for _ in range(B):
    sample1 = sample.sample(n1, replace=False)
    bootstrap_sum.append(sample1['orderStats'].sum())
alpha = np.sum(np.array([1 if W_S<bootstrap_sum[i] else 0 for i in range(len(bootstrap_sum))]))/B
print(f'Approximation using Resampling:\n'
      f'We got that the expected values difference is the {int((1-alpha)*200)} order statistic out of 200,\n'
      f'for pvalue = {alpha} < 0.05 thus we do not reject the null hypothesis for cl of 0.95')

B.6.
There's a slight positive statistics difference between the female heart rates and male heart rates, in all kinds of statistics, specially median-wise and degrees sum wise, we guess that we haven't got to reject the null hypothesis when the T.S. was the averaged difference since when looking at the distributions there's a minor difference between the two categories heart rates mean so it is not significant enough to reject the null hypothesis (usually, permutation test is good for extreme statistics results). Looking at the original distributions, we can see that right tail of women heart rate is much longer than the men one, therefore it is reasonable that we got to reject the null hypothesis at almost any of the permutation tests we've considered.