<a href="https://colab.research.google.com/github/virajvaidya/SRC_ANU_Sampling_1/blob/main/HomeworkW3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import itertools as it
import statistics as st
import warnings

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import norm

In [3]:
# Load the homework workbook once into a module-level DataFrame; `population` is
# the object passed to neyman_allocation further down.
# NOTE(review): despite the name, later code treats len() of this frame as the
# sample size, so it presumably holds the survey sample — confirm.
# NOTE(review): the path points into a mounted Google Drive folder, so this
# presumably only runs in Colab after Drive has been mounted — confirm.
population = pd.read_excel('/content/drive/MyDrive/Sampling I - HW3 data.xlsx')

# I have combined the analysis for Q1 (a), (b), (c) and (d) into one function, for ease of calculation and to avoid repeating steps.

In [8]:
def stratified_estimates(file_path):
    """Compute stratified-sample estimates for BMI and EVERPREG (Q1 a-d).

    Parameters
    ----------
    file_path : str, path-like or pandas.DataFrame
        Either the location of the Excel workbook holding the sample or an
        already-loaded DataFrame. The data must provide the columns
        ``STRATUM``, ``BMI`` and ``EVERPREG``. If a path is given and the file
        cannot be found, the function warns and falls back to the Colab Drive
        location that earlier versions of this function always read, so
        existing calls keep working.

    Returns
    -------
    dict
        Keys: 'Mean BMI', 'SE BMI', '95% CI BMI', 'Proportion EVERPREG',
        'SE EVERPREG', 'Design Effect BMI', 'Design Effect EVERPREG',
        'Proportionate Allocation', 'SE BMI Proportionate Allocation',
        'Design Effect BMI Proportionate Allocation'.

    Raises
    ------
    ValueError
        If the number of strata found in the data differs from the number of
        population stratum sizes hard-coded below.
    """
    # Location that previous versions of this function read unconditionally.
    legacy_path = '/content/drive/MyDrive/Sampling I - HW3 data.xlsx'

    # Resolve the data source: an in-memory frame is used as-is, anything else
    # is treated as something pd.read_excel can open.
    if isinstance(file_path, pd.DataFrame):
        df = file_path
    else:
        try:
            df = pd.read_excel(file_path)
        except FileNotFoundError:
            warnings.warn(
                f"{file_path!r} was not found; falling back to {legacy_path!r}.",
                stacklevel=2,
            )
            df = pd.read_excel(legacy_path)

    # Population sizes for each stratum (Nh).
    # Assumes these are listed in ascending STRATUM order, which is the order
    # groupby/sort_index produce below — TODO confirm against the assignment.
    Nh = np.array([2319, 2583, 2265, 1284, 849])
    N = np.sum(Nh)  # Total population size
    Wh = Nh / N     # Stratum weights
    n = len(df)     # Overall sample size

    # Sample sizes per stratum, ordered by stratum label
    nh = df['STRATUM'].value_counts().sort_index().values
    if len(nh) != len(Nh):
        raise ValueError(
            f"data contains {len(nh)} strata but {len(Nh)} population "
            "stratum sizes are defined"
        )

    by_stratum = df.groupby('STRATUM')

    # Stratified (weighted) mean BMI
    mean_BMI_stratum = by_stratum['BMI'].mean().values
    weighted_mean_BMI = np.sum(Wh * mean_BMI_stratum)

    # Standard error of the stratified mean (no finite population correction)
    var_BMI_stratum = by_stratum['BMI'].var().values  # pandas default ddof=1
    SE_BMI = np.sqrt(np.sum(Wh ** 2 * (var_BMI_stratum / nh)))

    # 95% confidence interval for BMI (normal approximation)
    z = norm.ppf(0.975)
    CI_BMI = (weighted_mean_BMI - z * SE_BMI, weighted_mean_BMI + z * SE_BMI)

    # Stratified proportion of women ever pregnant.
    # Assumes EVERPREG is coded 0/1 so its mean is a proportion — TODO confirm.
    prop_EVERPREG_stratum = by_stratum['EVERPREG'].mean().values
    weighted_prop_EVERPREG = np.sum(Wh * prop_EVERPREG_stratum)

    # Standard error of the stratified proportion
    SE_EVERPREG = np.sqrt(
        np.sum(Wh ** 2 * (prop_EVERPREG_stratum * (1 - prop_EVERPREG_stratum) / nh))
    )

    # Design effects: stratified variance relative to an SRS of the same size.
    # NOTE(review): np.std defaults to ddof=0 while the stratum variances above
    # use ddof=1; kept as-is so previously reported numbers do not change.
    SRS_SE_BMI = np.std(df['BMI']) / np.sqrt(n)
    DEFF_BMI = (SE_BMI ** 2) / (SRS_SE_BMI ** 2)

    SRS_SE_EVERPREG = np.sqrt(weighted_prop_EVERPREG * (1 - weighted_prop_EVERPREG) / n)
    DEFF_EVERPREG = (SE_EVERPREG ** 2) / (SRS_SE_EVERPREG ** 2)

    # Proportionate allocation: n_h = n * N_h / N (left fractional, not rounded)
    nh_prop = Wh * n
    SE_BMI_prop = np.sqrt(np.sum(Wh ** 2 * (var_BMI_stratum / nh_prop)))
    DEFF_BMI_prop = (SE_BMI_prop ** 2) / (SRS_SE_BMI ** 2)

    return {
        'Mean BMI': weighted_mean_BMI,
        'SE BMI': SE_BMI,
        '95% CI BMI': CI_BMI,
        'Proportion EVERPREG': weighted_prop_EVERPREG,
        'SE EVERPREG': SE_EVERPREG,
        'Design Effect BMI': DEFF_BMI,
        'Design Effect EVERPREG': DEFF_EVERPREG,
        'Proportionate Allocation': nh_prop,
        'SE BMI Proportionate Allocation': SE_BMI_prop,
        'Design Effect BMI Proportionate Allocation': DEFF_BMI_prop
    }

# Example usage: run the Q1 analysis and print the dictionary of estimates.
# NOTE(review): 'Sampling I - HW3 data.xlsx' is a relative path — confirm that it
# actually resolves from the notebook's working directory.
file_path = 'Sampling I - HW3 data.xlsx'
results = stratified_estimates(file_path)
print(results)


{'Mean BMI': 27.45625, 'SE BMI': 1.3446294414233448, '95% CI BMI': (24.820824722258035, 30.091675277741967), 'Proportion EVERPREG': 0.6285483870967742, 'SE EVERPREG': 0.07931845450395476, 'Design Effect BMI': 1.3816409765079491, 'Design Effect EVERPREG': 1.0778728022747273, 'Proportionate Allocation': array([ 9.97419355, 11.10967742,  9.74193548,  5.52258065,  3.6516129 ]), 'SE BMI Proportionate Allocation': 1.222488667688469, 'Design Effect BMI Proportionate Allocation': 1.1420356381190746}



#Mean BMI: 27.45625
#SE BMI: 1.3446294414233448
#95% CI BMI: Lower 24.820824722258035, Upper 30.091675277741967
#Proportion EVERPREG: 0.6285483870967742 (62.85%)
#SE EVERPREG: 0.07931845450395476
#Design Effect BMI: 1.3816409765079491
#Design Effect EVERPREG: 1.0778728022747273
#Proportionate Allocation: Stratum 1: 9.97419355, Stratum 2: 11.10967742, Stratum 3:  9.74193548, Stratum 4:  5.52258065, Stratum 5:  3.6516129
#SE BMI Proportionate Allocation: 1.222488667688469
# Design Effect BMI Proportionate Allocation: 1.1420356381190746


#Q1. (c) - The design effects for both BMI and EVERPREG are greater than 1, so we may infer that this stratified design is less efficient than an SRS of the same size would have been. The whole point of stratified sampling is to gain efficiency over SRS, and we observe that this does not happen here.




# Q1 (e) Neyman Allocation:

In [9]:
def neyman_allocation(file_path):
    """Compute the Neyman allocation for BMI and its resulting precision (Q1 e).

    Parameters
    ----------
    file_path : pandas.DataFrame, str or path-like
        Either an already-loaded DataFrame or the location of the Excel
        workbook holding the sample. The data must provide the columns
        ``STRATUM`` and ``BMI``. If a path is given and the file cannot be
        found, the function warns and falls back to the Colab Drive workbook.

    Returns
    -------
    dict
        'Neyman Allocation' (fractional per-stratum sample sizes),
        'SE BMI Neyman Allocation' and 'Design Effect BMI Neyman Allocation'.

    Raises
    ------
    ValueError
        If the number of strata found in the data differs from the number of
        population stratum sizes hard-coded below.
    """
    # Workbook location used when a supplied path does not exist.
    legacy_path = '/content/drive/MyDrive/Sampling I - HW3 data.xlsx'

    # Use the argument itself as the data source instead of a notebook global.
    if isinstance(file_path, pd.DataFrame):
        df = file_path
    else:
        try:
            df = pd.read_excel(file_path)
        except FileNotFoundError:
            warnings.warn(
                f"{file_path!r} was not found; falling back to {legacy_path!r}.",
                stacklevel=2,
            )
            df = pd.read_excel(legacy_path)

    # Population sizes for each stratum (Nh).
    # Assumes these are listed in ascending STRATUM order, which is the order
    # groupby produces below — TODO confirm against the assignment.
    Nh = np.array([2319, 2583, 2265, 1284, 849])
    N = np.sum(Nh)  # Total population size
    n = len(df)     # Overall sample size to be allocated

    by_stratum = df.groupby('STRATUM')['BMI']

    # Standard deviation of BMI per stratum (pandas default ddof=1)
    std_BMI_stratum = by_stratum.std().values
    if len(std_BMI_stratum) != len(Nh):
        raise ValueError(
            f"data contains {len(std_BMI_stratum)} strata but {len(Nh)} "
            "population stratum sizes are defined"
        )

    # Neyman allocation: nh = n * (Nh * Sh) / sum(Nh * Sh); left fractional
    allocation_weight = Nh * std_BMI_stratum
    nh_neyman = allocation_weight / np.sum(allocation_weight) * n

    # Variance of BMI per stratum (pandas default ddof=1)
    var_BMI_stratum = by_stratum.var().values

    # Standard error of the stratified mean under Neyman allocation
    SE_BMI_neyman = np.sqrt(np.sum((Nh / N) ** 2 * (var_BMI_stratum / nh_neyman)))

    # Design effect relative to an SRS of the same total size.
    # NOTE(review): np.std defaults to ddof=0 while the stratum variances above
    # use ddof=1; kept as-is so previously reported numbers do not change.
    SRS_SE_BMI = np.std(df['BMI']) / np.sqrt(n)
    DEFF_BMI_neyman = (SE_BMI_neyman ** 2) / (SRS_SE_BMI ** 2)

    return {
        'Neyman Allocation': nh_neyman,
        'SE BMI Neyman Allocation': SE_BMI_neyman,
        'Design Effect BMI Neyman Allocation': DEFF_BMI_neyman
    }

# Example usage: run the Neyman-allocation analysis on the already-loaded
# `population` DataFrame and print the result dictionary.
# NOTE(review): `file_path` is assigned here but the call below passes
# `population` instead, so this assignment has no effect on the call.
file_path = 'Sampling I - HW3 data.xlsx'
# Rebinds `results`, replacing the dictionary returned by stratified_estimates.
results = neyman_allocation(population)
print(results)


{'Neyman Allocation': array([ 5.1091279 , 14.55064067, 12.97676182,  4.37404343,  2.98942618]), 'SE BMI Neyman Allocation': 1.154218121641673, 'Design Effect BMI Neyman Allocation': 1.018042127602847}


#Neyman Allocation: array([ 5.1091279 , 14.55064067, 12.97676182,  4.37404343,  2.98942618])
#SE BMI Neyman Allocation: 1.154218121641673
#Design Effect BMI Neyman Allocation: 1.018042127602847

#Q2 : (a) 06h00m
#Q2 : (b) 7.5/10
