In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import sys

sys.path.insert(0, str(Path.cwd().parent))
from src.utils import *

import os

In [2]:
# Name of the directory reserved for generated artefacts.
OUTPUT_PATH = 'output'

# Directory (relative to the notebook) that holds the prepared input tables.
DATA_PATH = '../data/final'

# Full path of the cardiovascular CSV that the notebook reads.
cvd_data_path = f'{DATA_PATH}/gbd_cardiovascular_allAges_final.csv'

In [3]:
def p_value_test(df, disease = 'Cardiovascular diseases', metric = 'Rate', measure = 'Incidence',
                 iterations = 1_000, base_seed = 123, set_to_means = False, country = 'Germany'):
    """Two-sided permutation test of one location's mean ``val`` against all others.

    Rows whose ``location_name`` is ``'Global'`` or ``'High-income'`` are
    dropped, the remainder is restricted to the requested ``measure`` and
    ``metric``, and the difference between the mean ``val`` of ``country`` and
    the mean ``val`` of every other location is used as the test statistic.
    The pooled values are then reshuffled ``iterations`` times; the p-value is
    the fraction of reshuffles whose absolute mean difference is at least as
    large as the observed one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location_name``, ``measure_name``,
        ``metric_name`` and ``val``. It is not modified.
    disease : str
        Label used only in the printed message.
    metric, measure : str
        Values of ``metric_name`` / ``measure_name`` to keep.
    iterations : int
        Number of random reshuffles (must be >= 1).
    base_seed : int
        Reshuffle ``i`` is seeded with ``base_seed + i``, so the result is
        reproducible.
    set_to_means : bool
        If true, every location is first collapsed to the mean of its ``val``
        column, so each location contributes a single observation.
    country : str
        Location tested against the rest.

    Returns
    -------
    float
        The permutation p-value (also printed).

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1, or if either the ``country`` group
        or the comparison group is empty after filtering. Without this check
        the statistic would be NaN and the p-value would silently come out as
        0.0.
    """
    if iterations < 1:
        raise ValueError(f"iterations must be at least 1, got {iterations}")

    # One combined mask: drop the two aggregate locations and keep only the
    # requested measure/metric. Boolean indexing returns a new frame, so the
    # caller's DataFrame is never touched.
    keep = (
        ~df['location_name'].isin(['Global', 'High-income'])
        & (df['measure_name'] == measure)
        & (df['metric_name'] == metric)
    )
    gbd_test = df[keep]
    if set_to_means:
        gbd_test = gbd_test.groupby('location_name')['val'].mean().reset_index()

    is_target = gbd_test['location_name'] == country
    target_vals = gbd_test.loc[is_target, 'val']
    other_vals = gbd_test.loc[~is_target, 'val']
    if target_vals.empty or other_vals.empty:
        raise ValueError(
            f"Need at least one row for {country!r} and one for the other locations "
            f"(got {len(target_vals)} and {len(other_vals)}) after filtering on "
            f"measure={measure!r}, metric={metric!r}"
        )

    # Test statistic: difference of group means.
    observed_mean_difference = target_vals.mean() - other_vals.mean()

    # Pool the values, target group first. Only the 'val' column takes part in
    # the test, so only that Series is shuffled.
    pooled_vals = pd.concat([target_vals, other_vals])
    n_target = len(target_vals)

    permuted_mean_differences = np.zeros(iterations)
    for i in range(iterations):
        # A distinct, deterministic seed per reshuffle.
        shuffled = pooled_vals.sample(frac=1, random_state=base_seed + i)
        # The first n_target shuffled values play the role of the target group.
        permuted_mean_differences[i] = shuffled.iloc[:n_target].mean() - shuffled.iloc[n_target:].mean()

    # Two-sided p-value: share of reshuffles at least as extreme as observed.
    p_value = (np.abs(permuted_mean_differences) >= np.abs(observed_mean_difference)).mean()

    print(f"Permutation Test p-value for {disease}: {p_value}")
    return float(p_value)

In [4]:
# Read the cardiovascular table and keep only the rows whose metric is 'Rate'
# and whose measure is 'Incidence'.
df = (
    pd.read_csv(cvd_data_path)
    .loc[lambda frame: (frame['metric_name'] == 'Rate') & (frame['measure_name'] == 'Incidence')]
)
df

Unnamed: 0,measure_name,location_name,metric_name,year,val,Country Code
12361,Incidence,Democratic People's Republic of Korea,Rate,1990,505.746115,PRK
12363,Incidence,Democratic People's Republic of Korea,Rate,1991,516.447635,PRK
12365,Incidence,Democratic People's Republic of Korea,Rate,1992,526.806601,PRK
12367,Incidence,Democratic People's Republic of Korea,Rate,1993,536.496942,PRK
12369,Incidence,Democratic People's Republic of Korea,Rate,1994,546.016220,PRK
...,...,...,...,...,...,...
24711,Incidence,Sierra Leone,Rate,2015,355.473682,SLE
24713,Incidence,Sierra Leone,Rate,2016,355.201957,SLE
24715,Incidence,Sierra Leone,Rate,2017,355.453785,SLE
24717,Incidence,Sierra Leone,Rate,2018,357.173519,SLE


In [6]:
# Run the permutation test on one mean value per location (set_to_means=True),
# using one million seeded reshuffles.
p_value_test(df, set_to_means=True, iterations=1_000_000)

Permutation Test p-value for Cardiovascular diseases: 0.045364
