In [65]:
import pandas as pd
import numpy as np
import math

from scipy.stats import chi2
from statsmodels.stats.contingency_tables import StratifiedTable, Table2x2
from tabulate import tabulate
from scipy.stats import norm

# Read data from CSV file
df = pd.read_csv('relativerisk.csv')
df.head()

Unnamed: 0,ageclass,married,healthy
0,4.0,1,0
1,3.0,0,0
2,2.0,1,0
3,1.0,1,0
4,4.0,1,0


In [66]:
# Buat crosstab antara married dan healthy
crosstab = pd.crosstab(df['married'], df['healthy'], margins=True)

print("Crosstab between married and healthy:")
print(tabulate(crosstab, headers='keys', tablefmt='grid'))

# Hitung persentase dalam married
crosstab_perc = crosstab.div(crosstab['All'], axis=0) * 100
print("\nPercentages (%):")
print(tabulate(crosstab_perc, headers='keys', tablefmt='grid'))

Crosstab between married and healthy:
+-----------+-----+------+-------+
| married   |   0 |    1 |   All |
| 0         | 192 | 1104 |  1296 |
+-----------+-----+------+-------+
| 1         | 167 | 1537 |  1704 |
+-----------+-----+------+-------+
| All       | 359 | 2641 |  3000 |
+-----------+-----+------+-------+

Percentages (%):
+-----------+----------+---------+-------+
| married   |        0 |       1 |   All |
| 0         | 14.8148  | 85.1852 |   100 |
+-----------+----------+---------+-------+
| 1         |  9.80047 | 90.1995 |   100 |
+-----------+----------+---------+-------+
| All       | 11.9667  | 88.0333 |   100 |
+-----------+----------+---------+-------+


In [67]:
# Get counts for 2x2 table using "unmarried"/"married" and "unhealthy"/"healthy" labels
unmarried_unhealthy = crosstab.loc["unmarried", "unhealthy"]
unmarried_healthy = crosstab.loc["unmarried", "healthy"]
married_unhealthy = crosstab.loc["married", "unhealthy"]
married_healthy = crosstab.loc["married", "healthy"]

# Construct contingency table
table = np.array([
    [unmarried_unhealthy, unmarried_healthy],
    [married_unhealthy, married_healthy]
])

# Create 2x2 table object
table2x2 = Table2x2(table)

# Calculate odds ratio and confidence interval
oddsratio = table2x2.oddsratio
oddsratio_confint = table2x2.oddsratio_confint()
print(f"\nOdds Ratio (no/yes): {oddsratio}")
print(f"95% Confidence Interval for Odds Ratio:")  
print(f"lower bound: {oddsratio_confint[0]}")
print(f"upper bound: {oddsratio_confint[1]}")

KeyError: 'unhealthy'

In [None]:
unmarried_total = unmarried_unhealthy + unmarried_healthy
married_total = married_unhealthy + married_healthy

# Probability of being unhealthy in unmarried and married groups
prob_unmarried_unhealthy = unmarried_unhealthy / unmarried_total
prob_married_unhealthy = married_unhealthy / married_total

# Calculate relative risk for "unhealthy"
relative_risk_unhealthy = prob_unmarried_unhealthy / prob_married_unhealthy
print(f"\nRelative Risk (unhealthy): {relative_risk_unhealthy}")

# Standard error and 95% confidence interval for relative risk (unhealthy)
se_relative_risk_unhealthy = math.sqrt(
    (1 - prob_unmarried_unhealthy) / (unmarried_total * prob_unmarried_unhealthy) +
    (1 - prob_married_unhealthy) / (married_total * prob_married_unhealthy)
)

ci_lower_unhealthy = relative_risk_unhealthy * math.exp(-1.96 * se_relative_risk_unhealthy)
ci_upper_unhealthy = relative_risk_unhealthy * math.exp(1.96 * se_relative_risk_unhealthy)

print(f"95% Confidence Interval for Relative Risk (healthy = no):") 
print(f"lower bound: {ci_lower_no}")
print(f"upper bound: {ci_upper_no}")


Relative Risk (unhealthy): 1.511643379906853
95% Confidence Interval for Relative Risk (healthy = no):
lower bound: 1.2445729778426209
upper bound: 1.8360238802365885


In [None]:
# Probability of being healthy in unmarried and married groups 
prob_unmarried_healthy = unmarried_healthy / unmarried_total
prob_married_healthy = married_healthy / married_total

# Calculate relative risk for "healthy = healthy"
relative_risk_healthy = prob_unmarried_healthy / prob_married_healthy
print(f"\nRelative Risk (healthy): {relative_risk_healthy}")

# Standard error and 95% confidence interval for relative risk (healthy)
se_relative_risk_healthy = math.sqrt(
    (1 - prob_unmarried_healthy) / (unmarried_total * prob_unmarried_healthy) +
    (1 - prob_married_healthy) / (married_total * prob_married_healthy)
)

ci_lower_healthy = relative_risk_healthy * math.exp(-1.96 * se_relative_risk_healthy)
ci_upper_healthy = relative_risk_healthy * math.exp(1.96 * se_relative_risk_healthy)

print(f"95% Confidence Interval for Relative Risk (healthy = yes):")
print(f"lower bound: {ci_lower_no}")
print(f"upper bound: {ci_upper_yes}")


Relative Risk (healthy): 0.9444082989951565
95% Confidence Interval for Relative Risk (healthy = yes):
lower bound: 1.2445729778426209
upper bound: 0.9708142557095893


In [None]:
# Define age class mapping
ageclass_mapping = {1.0: '30-40', 2: '40-50', 3: '50-60', 4: '60-70'}

# Define ageclasses based on the unique values in df, then map them
ageclasses = sorted(df['ageclass'].unique())
tables = {}

for age in ageclasses:
    # Filter data by ageclass
    df_age = df[df['ageclass'] == age]
    
    # Create crosstab with counts and calculate percentages
    crosstab_count = pd.crosstab(df_age['married'], df_age['healthy'], margins=True, margins_name="Total")
    crosstab_perc = crosstab_count.div(crosstab_count['Total'], axis=0) * 100
    
# Combine counts and percentages into a single DataFrame
    combined_df = pd.DataFrame()
    combined_df['Unhealthy (Count)'] = crosstab_count['unhealthy']
    combined_df['Healthy (Count)'] = crosstab_count['healthy']
    combined_df['Total (Count)'] = crosstab_count['Total']
    combined_df['Unhealthy (% within married)'] = crosstab_perc['unhealthy']
    combined_df['Healthy (% within married)'] = crosstab_perc['healthy']
    combined_df['Total (%)'] = crosstab_perc['Total']
    
    # Map ageclass name for display
    age_label = ageclass_mapping.get(age, age)
    tables[age_label] = combined_df

# Display each ageclass table with grid format
for age, table in tables.items():
    print(f"\nAgeclass {age}")
    print(tabulate(table, headers='keys', tablefmt='grid'))



Ageclass 30-40
+-----------+---------------------+-------------------+-----------------+--------------------------------+------------------------------+-------------+
| married   |   Unhealthy (Count) |   Healthy (Count) |   Total (Count) |   Unhealthy (% within married) |   Healthy (% within married) |   Total (%) |
| married   |                  53 |               327 |             380 |                        13.9474 |                      86.0526 |         100 |
+-----------+---------------------+-------------------+-----------------+--------------------------------+------------------------------+-------------+
| unmarried |                  52 |               138 |             190 |                        27.3684 |                      72.6316 |         100 |
+-----------+---------------------+-------------------+-----------------+--------------------------------+------------------------------+-------------+
| Total     |                 105 |               465 |             570 

In [None]:
# Adding 95% confidence intervals for relative risk estimates (healthy=no and healthy=yes)
z_score = norm.ppf(0.975)  # for 95% confidence level

# Extended results storage
results_with_ci = []

for age in ageclasses:
    # Filter data by age class
    df_age = df[df['ageclass'] == age]
    age_label = ageclass_mapping.get(age, age)
    
    # Create crosstab
    crosstab = pd.crosstab(df_age['married'], df_age['healthy'])
    
    # Ensure the table has the required structure for 2x2 table calculations
    if 'unmarried' in crosstab.index and 'married' in crosstab.index and 'unhealthy' in crosstab.columns and 'healthy' in crosstab.columns:
        table = np.array([
            [crosstab.loc['unmarried', 'unhealthy'], crosstab.loc['unmarried', 'healthy']],
            [crosstab.loc['married', 'unhealthy'], crosstab.loc['married', 'healthy']]
        ])

        # Odds Ratio calculation
        table2x2 = Table2x2(table)
        odds_ratio = table2x2.oddsratio
        or_confint = table2x2.oddsratio_confint()

        # Probabilities for unhealthy and healthy
        total_unmarried = table[0].sum()
        total_married = table[1].sum()

        prob_unmarried_unhealthy = table[0, 0] / total_unmarried if total_unmarried > 0 else np.nan
        prob_married_unhealthy = table[1, 0] / total_married if total_married > 0 else np.nan

        prob_unmarried_healthy = table[0, 1] / total_unmarried if total_unmarried > 0 else np.nan
        prob_married_healthy = table[1, 1] / total_married if total_married > 0 else np.nan

        # Calculate relative risks
        relative_risk_unhealthy = prob_unmarried_unhealthy / prob_married_unhealthy if prob_married_unhealthy > 0 else np.nan
        relative_risk_healthy = prob_unmarried_healthy / prob_married_healthy if prob_married_healthy > 0 else np.nan

        # Standard error and 95% CI for relative risk (healthy=no)
        if prob_unmarried_unhealthy > 0 and prob_married_unhealthy > 0:
            se_relative_risk_unhealthy = math.sqrt(
                (1 - prob_unmarried_unhealthy) / (total_unmarried * prob_unmarried_unhealthy) +
                (1 - prob_married_unhealthy) / (total_married * prob_married_unhealthy)
            )
            ci_lower_unhealthy = relative_risk_unhealthy * math.exp(-z_score * se_relative_risk_unhealthy)
            ci_upper_unhealthy = relative_risk_unhealthy * math.exp(z_score * se_relative_risk_unhealthy)
        else:
            ci_lower_unhealthy = ci_upper_unhealthy = np.nan

        # Standard error and 95% CI for relative risk (healthy=yes)
        if prob_unmarried_healthy > 0 and prob_married_healthy > 0:
            se_relative_risk_healthy = math.sqrt(
                (1 - prob_unmarried_healthy) / (total_unmarried * prob_unmarried_healthy) +
                (1 - prob_married_healthy) / (total_married * prob_married_healthy)
            )
            ci_lower_healthy = relative_risk_healthy * math.exp(-z_score * se_relative_risk_healthy)
            ci_upper_healthy = relative_risk_healthy * math.exp(z_score * se_relative_risk_healthy)
        else:
            ci_lower_healthy = ci_upper_healthy = np.nan

        # Valid case count
        valid_cases = df_age.shape[0]
        
        # Collect results with confidence intervals
        results_with_ci.append({
            'Ageclass': age_label,
            'Odds Ratio for married (no/yes)': odds_ratio,
            'Odds Ratio CI Lower': or_confint[0],
            'Odds Ratio CI Upper': or_confint[1],
            'For cohort healthy=no': relative_risk_unhealthy,
            '95% CI Lower (healthy=no)': ci_lower_unhealthy,
            '95% CI Upper (healthy=no)': ci_upper_unhealthy,
            'For cohort healthy=yes': relative_risk_healthy,
            '95% CI Lower (healthy=yes)': ci_lower_healthy,
            '95% CI Upper (healthy=yes)': ci_upper_healthy,
            'N of valid cases': valid_cases
        })

# Display updated results
results_with_ci_df = pd.DataFrame(results_with_ci)
results_with_ci_df

Unnamed: 0,Ageclass,Odds Ratio for married (no/yes),Odds Ratio CI Lower,Odds Ratio CI Upper,For cohort healthy=no,95% CI Lower (healthy=no),95% CI Upper (healthy=no),For cohort healthy=yes,95% CI Lower (healthy=yes),95% CI Upper (healthy=yes),N of valid cases
0,30-40,2.324856,1.510517,3.578217,1.962264,1.395805,2.758609,0.844037,0.766614,0.929279,570
1,40-50,1.734947,1.20909,2.489511,1.614493,1.18042,2.208187,0.930572,0.885666,0.977754,1081
2,50-60,2.351771,1.253768,4.411361,2.186489,1.227015,3.89623,0.92972,0.879486,0.982823,533
3,60-70,1.145278,0.702901,1.866071,1.131579,0.724928,1.766343,0.988038,0.946473,1.031429,816


In [None]:
# Prepare list of odds ratios, variances, and table counts for each ageclass
odds_ratios = []
variances = []
tables = []

for age in ageclasses:
    df_age = df[df['ageclass'] == age]
    crosstab = pd.crosstab(df_age['married'], df_age['healthy'])
    
    # Calculate odds ratio and variance for each age class
    table = np.array([
        [crosstab.loc['unmarried', 'unhealthy'], crosstab.loc['unmarried', 'healthy']],
        [crosstab.loc['married', 'unhealthy'], crosstab.loc['married', 'healthy']]
    ])
    tables.append(table)
    
    # Calculate odds ratio and variance for this table
    table2x2 = Table2x2(table)
    or_k = table2x2.oddsratio
    log_or_k = math.log(or_k)
    var_k = 1 / table[0, 0] + 1 / table[0, 1] + 1 / table[1, 0] + 1 / table[1, 1]
    
    odds_ratios.append(log_or_k)
    variances.append(var_k)

# Calculate the pooled (weighted) odds ratio
pooled_or = sum(odds_ratios[i] / variances[i] for i in range(len(odds_ratios))) / sum(1 / variances[i] for i in range(len(variances)))
pooled_or = math.exp(pooled_or)  # Convert back from log scale

# Calculate Breslow-Day test statistic
breslow_day_stat = sum((odds_ratios[i] - math.log(pooled_or)) ** 2 / variances[i] for i in range(len(odds_ratios)))

# Degrees of freedom is the number of strata minus 1
df_breslow_day = len(ageclasses) - 1

print(f"Breslow-Day test statistic: {breslow_day_stat}")
print(f"Degrees of freedom: {df_breslow_day}")

p_value = 1 - chi2.cdf(breslow_day_stat, df_breslow_day)
print(f"P-value for Breslow-Day test: {p_value}")

Breslow-Day test statistic: 5.3785690463385345
Degrees of freedom: 3
P-value for Breslow-Day test: 0.14608464522198905


In [None]:
# Calculate Tarone's test statistic
avg_variance = sum(variances) / len(variances)
tarone_stat = sum((odds_ratios[i] - math.log(pooled_or)) ** 2 / avg_variance for i in range(len(odds_ratios)))

# Degrees of freedom is the number of strata minus 1
df_tarone_stat = len(ageclasses) - 1

# Calculate p-value for Tarone's test
p_value_tarone = 1 - chi2.cdf(tarone_stat, df_tarone_stat)

print(f"\nTarone's test statistic: {tarone_stat}")
print(f"Degrees of freedom: {df_tarone_stat}")
print(f"Asymp. Sig. (2-sided) for Tarone's test: {p_value_tarone}")



Tarone's test statistic: 5.551745374432485
Degrees of freedom: 3
Asymp. Sig. (2-sided) for Tarone's test: 0.13557622129353464


In [68]:
# Use StratifiedTable to create a stratified object from the tables
stratified_table = StratifiedTable(tables)

# Perform the Mantel-Haenszel Chi-Squared Test (also CMH Test)
mantel_haenszel_result = stratified_table.test_null_odds()
mantel_haenszel_stat = mantel_haenszel_result.statistic
mantel_haenszel_p = mantel_haenszel_result.pvalue
mantel_haenszel_df = 1

print("\nCochran-Mantel-Haenszel Chi-Squared Test:")
print(f"CMH Chi-squared: {mantel_haenszel_stat}")
print(f"Degrees of freedom: {mantel_haenszel_df}")
print(f"Asymp. Sig. (2-sided) for CMH Test: {mantel_haenszel_p}")

# Mantel-Haenszel common odds ratio
mh_or = stratified_table.oddsratio_pooled
mh_or_confint = stratified_table.oddsratio_pooled_confint()
print(f"\nMantel-Haenszel common Odds Ratio: {mh_or}")
print(f"95% Confidence Interval: {mh_or_confint}")



Cochran-Mantel-Haenszel Chi-Squared Test:
CMH Chi-squared: 26.090522527525195
Degrees of freedom: 1
Asymp. Sig. (2-sided) for CMH Test: 3.257791911792651e-07

Mantel-Haenszel common Odds Ratio: 1.780649273613681
95% Confidence Interval: (1.4217473900860704, 2.230151331896575)


In [None]:
# Calculate the Mantel-Haenszel common odds ratio and its log
common_odds_ratio = stratified_table.oddsratio_pooled
ln_common_odds_ratio = math.log(common_odds_ratio)

# Calculate the standard error of the log odds ratio manually
variance_ln_or = sum(1 / (table[0, 0]) + 1 / (table[0, 1]) + 1 / (table[1, 0]) + 1 / (table[1, 1]) for table in tables)
std_error_ln_or = math.sqrt(variance_ln_or)

# 95% confidence intervals for the common odds ratio
z_score = norm.ppf(0.975)  # for 95% confidence level
ci_lower_ln_or = ln_common_odds_ratio - z_score * std_error_ln_or
ci_upper_ln_or = ln_common_odds_ratio + z_score * std_error_ln_or
ci_lower_or = math.exp(ci_lower_ln_or)
ci_upper_or = math.exp(ci_upper_ln_or)

# Asymptotic Significance (2-sided) for the log odds ratio
z_value = ln_common_odds_ratio / std_error_ln_or
p_value = 2 * (1 - norm.cdf(abs(z_value)))

print("Mantel-Haenszel common odds ratio estimate:")
print(f"Estimate (Common Odds Ratio): {common_odds_ratio}")
print(f"ln(Estimate): {ln_common_odds_ratio}")
print(f"Std. Error of ln(Estimate): {std_error_ln_or}")
print(f"Asymp. Sig. (2-sided): {p_value}")
print("\nAsymp. 95% confidence interval:")
print(f"Common odds ratio: Lower bound: {ci_lower_or}, Upper bound: {ci_upper_or}")
print(f"ln(Common odds ratio): Lower bound: {ci_lower_ln_or}, Upper bound: {ci_upper_ln_or}")

Mantel-Haenszel common odds ratio estimate:
Estimate (Common Odds Ratio): 1.780649273613681
ln(Estimate): 0.5769780582521951
Std. Error of ln(Estimate): 0.49737970296370654
Asymp. Sig. (2-sided): 0.2460343971681589

Asymp. 95% confidence interval:
Common odds ratio: Lower bound: 0.6717505275178488, Upper bound: 4.7200734584263975
ln(Common odds ratio): Lower bound: -0.39786824619789973, Upper bound: 1.5518243627022899
