In [1]:
import pandas as pd
import numpy as np
from statsmodels.stats.contingency_tables import StratifiedTable, Table2x2

# Read data from CSV file
df = pd.read_csv('relativerisk.csv')

In [2]:
from tabulate import tabulate
import pandas as pd

# Ganti nilai 0 dan 1 pada kolom 'married' dan 'healthy' sebelum membuat crosstab
df['married'] = df['married'].replace({0: 'no', 1: 'yes'})
df['healthy'] = df['healthy'].replace({0: 'no', 1: 'yes'})

# Buat crosstab antara married dan healthy
crosstab = pd.crosstab(df['married'], df['healthy'], margins=True)

print("Crosstab between married and healthy:")
print(tabulate(crosstab, headers='keys', tablefmt='grid'))

# Hitung persentase dalam married
crosstab_perc = crosstab.div(crosstab['All'], axis=0) * 100
print("\nPercentages (%):")
print(tabulate(crosstab_perc, headers='keys', tablefmt='grid'))

Crosstab between married and healthy:
+-----------+------+-------+-------+
| married   |   no |   yes |   All |
| no        |  192 |  1104 |  1296 |
+-----------+------+-------+-------+
| yes       |  167 |  1537 |  1704 |
+-----------+------+-------+-------+
| All       |  359 |  2641 |  3000 |
+-----------+------+-------+-------+

Percentages (%):
+-----------+----------+---------+-------+
| married   |       no |     yes |   All |
| no        | 14.8148  | 85.1852 |   100 |
+-----------+----------+---------+-------+
| yes       |  9.80047 | 90.1995 |   100 |
+-----------+----------+---------+-------+
| All       | 11.9667  | 88.0333 |   100 |
+-----------+----------+---------+-------+


In [3]:
import numpy as np
from statsmodels.stats.contingency_tables import Table2x2

# Get counts for 2x2 table using "no" and "yes" labels
unmarried_unhealthy = crosstab.loc["no", "no"]
unmarried_healthy = crosstab.loc["no", "yes"]
married_unhealthy = crosstab.loc["yes", "no"]
married_healthy = crosstab.loc["yes", "yes"]

# Construct contingency table
table = np.array([
    [unmarried_unhealthy, unmarried_healthy],
    [married_unhealthy, married_healthy]
])

# Create 2x2 table object
table2x2 = Table2x2(table)

# Calculate odds ratio and confidence interval
oddsratio = table2x2.oddsratio
oddsratio_confint = table2x2.oddsratio_confint()
print(f"\nOdds Ratio (no/yes): {oddsratio}")
print(f"95% Confidence Interval for Odds Ratio:  lower bound: {oddsratio_confint[0]}")
print(f"\t\t\t\t\t upper bound: {oddsratio_confint[1]}")


Odds Ratio (no/yes): 1.6006248372819578
95% Confidence Interval for Odds Ratio:  lower bound: 1.2828176176769495
					 upper bound: 1.9971661087438233


In [5]:
import math

# Get counts for "healthy = no" cohort
unmarried_total_no = unmarried_unhealthy + unmarried_healthy
married_total_no = married_unhealthy + married_healthy

# Probability of being unhealthy in unmarried and married groups (healthy = no)
prob_unmarried_unhealthy_no = unmarried_unhealthy / unmarried_total_no
prob_married_unhealthy_no = married_unhealthy / married_total_no

# Calculate relative risk for "healthy = no"
relative_risk_no = prob_unmarried_unhealthy_no / prob_married_unhealthy_no
print(f"\nRelative Risk (healthy = no): {relative_risk_no}")

# Standard error and 95% confidence interval for relative risk (healthy = no)
se_relative_risk_no = math.sqrt(
    (1 - prob_unmarried_unhealthy_no) / (unmarried_total_no * prob_unmarried_unhealthy_no) +
    (1 - prob_married_unhealthy_no) / (married_total_no * prob_married_unhealthy_no)
)

ci_lower_no = relative_risk_no * math.exp(-1.96 * se_relative_risk_no)
ci_upper_no = relative_risk_no * math.exp(1.96 * se_relative_risk_no)

print(f"95% Confidence Interval for Relative Risk (healthy = no):") 
print(f"lower bound: {ci_lower_no}")
print(f"upper bound: {ci_upper_no}")

# Get counts for "healthy = yes" cohort
unmarried_total_yes = unmarried_healthy + unmarried_unhealthy
married_total_yes = married_healthy + married_unhealthy

# Probability of being healthy in unmarried and married groups (healthy = yes)
prob_unmarried_healthy_yes = unmarried_healthy / unmarried_total_yes
prob_married_healthy_yes = married_healthy / married_total_yes

# Calculate relative risk for "healthy = yes"
relative_risk_yes = prob_unmarried_healthy_yes / prob_married_healthy_yes
print(f"\nRelative Risk (healthy = yes): {relative_risk_yes}")

# Standard error and 95% confidence interval for relative risk (healthy = yes)
se_relative_risk_yes = math.sqrt(
    (1 - prob_unmarried_healthy_yes) / (unmarried_total_yes * prob_unmarried_healthy_yes) +
    (1 - prob_married_healthy_yes) / (married_total_yes * prob_married_healthy_yes)
)

ci_lower_yes = relative_risk_yes * math.exp(-1.96 * se_relative_risk_yes)
ci_upper_yes = relative_risk_yes * math.exp(1.96 * se_relative_risk_yes)

print(f"95% Confidence Interval for Relative Risk (healthy = yes)")
print(f"lower bound: {ci_lower_no}")
print(f"upper bound: {ci_upper_yes}")



Relative Risk (healthy = no): 1.511643379906853
95% Confidence Interval for Relative Risk (healthy = no):
lower bound: 1.2445729778426209
upper bound: 1.8360238802365885

Relative Risk (healthy = yes): 0.9444082989951565
95% Confidence Interval for Relative Risk (healthy = yes)
lower bound: 1.2445729778426209
upper bound: 0.9708142557095893


In [6]:
import pandas as pd
from tabulate import tabulate

# Define age class mapping
ageclass_mapping = {1.0: '30–40', 2: '40–50', 3: '50–60', 4: '60–70'}

# Define ageclasses based on the unique values in df, then map them
ageclasses = sorted(df['ageclass'].unique())
tables = {}

for age in ageclasses:
    # Filter data by ageclass
    df_age = df[df['ageclass'] == age]
    
    # Create crosstab with counts and calculate percentages
    crosstab_count = pd.crosstab(df_age['married'], df_age['healthy'], margins=True, margins_name="Total")
    crosstab_perc = crosstab_count.div(crosstab_count['Total'], axis=0) * 100
    
    # Combine counts and percentages into a single DataFrame
    combined_df = pd.DataFrame()
    combined_df['No (Count)'] = crosstab_count['no']
    combined_df['Yes (Count)'] = crosstab_count['yes']
    combined_df['Total (Count)'] = crosstab_count['Total']
    combined_df['No (% within married)'] = crosstab_perc['no']
    combined_df['Yes (% within married)'] = crosstab_perc['yes']
    combined_df['Total (%)'] = crosstab_perc['Total']
    
    # Map ageclass name for display
    age_label = ageclass_mapping.get(age, age)
    tables[age_label] = combined_df

# Display each ageclass table with grid format
for age, table in tables.items():
    print(f"\nAgeclass {age}")
    print(tabulate(table, headers='keys', tablefmt='grid'))



Ageclass 30–40
+-----------+--------------+---------------+-----------------+-------------------------+--------------------------+-------------+
| married   |   No (Count) |   Yes (Count) |   Total (Count) |   No (% within married) |   Yes (% within married) |   Total (%) |
| no        |           52 |           138 |             190 |                 27.3684 |                  72.6316 |         100 |
+-----------+--------------+---------------+-----------------+-------------------------+--------------------------+-------------+
| yes       |           53 |           327 |             380 |                 13.9474 |                  86.0526 |         100 |
+-----------+--------------+---------------+-----------------+-------------------------+--------------------------+-------------+
| Total     |          105 |           465 |             570 |                 18.4211 |                  81.5789 |         100 |
+-----------+--------------+---------------+-----------------+------------

In [11]:
import pandas as pd
import numpy as np
import math
from statsmodels.stats.contingency_tables import Table2x2

# Read data from CSV file (assuming 'relativerisk.csv' contains 'ageclass', 'married', 'healthy')
df = pd.read_csv('relativerisk.csv')

# Replace 0 and 1 with "no" and "yes" for readability
df['married'] = df['married'].replace({0: 'no', 1: 'yes'})
df['healthy'] = df['healthy'].replace({0: 'no', 1: 'yes'})

# Define age classes
ageclasses = sorted(df['ageclass'].unique())

# Prepare list of odds ratios, variances, and table counts for each ageclass
odds_ratios = []
variances = []
tables = []

for age in ageclasses:
    df_age = df[df['ageclass'] == age]
    crosstab = pd.crosstab(df_age['married'], df_age['healthy'])
    
    # Calculate odds ratio and variance for each age class
    table = np.array([
        [crosstab.loc['no', 'no'], crosstab.loc['no', 'yes']],
        [crosstab.loc['yes', 'no'], crosstab.loc['yes', 'yes']]
    ])
    tables.append(table)
    
    # Calculate odds ratio and variance for this table
    table2x2 = Table2x2(table)
    or_k = table2x2.oddsratio
    log_or_k = math.log(or_k)
    var_k = 1 / table[0, 0] + 1 / table[0, 1] + 1 / table[1, 0] + 1 / table[1, 1]
    
    odds_ratios.append(log_or_k)
    variances.append(var_k)

# Calculate the pooled (weighted) odds ratio
pooled_or = sum(odds_ratios[i] / variances[i] for i in range(len(odds_ratios))) / sum(1 / variances[i] for i in range(len(variances)))
pooled_or = math.exp(pooled_or)  # Convert back from log scale

# Calculate Breslow-Day test statistic
breslow_day_stat = sum((odds_ratios[i] - math.log(pooled_or)) ** 2 / variances[i] for i in range(len(odds_ratios)))

# Degrees of freedom is the number of strata minus 1
df_breslow_day = len(ageclasses) - 1

print(f"Breslow-Day test statistic: {breslow_day_stat}")
print(f"Degrees of freedom: {df_breslow_day}")


Breslow-Day test statistic: 5.3785690463385345
Degrees of freedom: 3


In [12]:
from scipy.stats import chi2

p_value = 1 - chi2.cdf(breslow_day_stat, df_breslow_day)
print(f"P-value for Breslow-Day test: {p_value}")


P-value for Breslow-Day test: 0.14608464522198905


In [15]:
import pandas as pd
import numpy as np
import math
from scipy.stats import chi2
from statsmodels.stats.contingency_tables import Table2x2

# Read data from CSV file (assuming 'relativerisk.csv' contains 'ageclass', 'married', 'healthy')
df = pd.read_csv('relativerisk.csv')

# Replace 0 and 1 with "no" and "yes" for readability
df['married'] = df['married'].replace({0: 'no', 1: 'yes'})
df['healthy'] = df['healthy'].replace({0: 'no', 1: 'yes'})

# Define age classes
ageclasses = sorted(df['ageclass'].unique())

# Prepare list of odds ratios, variances, and table counts for each ageclass
odds_ratios = []
variances = []
tables = []

for age in ageclasses:
    df_age = df[df['ageclass'] == age]
    crosstab = pd.crosstab(df_age['married'], df_age['healthy'])
    
    # Calculate odds ratio and variance for each age class
    table = np.array([
        [crosstab.loc['no', 'no'], crosstab.loc['no', 'yes']],
        [crosstab.loc['yes', 'no'], crosstab.loc['yes', 'yes']]
    ])
    tables.append(table)
    
    # Calculate odds ratio and variance for this table
    table2x2 = Table2x2(table)
    or_k = table2x2.oddsratio
    log_or_k = math.log(or_k)
    var_k = 1 / table[0, 0] + 1 / table[0, 1] + 1 / table[1, 0] + 1 / table[1, 1]
    
    odds_ratios.append(log_or_k)
    variances.append(var_k)

# Calculate the pooled (weighted) odds ratio
pooled_or = sum(odds_ratios[i] / variances[i] for i in range(len(odds_ratios))) / sum(1 / variances[i] for i in range(len(variances)))
pooled_or = math.exp(pooled_or)  # Convert back from log scale

# Calculate Breslow-Day test statistic
breslow_day_stat = sum((odds_ratios[i] - math.log(pooled_or)) ** 2 / variances[i] for i in range(len(odds_ratios)))

# Degrees of freedom is the number of strata minus 1
df_breslow_day = len(ageclasses) - 1

# Calculate asymptotic significance (2-sided) for Breslow-Day
p_value_breslow_day = 1 - chi2.cdf(breslow_day_stat, df_breslow_day)

# Calculate Tarone's test statistic
# For Tarone's test, variances are assumed to be the same for each stratum, so we use the average variance
avg_variance = sum(variances) / len(variances)
tarone_stat = sum((odds_ratios[i] - math.log(pooled_or)) ** 2 / avg_variance for i in range(len(odds_ratios)))

# Degrees of freedom is the number of strata minus 1
df_tarone_stat = len(ageclasses) - 1

# Calculate p-value for Tarone's test
p_value_tarone = 1 - chi2.cdf(tarone_stat, df_tarone_stat)

print(f"Breslow-Day test statistic: {breslow_day_stat}")
print(f"Degrees of freedom: {df_breslow_day}")
print(f"Asymp. Sig. (2-sided) for Breslow-Day test: {p_value_breslow_day}")

print(f"\nTarone's test statistic: {tarone_stat}")
print(f"Degrees of freedom: {df_tarone_stat}")
print(f"Asymp. Sig. (2-sided) for Tarone's test: {p_value_tarone}")


Breslow-Day test statistic: 5.3785690463385345
Degrees of freedom: 3
Asymp. Sig. (2-sided) for Breslow-Day test: 0.14608464522198905

Tarone's test statistic: 5.551745374432485
Degrees of freedom: 3
Asymp. Sig. (2-sided) for Tarone's test: 0.13557622129353464


In [19]:
import pandas as pd
import numpy as np
from statsmodels.stats.contingency_tables import StratifiedTable

# Read data from CSV file (assuming 'relativerisk.csv' contains 'ageclass', 'married', 'healthy')
df = pd.read_csv('relativerisk.csv')

# Replace 0 and 1 with "no" and "yes" for readability
df['married'] = df['married'].replace({0: 'no', 1: 'yes'})
df['healthy'] = df['healthy'].replace({0: 'no', 1: 'yes'})

# Define age classes
ageclasses = sorted(df['ageclass'].unique())

# Prepare a list of 2x2 tables for each ageclass
tables = []

for age in ageclasses:
    df_age = df[df['ageclass'] == age]
    crosstab = pd.crosstab(df_age['married'], df_age['healthy'])
    
    # Create a 2x2 table and add to the list
    table = np.array([
        [crosstab.loc['no', 'no'], crosstab.loc['no', 'yes']],
        [crosstab.loc['yes', 'no'], crosstab.loc['yes', 'yes']]
    ])
    tables.append(table)

# Use StratifiedTable to create a stratified object from the tables
stratified_table = StratifiedTable(tables)

# Perform the Mantel-Haenszel Chi-Squared Test (also CMH Test)
mantel_haenszel_result = stratified_table.test_null_odds()
mantel_haenszel_stat = mantel_haenszel_result.statistic
mantel_haenszel_p = mantel_haenszel_result.pvalue

# Since this is a 1-degree-of-freedom test, we can report it as such
mantel_haenszel_df = 1

# Output results
print("Cochran-Mantel-Haenszel Chi-Squared Test:")
print(f"CMH Chi-squared: {mantel_haenszel_stat}")
print(f"Degrees of freedom: {mantel_haenszel_df}")
print(f"Asymp. Sig. (2-sided) for CMH Test: {mantel_haenszel_p}")

print("\nMantel-Haenszel Chi-Squared Test:")
print(f"Chi-squared: {mantel_haenszel_stat}")
print(f"Degrees of freedom: {mantel_haenszel_df}")
print(f"Asymp. Sig. (2-sided) for Mantel-Haenszel Test: {mantel_haenszel_p}")


Cochran-Mantel-Haenszel Chi-Squared Test:
CMH Chi-squared: 26.090522527525195
Degrees of freedom: 1
Asymp. Sig. (2-sided) for CMH Test: 3.257791911792651e-07

Mantel-Haenszel Chi-Squared Test:
Chi-squared: 26.090522527525195
Degrees of freedom: 1
Asymp. Sig. (2-sided) for Mantel-Haenszel Test: 3.257791911792651e-07


In [16]:
# Create StratifiedTable
mh_table = StratifiedTable(tables)

# Mantel-Haenszel common odds ratio
mh_or = mh_table.oddsratio_pooled
mh_or_confint = mh_table.oddsratio_pooled_confint()
print(f"\nMantel-Haenszel common Odds Ratio: {mh_or}")
print(f"95% Confidence Interval: {mh_or_confint}")


Mantel-Haenszel common Odds Ratio: 1.780649273613681
95% Confidence Interval: (1.4217473900860704, 2.230151331896575)


In [1]:
import pandas as pd
import numpy as np
import math
from scipy.stats import norm
from statsmodels.stats.contingency_tables import StratifiedTable

# Read data from CSV file (assuming 'relativerisk.csv' contains 'ageclass', 'married', 'healthy')
df = pd.read_csv('relativerisk.csv')

# Replace 0 and 1 with "no" and "yes" for readability
df['married'] = df['married'].replace({0: 'no', 1: 'yes'})
df['healthy'] = df['healthy'].replace({0: 'no', 1: 'yes'})

# Define age classes
ageclasses = sorted(df['ageclass'].unique())

# Prepare a list of 2x2 tables for each ageclass
tables = []

for age in ageclasses:
    df_age = df[df['ageclass'] == age]
    crosstab = pd.crosstab(df_age['married'], df_age['healthy'])
    
    # Create a 2x2 table and add to the list
    table = np.array([
        [crosstab.loc['no', 'no'], crosstab.loc['no', 'yes']],
        [crosstab.loc['yes', 'no'], crosstab.loc['yes', 'yes']]
    ])
    tables.append(table)

# Use StratifiedTable to create a stratified object from the tables
stratified_table = StratifiedTable(tables)

# Calculate the Mantel-Haenszel common odds ratio and its log
common_odds_ratio = stratified_table.oddsratio_pooled
ln_common_odds_ratio = math.log(common_odds_ratio)

# Calculate the standard error of the log odds ratio manually
# Standard error is based on the sum of the reciprocals of each cell in the 2x2 tables
variance_ln_or = sum(1 / (table[0, 0]) + 1 / (table[0, 1]) + 1 / (table[1, 0]) + 1 / (table[1, 1]) for table in tables)
std_error_ln_or = math.sqrt(variance_ln_or)

# Calculate 95% confidence intervals for the common odds ratio
z_score = norm.ppf(0.975)  # for 95% confidence level

# Confidence interval for ln(Common odds ratio)
ci_lower_ln_or = ln_common_odds_ratio - z_score * std_error_ln_or
ci_upper_ln_or = ln_common_odds_ratio + z_score * std_error_ln_or

# Confidence interval for Common odds ratio
ci_lower_or = math.exp(ci_lower_ln_or)
ci_upper_or = math.exp(ci_upper_ln_or)

# Calculate Asymptotic Significance (2-sided) for the log odds ratio
z_value = ln_common_odds_ratio / std_error_ln_or
p_value = 2 * (1 - norm.cdf(abs(z_value)))  # 2-sided p-value

# Print the results
print("Mantel-Haenszel common odds ratio estimate:")
print(f"Estimate (Common Odds Ratio): {common_odds_ratio}")
print(f"ln(Estimate): {ln_common_odds_ratio}")
print(f"Std. Error of ln(Estimate): {std_error_ln_or}")
print(f"Asymp. Sig. (2-sided): {p_value}")
print("\nAsymp. 95% confidence interval:")
print(f"Common odds ratio: Lower bound: {ci_lower_or}, Upper bound: {ci_upper_or}")
print(f"ln(Common odds ratio): Lower bound: {ci_lower_ln_or}, Upper bound: {ci_upper_ln_or}")


Mantel-Haenszel common odds ratio estimate:
Estimate (Common Odds Ratio): 1.780649273613681
ln(Estimate): 0.5769780582521951
Std. Error of ln(Estimate): 0.49737970296370654
Asymp. Sig. (2-sided): 0.2460343971681589

Asymp. 95% confidence interval:
Common odds ratio: Lower bound: 0.6717505275178488, Upper bound: 4.7200734584263975
ln(Common odds ratio): Lower bound: -0.39786824619789973, Upper bound: 1.5518243627022899
