<a href="https://colab.research.google.com/github/tomheston/A-Comparison-of-the-Relative-Risk-Index-with-Unit-Fragilty/blob/main/UFI_vs_RRI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#
# Thomas F. Heston
# GNU GPL v3.0
# I am an academic, please give me a citation if you use this. Thank you!
#
# Importing essential libraries for data handling and statistical analysis
import pandas as pd
from scipy.stats import fisher_exact, pearsonr
from scipy.stats import chi2_contingency
import numpy as np

# set variables
lowran = 15  # inclusive lower bound for every cell of a generated 2x2 table
highestran = 251  # exclusive upper bound for the per-table cell cap drawn in the generator
toppv = 0.05  # highest p-value (tables must be below this to be kept)
lowpv = 0.000999  # lowest p-value (tables must be above this to be kept)
samplesize = 15  # default number of accepted 2x2 tables to generate


def _fragility_shift(ao, bo, co, do, alpha):
    """Shift a 2x2 table one unit at a time until its chi-square p-value reaches alpha.

    The shift keeps the marginal totals fixed: if the smallest cell lies on
    the a/d diagonal, a and d are increased and b and c decreased by the same
    amount; otherwise the direction is reversed.

    Returns:
        (a1, b1, c1, d1, pv1, ufi) for the first shift size ``ufi`` whose
        p-value is no longer below ``alpha``, or ``None`` when a cell would
        reach zero (or go negative) before that happens.
    """
    smallest = min(ao, bo, co, do)
    # On a tie between the diagonals the a/d diagonal takes precedence.
    direction = 1 if smallest in (ao, do) else -1
    ufi = 1
    while True:
        step = direction * ufi
        a1, b1, c1, d1 = ao + step, bo - step, co - step, do + step
        if min(a1, b1, c1, d1) <= 0:
            return None
        _, pv1, _, _ = chi2_contingency([[a1, b1], [c1, d1]])
        if pv1 >= alpha:
            return a1, b1, c1, d1, pv1, ufi
        ufi += 1


def _summary_fields(series, value_fmt, range_fmt):
    """Return formatted mean, std, CV (%), min and max of a column, interleaved
    with the separator strings used by the summary printout."""
    mean = series.mean()
    std = series.std()
    cv = (std / mean) * 100  # coefficient of variation, in percent
    return (
        format(mean, value_fmt), ",", format(std, value_fmt), ",",
        format(cv, ".1f"), "% (",
        format(series.min(), range_fmt), ",", format(series.max(), range_fmt), ")",
    )


def generate_data_and_pvalues(rows=samplesize):
    """Generate random 2x2 tables and compare fragility and relative-risk indices.

    Random tables are drawn until ``rows`` of them have a chi-square p-value
    strictly between the module-level ``lowpv`` and ``toppv`` and can be
    shifted to a p-value of at least ``toppv`` without any cell reaching zero.
    For each kept table the unit fragility index (``ufi``), fragility quotient
    (``FQ``), ``RRI`` and ``pRRI`` are computed. Pearson correlations of those
    measures with the p-value and with the table size, plus summary
    statistics, are printed to stdout.

    Args:
        rows: Number of accepted tables to generate (default ``samplesize``).
            Must be at least 2, since correlations need two observations.

    Returns:
        pandas.DataFrame with one row per accepted table.

    Raises:
        ValueError: If ``rows`` is less than 2.
    """
    if rows < 2:
        raise ValueError("rows must be at least 2 to compute correlations")

    data = []
    while len(data) < rows:
        # Each table gets its own random cap, so table sizes vary widely.
        highran = np.random.randint(45, highestran)
        ao, bo, co, do = [int(x) for x in np.random.randint(lowran, highran, 4)]

        # we will use chisquare to calculate p-values
        _, pv, _, _ = chi2_contingency([[ao, bo], [co, do]])

        # UFI is calculated only for a limited p-value range
        if not lowpv < pv < toppv:
            continue

        shifted = _fragility_shift(ao, bo, co, do, toppv)
        if shifted is None:
            # A cell hit zero before the p-value reached toppv: discard the table.
            continue
        a1, b1, c1, d1, pv1, ufi = shifted

        # RRI calculations
        total = ao + bo + co + do
        ppv1 = ao / (ao + bo)
        ppv2 = co / (co + do)
        FQ = ufi / total
        # exact RR index: adjust all cells to keep marginal totals fixed
        RRI = abs((bo * co - ao * do) / total)
        # average percent change in ao, bo, co and do to get them to a RR of 1
        pRRI = RRI / total
        data.append([ao, bo, co, do, pv, a1, b1, c1, d1, pv1, ufi, FQ,
                     ao, bo, co, do, ppv1, ppv2, RRI, pRRI])

    columns = ['ao', 'bo', 'co', 'do', 'pv', 'a1', 'b1', 'c1', 'd1', 'pv1', 'ufi', 'FQ',
               'ao2', 'bo2', 'co2', 'do2', 'ppv1', 'ppv2', 'RRI', 'pRRI']
    df = pd.DataFrame(data, columns=columns)
    df['size'] = df['ao'] + df['bo'] + df['co'] + df['do']
    df['pv'] = df['pv'].round(7)
    df['pv1'] = df['pv1'].round(7)

    # (label, measure column, reference column): first against the p-value,
    # then against the sample size.
    correlation_specs = [
        ("Correlation between UFI and pv:", 'ufi', 'pv'),
        ("Correlation between FQ and pv:", 'FQ', 'pv'),
        ("Correlation between RRI (exact RR index) and pv:", 'RRI', 'pv'),
        ("Correlation between pRRI (percent RR index) and pv:", 'pRRI', 'pv'),
        ("Correlation between UFI and sample size:", 'ufi', 'size'),
        ("Correlation between FQ and sample size:", 'FQ', 'size'),
        ("Correlation between RRI and sample size:", 'RRI', 'size'),
        ("Correlation between pRRI and sample size:", 'pRRI', 'size'),
    ]
    # Compute everything before printing so a failure leaves no partial report.
    correlations = []
    for label, measure, reference in correlation_specs:
        coefficient, p_value = [round(val, 5) for val in pearsonr(df[measure], df[reference])]
        correlations.append((label, coefficient, p_value))

    # (label, column, format for mean/std, format for min/max)
    summary_specs = [
        ("Average PV, STD, CV (minimum, maximum):", 'pv', ".5f", ".5f"),
        ("Average Sample Size of Each Case, STD, CV (minimum, maximum):", 'size', ".1f", ".1f"),
        ("Average UFI, STD, CV (minimum, maximum):", 'ufi', ".3f", ".1f"),
        ("Average FQ, STD, CV (minimum, maximum):", 'FQ', ".5f", ".5f"),
        ("Average RRI, STD, CV (minimum, maximum):", 'RRI', ".2f", ".2f"),
        ("Average pRRI, STD, CV (minimum, maximum):", 'pRRI', ".5f", ".5f"),
    ]
    summaries = [
        (label, _summary_fields(df[column], value_fmt, range_fmt))
        for label, column, value_fmt, range_fmt in summary_specs
    ]

    # Print out base settings
    print("BASE SETTINGS")
    # Report the number of tables actually generated, not the global default.
    print("Total Cases Tested:", len(df))
    print("Lowest number in 2x2 table:", lowran)
    print("Highest number in 2x2 table:", highestran)
    print("Lowest p-value:", lowpv)
    print("Highest p-value:", toppv)
    print()
    # Print out correlation results with p-values
    print("CORRELATIONS")
    for label, coefficient, p_value in correlations:
        print(label, coefficient, "p-value:", p_value)
    print()
    # Print out averages
    print("AVERAGES, STANDARD DEVIATIONS,COEFFICIENTS OF VARIATION AND RANGE")
    for label, fields in summaries:
        print(label, *fields)
    print()
    return df

# Generate the DataFrame
df = generate_data_and_pvalues()

# Save the DataFrame as CSV in the current working directory
df.to_csv('2x2_tables.csv', index=False)

# Offer the CSV as a browser download when running inside Google Colab.
# Outside Colab the google.colab package is not installed, so the import is
# guarded and the file is simply left on disk instead of crashing the script.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; '2x2_tables.csv' was saved locally.")
else:
    files.download('2x2_tables.csv')
#



BASE SETTINGS
Total Cases Tested: 15
Lowest number in 2x2 table: 15
Highest number in 2x2 table: 251
Lowest p-value: 0.000999
Highest p-value: 0.05

CORRELATIONS
Correlation between UFI and pv: -0.84341 p-value: 8e-05
Correlation between FQ and pv: -0.79092 p-value: 0.00045
Correlation between RRI (exact RR index) and pv: -0.3548 p-value: 0.19441
Correlation between pRRI (percent RR index) and pv: -0.22459 p-value: 0.42096
Correlation between UFI and sample size: 0.26227 p-value: 0.34501
Correlation between FQ and sample size: -0.39167 p-value: 0.14881
Correlation between RRI and sample size: 0.76182 p-value: 0.00096
Correlation between pRRI and sample size: -0.78203 p-value: 0.00057

AVERAGES, STANDARD DEVIATIONS,COEFFICIENTS OF VARIATION AND RANGE
Average PV, STD, CV (minimum, maximum): 0.02288 , 0.01426 , 62.3 % ( 0.00547 , 0.04836 )
Average Sample Size of Each Case, STD, CV (minimum, maximum): 353.7 , 130.4 , 36.9 % ( 126.0 , 637.0 )
Average UFI, STD, CV (minimum, maximum): 2.200 ,

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>