In [1]:
import altair as alt
import pandas as pd
import statsmodels as sm
from scipy import stats
import numpy as np

## Fisher Exact Test

In [13]:
def fisher_exact(var_x, var_y, data, alpha=0.05):
    """Run Fisher's exact test on two categorical columns and report the result.

    The observed counts for every (var_x level, var_y level) pair are
    cross-tabulated and passed to ``scipy.stats.fisher_exact``. A one-sentence
    summary is printed, and the test output is returned so callers can reuse it.

    Parameters
    ----------
    var_x : str
        Name of the column whose levels form the rows of the contingency table.
    var_y : str
        Name of the column whose levels form the columns of the contingency table.
    data : pandas.DataFrame
        Frame containing both columns.
    alpha : float, optional
        Significance threshold used for the wording of the printed summary.
        Defaults to 0.05.

    Returns
    -------
    tuple of (float, float)
        The statistic reported by SciPy (the odds ratio for a 2x2 table) and
        the p-value.

    Raises
    ------
    KeyError
        If ``var_x`` or ``var_y`` is not a column of ``data``.
    ValueError
        Propagated from ``scipy.stats.fisher_exact`` when the contingency
        table has a shape it does not support.
    """
    # Levels in order of first appearance, so the table orientation (and hence
    # the direction of the odds ratio) follows the data. Missing values are not
    # treated as a level: they would only contribute an all-zero row/column.
    x_levels = data[var_x].dropna().unique()
    y_levels = data[var_y].dropna().unique()

    # Observed counts; reindexing keeps first-appearance order and fills any
    # level combination that never occurs with 0.
    contingency_table = pd.crosstab(data[var_x], data[var_y]).reindex(
        index=x_levels, columns=y_levels, fill_value=0
    )

    odds, p_val = stats.fisher_exact(contingency_table.to_numpy())

    significance = "an" if p_val < alpha else "no"

    # Three decimals without the leading zero; very small values are reported
    # as a bound instead of ".000".
    if p_val < 0.001:
        p_string = "p < .001"
    else:
        p_string = "p = " + "{0:.3f}".format(p_val).lstrip('0')

    print("The Fisher Exact test found " + significance + " association between " \
            + str(var_x) + " and " + str(var_y) + ", " + p_string)

    return float(odds), float(p_val)

In [12]:
# Columns whose association is tested
var_x = "Training"
var_y = "Dance"

# Read the cats dataset from the remote CSV
data = pd.read_csv(
    "https://homes.cs.washington.edu/~emjun/tea-lang/datasets/catsData.csv"
)

# Run the test on the chosen pair of columns
results = fisher_exact(var_x=var_x, var_y=var_y, data=data)

1.3117092613005282e-06
The Fisher Exact test found an association between Training and Dance, p < .001


### Visualizations

In [35]:
# 2D heat map of the Training x Dance contingency table.
# Colour encodes the number of rows in each cell via Altair's `count()`
# aggregate. The previous 'results:Q' referred to a field of `data`, but
# `results` is a Python variable holding the test output, not a column.
alt.Chart(data).mark_rect().encode(
    x='Training:N',
    y='Dance:N',
    color='count():Q'
)