In [1]:
import altair as alt
import pandas as pd
import statsmodels as sm
from scipy import stats
import numpy as np

## Point Biserial

In [3]:
def _format_stat(value, digits=2):
    """Format a statistic with fixed decimals, no leading zero, and an explicit minus sign.

    Examples: 0.384 -> ".38", -0.384 -> "-.38", 0.0031 (digits=3) -> ".003".
    """
    text = "{0:.{1}f}".format(abs(value), digits).lstrip('0')
    return "-" + text if value < 0 else text


def point_biserial(var_x, var_y, dataset=None, alpha=0.05):
    """Run a point-biserial correlation between two columns and print a one-line report.

    The report contains r with its degrees of freedom (n - 2), a confidence
    interval obtained through the Fisher z-transformation, and the p-value.

    Parameters
    ----------
    var_x, var_y : hashable
        Keys used to look the two variables up in ``dataset``.
    dataset : mapping or DataFrame, optional
        Object indexed as ``dataset[var_x]`` / ``dataset[var_y]``. When omitted,
        the module-level ``data`` object is used, which keeps existing
        two-argument calls working.
    alpha : float, default 0.05
        Significance level. It drives both the "did" / "did not" wording and
        the (1 - alpha) confidence interval.

    Returns
    -------
    None
        The report is written to stdout.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly between 0 and 1.
    NameError
        If ``dataset`` is omitted and no global ``data`` has been defined yet.
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if dataset is None:
        # Backward-compatible fallback to the notebook-level frame; passing the
        # frame explicitly avoids depending on cell execution order.
        dataset = data

    x = dataset[var_x]
    y = dataset[var_y]
    n_obs = len(x)
    dof = n_obs - 2  # degrees of freedom reported alongside r
    r_val, p_val = stats.pointbiserialr(x, y)

    significance = "did" if p_val < alpha else "did not"

    # Confidence interval: transform r to z-space, step out by the normal
    # critical value times 1/sqrt(n - 3), then transform the bounds back.
    r_z = np.arctanh(r_val)
    std_err = 1 / np.sqrt(n_obs - 3)
    z_crit = stats.norm.ppf(1 - alpha / 2)
    ci_low, ci_high = np.tanh((r_z - z_crit * std_err, r_z + z_crit * std_err))

    # Very small p-values are reported as an inequality instead of ".000".
    if p_val < 0.001:
        p_string = "p < .001"
    else:
        p_string = "p = " + _format_stat(p_val, 3)

    r_string = "r(" + str(dof) + ") = " + _format_stat(r_val)
    ci_string = "[" + _format_stat(ci_low) + ", " + _format_stat(ci_high) + "]"

    print("The Point Biserial correlation " + significance + " detect a significant correlation between " \
            + str(var_x) + " and " + str(var_y) + ", " + r_string +  ", " \
            + ci_string + ", " + p_string)

In [4]:
# Fetch the example dataset; the resulting frame is stored in the global
# `data`, which point_biserial() and the chart cells read.
data = pd.read_csv("https://homes.cs.washington.edu/~emjun/tea-lang/datasets/pbcorr.csv")

# Run the test once per variable pairing, in a fixed order. The loop
# targets are the same globals (`var_x`, `var_y`) the cell has always set.
for var_x, var_y in (
    ("time", "gender"),
    ("gender", "time"),
    ("time", "recode"),
):
    point_biserial(var_x, var_y)

The Point Biserial correlation did detect a significant correlation between time and gender, r(58) = .38, [.14, .58], p = .003
The Point Biserial correlation did detect a significant correlation between gender and time, r(58) = .38, [.14, .58], p = .003
The Point Biserial correlation did detect a significant correlation between time and recode, r(58) = -.38, [-.58, -.14], p = .003


### Visualizations

In [7]:
# Vertical bars: gender (nominal) along x, time along y.
# NOTE(review): no aggregate is specified, so one bar mark is drawn per row —
# confirm whether an aggregate such as mean(time) was intended.
alt.Chart(data).mark_bar(size=35).encode(
    x=alt.X('gender:N'),
    y=alt.Y('time'),
).properties(width=300)

In [10]:
# Horizontal bars: same encoding as the vertical version with the axes
# swapped — gender (nominal) along y, time along x.
alt.Chart(data).mark_bar(size=35).encode(
    x=alt.X('time'),
    y=alt.Y('gender:N'),
).properties(height=300)

In [15]:
# Scatterplot: one circle per row, gender (nominal) on x and time on y.
# Hovering shows both fields; .interactive() enables pan and zoom.
alt.Chart(data).mark_circle(size=60).encode(
    x=alt.X('gender:N'),
    y=alt.Y('time'),
    tooltip=['gender', 'time'],
).properties(width=200).interactive()


In [16]:
# Parallel coordinates:
#   1. the window transform adds a running count per row as `index`;
#   2. the fold transform reshapes `time` and `gender` into key/value pairs;
#   3. one semi-transparent line is drawn per `index` across the two keys.
alt.Chart(data).transform_window(
    index='count()'
).transform_fold(
    ['time', 'gender']
).mark_line().encode(
    x=alt.X('key:N'),
    y=alt.Y('value:Q'),
    detail=alt.Detail('index:N'),
    opacity=alt.value(0.5),
).properties(width=500)

In [17]:
# 2D heat map: gender x recode grid (both nominal), with each cell's
# colour driven by the quantitative `time` field.
alt.Chart(data).mark_rect().encode(
    x=alt.X('gender:N'),
    y=alt.Y('recode:N'),
    color=alt.Color('time:Q'),
)

In [17]:
# 2D size map: gender x recode grid (both nominal), with each circle's
# area driven by the quantitative `time` field.
alt.Chart(data).mark_circle().encode(
    x=alt.X('gender:N'),
    y=alt.Y('recode:N'),
    size=alt.Size('time:Q'),
).properties(height=200, width=200)