In [1]:
import altair as alt
import pandas as pd
import statsmodels as sm
from scipy import stats
import numpy as np

## Generalized Correlations Visualizations 
Generalized support for Perason's Spearman's Rho, Kendall's Tau, and Point Biserial tests

In [2]:
def correlations(test, var_x, var_y):
    x = data[var_x]
    y = data[var_y]
    df = len(x) - 2 # Determine degrees of freedom 

    r_val = None
    p_val = None 
    corr = ""

    if test == 'pearson':
        r_val, p_val = stats.pearsonr(x, y)
        corr = "Peason's correlation "
    elif test == 'spearman':
        r_val, p_val = stats.spearmanr(x, y)
        corr = "Spearman's Rho correlation "
    elif test == 'kendall':
        r_val, p_val = stats.kendalltau(x, y)
        corr = "Kendall's Tau correlation "
    elif test == 'pointbiserial':
        r_val, p_val = stats.pointbiserialr(x, y)
        corr = "Point Biserial correlation "
    else:
        print("Invalid argument for 'test'")
        print("Valid options include 'person', 'spearman', 'kendall', and 'pointbiserial")
        return

    # Determine significance based on the p value 
    significance = "did not"
    if p_val < 0.05:
        significance = "did"

    # Determine the confidence interval 
    ci = []
    if test == 'kendall' or test == 'spearman':
        stderr = 1.0 / np.sqrt(len(x) - 3)
        delta = stderr * 1.96
        ci = np.tanh(np.arctanh(r_val) - delta), np.tanh(np.arctanh(r_val) + delta)

    # For Pearson/Point Biserial, the r_val must be translated to a Fisher's z_score, then 
    # converted back 
    else:
        r_z = np.arctanh(r_val)
        stdev = 1/np.sqrt(len(x)-3)
        alpha = 0.05
        z = stats.norm.ppf(1-alpha/2)
        ci = np.tanh((r_z-z*stdev, r_z+z*stdev))

    p_string = "p = " + "{0:.3f}".format(p_val).lstrip('0')
    if p_val < 0.001:
        p_string = "p < .001"

    t_string = "{0:.2f}".format(abs(r_val)).lstrip('0')
    if r_val < 0:
        t_string = "-" + t_string
    t_string = "r(" + str(df) + ") = " + t_string

    ci_1_string = "{0:.2f}".format(abs(ci[0])).lstrip('0')
    if ci[0] < 0:
        ci_1_string = "-" + ci_1_string 
    ci_2_string = "{0:.2f}".format(abs(ci[1])).lstrip('0')
    if ci[1] < 0:
        ci_2_string = "-" + ci_2_string 

    ci_string = "[" + ci_1_string + ", " + ci_2_string + "]"

    print("The " + corr + significance + " detect a significant correlation between " \
        + str(var_x) + " and " + str(var_y) + ", " + t_string +  ", " \
        + ci_string + ", " + p_string)

In [4]:
# Load data
data = pd.read_csv("https://homes.cs.washington.edu/~emjun/tea-lang/datasets/liar.csv")

# Execute statistical test
var_x = "Position"
var_y = "Creativity"
correlations('spearman', var_x, var_y)

var_x = "Creativity"
var_y = "Position"
correlations('pearson', var_x, var_y )

var_x = "Creativity"
var_y = "Position"
correlations('pointbiserial', var_x, var_y )

var_x = "Novice"
var_y = "Position"
correlations('kendall', var_x, var_y )

The Spearman's Rho correlation did detect a significant correlation between Position and Creativity, r(66) = -.37, [-.56, -.15], p = .002
The Peason's correlation did detect a significant correlation between Creativity and Position, r(66) = -.31, [-.51, -.07], p = .011
The Point Biserial correlation did detect a significant correlation between Creativity and Position, r(66) = -.31, [-.51, -.07], p = .011
The Kendall's Tau correlation did not detect a significant correlation between Novice and Position, r(66) = .10, [-.14, .33], p = .374
