In [47]:
import scipy.stats as stats
from scipy.stats import f as f_dist
import numpy as np


# Funtions to get sum squares between and within. ANOVA relies on this 

In [27]:
def SSW(*args):
    args = [arg for arg in args]
    ssw = 0
    for arg in args: 
        group_mean = np.mean(arg)
        val = np.sum((arg - group_mean)**2)
        ssw += val
    return ssw

def SSB(*args):
    args = [arg for arg in args]
    ssb = 0
    alldata = np.concatenate(args)
    grand_mean = np.mean(alldata)
    for arg in args: 
        group_mean = np.mean(arg)
        n = len(arg)
        val = n*((group_mean - grand_mean)**2)
        ssb += val

    return ssb

# SSB FUNTION NOT WORKING; NOT SURE WHY 

In [28]:
# test runs - same means
data1 = np.random.normal(size=20)
data2 = np.random.normal(size=20)
data3 = np.random.normal(size=20)

In [29]:
# different means
d1 = np.random.normal(loc = 10, size=100)
d2 = np.random.normal(loc = 3, size=100)
d3 = np.random.normal(loc = 2, size=100)

In [30]:
SSB(data1, data2, data3)

0.1773561846670917

In [31]:
SSB(d1, d2, d3)

3902.2058520947367

## When args are passed as an array(as in anova funtion), SSB is not working. But SSW is. 

In [33]:
args = [data1, data2, data3]
args2 = [d1, d2, d3]
SSB(*args)

0.1773561846670917

Why is is returning 0? 

In [34]:
SSB(*args2)

3902.2058520947367

In [35]:
SSW(*args)

60.97208980528673

In [36]:
SSW(*args2)

296.4088935960911

In [42]:
def get_sum_square_stats(*args): 
    """
    Parameters
    ----------------------
    args: array-like
        list of groups to be analyzed. 
        
    Returns
    ------------------
    ssw: float
        Sum-of-squares within the arguments. Tells us how much of the variation
        is due to within-argument variation. 
    
    ssb: Sum-of-squares between arguments. Tells us how much of the variation 
        of the groups is from between-group variation. 
    
    sst: Total sum of squares variation for the list of groups. Tells us how much 
        the samples vary in total. 
        sst = ssw + ssb
    """
    ssw = SSW(*args)
    ssb = SSB(*args)
    sst = ssb + ssw
    return ssw, ssb, sst

def get_anova_counts(args): 
    """
    Parameters
    ----------------------
    args: array-like
        list of groups to be counted. 
        
    Returns
    ------------------
    k: int
        number of groups 
    N: int
        total number of observations across all groups
    dfbn: int
        degrees of freedom between arguments; k - 1
    dfwn: int
        degrees of freedom within arguments; n - 1
    """
    k = len(args)
    N = 0
    for i in range(k):
        N += len(args[i])
    dfbn = k - 1
    dfwn = N - k
    
    return k, N, dfbn, dfwn

def finish_anova(f, dfbn, dfwn): 
     """
    Parameters
    ----------------------
    f: float
         calculated f-value. 
         f = mean-square-between groups / mean-square-within groups
         The f-value is the ratio of how much variation is between versus within groups. 
         If the groups means were the same, the between group variation would be zero. 
         As the f-value grows, the more dissimilar the groups means are. 
    dfbn: int
        Degrees of freedom between groups passed to ANOVA funtion. 
    dfwn: int
        Degrees of freedom within groups. 
        
    Returns
    ------------------
    p: float
        p-value calculated from passing gotten f-value and df's into the f-density funtion from scipy. 
        could attempt to manually implement, i.e
        [f(x, df_1, df_2) = (df_2^{df_2/2} df_1^{df_1/2} x^{df_1 / 2-1} / 
        [{(df_2+df_1 x)^{(df_1+df_2)/2}*sc.beta(df_1/2, df_2/2)}\]
        
    return_string: String
        Specifies whether gotten p value implies rejecting or failing to reject the null hypothesis. 
    
    """
    # use scipy to plug f-value into f distribution to return p-value
     p_value = f_dist.sf(dfbn, dfwn, f)
    
     if p_value > .05:
        return_string = "F-value: " + str(f) + ", P-value: " + str(
            p_value) + ", Fail to reject null hypothesis."
     else:
        return_string = "F-value: " + str(f) + ", P-value: " + str(
            round(p_value, 5)) + ", Reject null hypothesis."

     return return_string

In [59]:
# use *args command to accept variable number of arguments
def anova(*args):
    
    k, N, dfbn, dfwn  = get_anova_counts(args)
    
    ssw, ssb, sst = get_sum_square_stats(*args)

    msb = ssb / float(dfbn)
    msw = ssw / float(dfwn)
    f = msb / msw
    
    return finish_anova(f, dfbn, dfwn)
    

   

In [60]:
anova(data1, data2, data3)


'F-value: 0.08290106635928096, P-value: 0.8696529244645572, Fail to reject null hypothesis.'

In [48]:
stats.f_oneway(data1, data2, data3)

F_onewayResult(statistic=0.08290106635928098, pvalue=0.9205529766156326)

In [54]:
stats.f_oneway(*args)

F_onewayResult(statistic=0.08290106635928098, pvalue=0.9205529766156326)

# Big issue right now: SSB, from earlier, is not working and is frustrating me. 

In [14]:
argss = data1, data2, data3
SSB(argss)

0.0