In [1]:
from scipy import stats

# A run through of the libraries in Python for statistical Testing

1. There are probably MULTIPLE libraries in Python to get these test statisics
2. This is just a collection of some of them, chosen by no other method other than: It was the first search result that came up on Google

In [2]:
# single sample 2-tail t test

## Data is from a normal distribution with a mean of 3, and stdev of 2
data = stats.norm.rvs(size=25,loc=3,scale=2)


#pop mean is the population mean, that is known.
## You can play with this population mean and see the corresponding p value
t_statistic, p_value = stats.ttest_1samp(a=data, popmean=0, alternative = "two-sided")
print(f"the t-test statistic: {t_statistic} with a correlated p_value of {p_value}")

the t-test statistic: 8.436499294015267 with a correlated p_value of 1.2168091967036445e-08


In [3]:
# single sample 1-tail t test

# basically the same as above, but you pass in the parameter of alternative
## values: "two-sided", "less", "greater"
## Documentation here: 
## https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_1samp.html

t_statistic, p_value = stats.ttest_1samp(a=data, popmean=3, alternative = "less")

print(f"the t-test statistic: {t_statistic} with a correlated p_value of {p_value}")

the t-test statistic: -0.02789364984532831 with a correlated p_value of 0.48898881255660004


In [4]:
# two sample t test, 2 tail
data_1 = stats.norm.rvs(size=25,loc=3,scale=2)
data_2 = stats.norm.rvs(size=25,loc=3,scale=2)

stats.ttest_ind(a=data_1, b=data_2)

Ttest_indResult(statistic=0.9283375697745784, pvalue=0.3578774409720028)

In [7]:
# Like with t test, you change the tail with the parameter "alternative"

data_1 = stats.norm.rvs(size=25,loc=10,scale=2)
data_2 = stats.norm.rvs(size=25,loc=3,scale=2)

# alternative hypothesis: (mean)data_1 > (mean)data_2
stats.ttest_ind(a=data_1, b=data_2, alternative = "greater")

Ttest_indResult(statistic=11.27075517981074, pvalue=2.1852464370855266e-15)

# Z Test

In a different library!

Documentation here: https://www.statsmodels.org/dev/generated/statsmodels.stats.weightstats.ztest.html

In [8]:
# using the data object from above
from statsmodels.stats.weightstats import ztest

data = stats.norm.rvs(size=100,loc=3,scale=2)

#1 sample z test
## value => population mean.
## alternative => 1 of ["two-sided", "larger", "smaller"]

ztest_score, p_value= ztest(data, value = 0, alternative='two-sided')
print(f"the z-test statistic: {ztest_score} with a correlated p_value of {p_value}")

the z-test statistic: 14.717101475919792 with a correlated p_value of 5.006641129569728e-49


In [9]:
# 2 sample z test
data_2 = stats.norm.rvs(size=100,loc=10,scale=2)
ztest_score, p_value= ztest(data, data_2, alternative='two-sided')

print(f"the z-test statistic: {ztest_score} with a correlated p_value of {p_value}")

the z-test statistic: -23.338852532717205 with a correlated p_value of 1.7886373209876717e-120


# ANOVA

In [10]:
# one way ANOVA
## documentation here: 
## https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html

from scipy.stats import f_oneway

# Student grades
# Groups: Grade level
performance1 = [89, 89, 88, 78, 79]
performance2 = [93, 92, 94, 89, 88]
performance3 = [89, 88, 89, 93, 90]
performance4 = [81, 78, 81, 92, 82]
 
# Conduct the one-way ANOVA
# Can take as many groups as you want to compare
f_oneway(performance1, performance2, performance3, performance4)

F_onewayResult(statistic=4.625000000000002, pvalue=0.016336459839780215)

In [11]:
# two way anova
import pandas as pd # we'll use a dataframe to track all of the confounding factors
import numpy as np

# Fertilizer: how frequently plant is watered
# watering: how frequently each plant was watered that is daily or weekly.
# height: the height of each plant (in inches) after six months.

dataframe = pd.DataFrame({'Fertilizer': np.repeat(['daily', 'weekly'], 15),
                          'Watering': np.repeat(['daily', 'weekly'], 15),
                          'height': [14, 16, 15, 15, 16, 13, 12, 11, 14, 
                                     15, 16, 16, 17, 18, 14, 13, 14, 14, 
                                     14, 15, 16, 16, 17, 18, 14, 13, 14, 
                                     14, 14, 15]})

In [13]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
  
# Performing two-way ANOVA
# This line creates an OLS object which specifies by string
# https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.OLS.html

# there's probably a cleaner way of doing this
model = ols('height ~ C(Fertilizer) + C(Watering) +\
C(Fertilizer):C(Watering)',
            data=dataframe).fit()


result = sm.stats.anova_lm(model, type=2)

In [14]:
# P Values in the "PR(>F)" column
result

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(Fertilizer),1.0,0.033333,0.033333,0.012069,0.913305
C(Watering),1.0,0.000369,0.000369,0.000133,0.990865
C(Fertilizer):C(Watering),1.0,0.040866,0.040866,0.014796,0.904053
Residual,28.0,77.333333,2.761905,,


# Chi Square Test

Documentation
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html

In [17]:
from scipy.stats import chi2_contingency
 
# defining the table
data = [[150, 170], [60, 250]]

test_stat, p, degrees_o_freedom, expected_frequencies = chi2_contingency(data)

print(test_stat, p, degrees_o_freedom)

# interpret p-value
alpha = 0.05
print("p value is " + str(p))
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (H0 holds true)')

52.43305191532259 4.4516826225444905e-13 1
p value is 4.4516826225444905e-13
Dependent (reject H0)


# Fisher Exact Test

Documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html

In [18]:

# importing packages
import scipy.stats as stats
  
# creating data
data = [[2, 8], [7, 3]]
  
# performing fishers exact test on the data
odd_ratio, p_value = stats.fisher_exact(data)
print('odd ratio is : ' + str(odd_ratio))
print('p_value is : ' + str(p_value))

odd ratio is : 0.10714285714285714
p_value is : 0.06977851869492736


# Sign Test

Documentation
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wilcoxon.html

In [21]:
from scipy.stats import wilcoxon

# each element is the different in scores for 1 student 
# before having coaching and after having coaching

# example d[0] = 10 means the student gained 
# 10 points on a test AFTER coaching compared with before

# Alternative Hypothes
grades_before = [ 75, 75, 75, 60, 62]
grades_after = [65, 65, 65, 61, 59]

diff_after_before = [grades_after[i] - grades_before[i] for i in range(0, len(grades_before))]
                     

res = wilcoxon(diff_after_before)
res.statistic, res.pvalue

(1.0, 0.125)

# Kruskal Wallis Test

Documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kruskal.html

In [24]:
x = [1, 3, 5, 7, 9]
print(sum(x))
y = [2, 4, 6, 8, 10]
print(sum(y))
stats.kruskal(x, y, x, y)

25
30


KruskalResult(statistic=0.5757575757575737, pvalue=0.9019589941867333)

# Mann Whitney Wilcoxon Test

Documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html

In [None]:
# some rankings
males = [19, 22, 16, 29, 24]
females = [20, 11, 17, 12]

In [None]:
U1, p = stats.mannwhitneyu(males, females, method="exact")
print(U1, p)

# Run Test

Documentation:
1-sample: https://www.statsmodels.org/stable/generated/statsmodels.sandbox.stats.runs.runstest_1samp.html

2-sample: https://www.statsmodels.org/stable/generated/statsmodels.sandbox.stats.runs.runstest_2samp.html

In [25]:
from statsmodels.sandbox.stats.runs import runstest_1samp 

#create dataset
times_series_data = [12, 16, 16, 15, 14, 18, 19, 21, 13, 13]

#Perform Runs test on 1 sample
runstest_1samp(times_series_data, correction=False)

(-0.6708203932499369, 0.5023349543605021)

(-0.6708203932499369, 0.5023349543605021)

# Regression Analysis

There are several types of regression analysis:
https://www.statsmodels.org/stable/api.html#regression

Let's go look at the docs and see what we have!

1. Regression analysis is built off of Linear Regression which is an ML technique to find the best fit line.  We will cover how this works soon!