In [6]:
#First we always import all our libraries 
import numpy as np
import pandas as pd
import scipy as stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import math
%matplotlib inline

##This line is what data munging is. 
dataHowell = pd.read_excel('Howell_2.4.xls')
#print(dataHowell)

In [13]:
##Let's run an independent t-test using the dataHowell data. 

#We need to import from scipy to run the t-test function.
from scipy import stats

#store the dataHowell data in an array, t-tests don't like parsed data for some reason. 
ChildrenArray = np.array(dataHowell['children'])
AdultsArray = np.array(dataHowell['adults'])

#Let's do Levene's normality test.
#data is normal if it is bigger than 0.05 after running the Levene test.
#the way each of these arrays is set up they will only display the p-value.
#if you want to view the statistic get rid of the [1] in the array.
Normality = stats.levene(ChildrenArray, AdultsArray)[1]
print("Levene Normality test", Normality)

##perform an independent t-test as these are two unrealted groups.
##stats.ttest_ind is the function for performing the independent t-test. 

#so lets run the independent t-test
IndependentTTest = stats.ttest_ind(ChildrenArray, AdultsArray, equal_var=False)[1]
#this will output the p-value
print("Independent t-test", IndependentTTest)

#let's pretend that we need to run a dependent t-test on dataHowell, even though we know it should be independent. 
#so lets run a dependent t-test
DependentTTest = stats.ttest_rel(ChildrenArray, AdultsArray)[1]
print("Dependent t-test", DependentTTest)

#Now lets pretend that the data for children is non-parametric so we have to run a one sample t-test.
#lets use 12 as the population mean
OneSampleTTest = stats.ttest_1samp(ChildrenArray, 12)[1]
print("One Sample t-test", OneSampleTTest)

Levene Normality test 0.8706741479402627
Independent t-test 3.1649249519095217e-18
Dependent t-test 1.6932758799562023e-16
One Sample t-test 1.24463015539893e-14


In [10]:
##Let's fine median. mean and mode

#Medians
ChildrenMedian = ([np.median(dataHowell['children'])]) 
AdultsMedian = ([np.median(dataHowell['adults'])])
#How we print the medians
print("Median of Children", ChildrenMedian)
print("Median of Adults", AdultsMedian)

#Means
ChildrenMean = ([np.mean(dataHowell['children'])])
AdultsMean = ([np.mean(dataHowell['adults'])])
#How we print the medians
print("Mean of Children", ChildrenMean )
print("Mean of Adults", AdultsMean)

#Modes
AdultsMode = stats.mode(dataHowell['adults'])[1]
ChildrenMode = stats.mode(dataHowell['children'])[1]
#How we print the modes
print("Mode of Children", ChildrenMode)
print("Mode of Adults", AdultsMode)

Median of Children [18.0]
Median of Adults [10.0]
Mean of Children [18.9]
Mean of Adults [10.2]
Mode of Children [10]
Mode of Adults [8]


In [None]:
#Let's plot some graphs using the dataHowell data. 

#Here we're making a bar graph
print("dataHowell bar graph")

#This is the mean of both vectors
mean=(dataHowell.mean())
#Thisis the errorbar of both vectors
std=(dataHowell.std())
#Let's define the length of dataHowell. We're defining it as n because n is what we call sample size in stats. 
n=len(dataHowell.std()) 

#let's draw boxes with their mean and variability error
plt.bar(np.arange(n), mean , yerr=std)

#let's label the x axis
plt.xticks(np.arange(n), ('children', 'adults'))

#lets label the y axis
plt.ylabel('phrases')

#this hides the plotting info we don't want to show
plt.show()

In [None]:
#Now let's do box plots for dataHowell
print("dataHowell box plots")

#here we are drawing boxes children and adults
plt.boxplot([dataHowell['children'],dataHowell['adults']]) 

#let's label y axe
plt.ylabel('phrases')

#let's label each box x axe
plt.xticks([1 ,2],["children","adults"])

#this hides the plotting info we don't want to show
plt.show()

In [None]:
#Now let's do a scatter plot for dataHowell
print("dataHowell scatter plot")

#This is just for asigning random colors
N = 50
colors = np.random.rand(N)

#here we are drawing scatter plots children and adults
plt.scatter((dataHowell['children']),(dataHowell['adults']), c=colors, alpha=0.5)

#label our axises
plt.xlabel("children")
plt.ylabel("adults")

#this hides the plotting info we don't want to show
plt.show()

In [None]:
#Now lets do a histrogram with dataHowell
print("dataHowell histrogram")

#here's how we plot a histogram with children and adults together
plt.hist([dataHowell['adults'],dataHowell['children']],42,range=[0,42])

#now we label the axis
plt.xlabel('phrases')
plt.ylabel('amount')

#this hides the plotting info we don't want to show
plt.show()

In [None]:
#Now let's use the dataHowell data to calculate the 95% Confidence Interval. 

#Here we will define the length as nChildren since this is our sample size for children. 
nChildren = len(dataHowell['children'])
#Here we will define the length as nAdults since this is our sample size for adults. 
nAdults =  len(dataHowell['adults'])

#This is the mean of the sample children
mean_sampleC = np.mean(dataHowell['children'])
#This is the mean of the sample adults
mean_sampleA = np.mean(dataHowell['adults'])

#Here we are getting the standard of error "sem" for sample children
##by divding the standard devation and degrees of freedom "ddof" by the square root of the sample size 
##note: change degrees of freedom to the actual number in children
sem = np.std(dataHowell['children'],ddof=1)/math.sqrt(nChildren)

#here we are defining the lower range for the children sample.
#I think these ranges are fixed if we are using 95% CI, regardless of our data
lower_range=stats.t(nChildren).ppf(0.025)*sem
#here we are defining the upper range for the children sample
upper_range=stats.t(nChildren).ppf(0.975)*sem

#Now here is where we are actually calucating the confidence interval for the children sample
confidence_interval_sampleC = [mean_sampleC + lower_range, mean_sampleC + upper_range]

#Here we are getting the standard of error "sem" for sample adults
sem = np.std(dataHowell['adults'],ddof=1)/math.sqrt(nAdults)

#here we are defining the lower range for the adult sample.
lower_range=stats.t(nAdults).ppf(0.025)*sem
#here we are defining the upper range for the adult sample.
upper_range=stats.t(nAdults).ppf(0.975)*sem

#And here we are calucating the confidence interval for the adult sample
confidence_interval_sampleA = [mean_sampleA + lower_range, mean_sampleA + upper_range]

#Here we are printing the means
print(mean_sampleC)
print(mean_sampleA)

#And here we are printing the confidence intervals 
print(confidence_interval_sampleC)
print(confidence_interval_sampleA)

#And finally here we are running the stats pearson to ge the degrees of signifigance
degree_of_sig = stats.pearsonr((dataHowell['children']),(dataHowell['adults']))[1]
print(degree_of_sig)


In [24]:
#Now lets run the Wilcoxon

#We need to import from scipy to run the wilcoxon.
from scipy.stats import wilcoxon

#here are the arrays we will be drawing from for the wilcoxon and the man whitney.
#we are going to pretend that this data is non-parametric for the sake of this exercise.
#the wilcoxon is used for the non-parametric equivolent of a one sample-test or a dependent t-test.
ChildrenArray = np.array(dataHowell['children'])
AdultsArray = np.array(dataHowell['adults'])

#now run the wilcoxon here.
Result = wilcoxon(ChildrenArray)[1]
print ("Wilcoxon Test Results", Result)

#While we're at it why don't we run the Man Whitney, just for fun
#the man whitney is the non-parametric equivolent of the independent t-test for non-parametric data. 
Result = stats.mannwhitneyu(ChildrenArray, AdultsArray)[1]
print (" Man Whitney Test", Result)

Wilcoxon Test Results 7.008375735542048e-10
 Man Whitney Test 3.853821076267207e-16


In [31]:
##Here we're going to run the Spearman corrolation using data that I made up. 
#We will assume that the data is non-parametric for this example because we have a small sample size. 
#We need continues data to run the Spearman test, so we will use weight as it corrolates to height. 

#Here we are importing numpy random as we need to use random since we don't have real data
from numpy.random import rand
#Here we are importing spearman from the scipy library
from scipy.stats import spearmanr

#We are running the spearman test to see if there is a corrolation between height and weight. 
##Height is listed in meters, weight in kilos.

#here we define height
Height = np.array([1.65, 2.0, 1.87, 1.44, 1.03, 0.94, 1.36])
#here we define weight
Weight= np.array([65, 80 , 58, 91, 57, 46, 52])

#calculate spearman's correlation
coef, p = spearmanr(Height, Weight)
print('Spearmans correlation coefficient: %.3f' % coef)

#interpret the significance
alpha = 0.05

if (p > alpha) or (p  == alpha):
	print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
else:
	print('Samples are correlated (reject H0) p=%.3f' % p)

Spearmans correlation coefficient: 0.714
Samples are uncorrelated (fail to reject H0) p=0.071
