In [None]:
# CONFIDENCE INTERVALS

In [None]:
# EXERCISE 1. What is the normal body temperature for healthy humans? 
# A random sample of 130 healthy human body temperatures provided by Allen Shoemaker 
# yielded 98.25 degrees and standard deviation 0.73 degrees. 
# Give a 99% confidence interval for the average body temperature of healthy people.

In [4]:
import numpy as np
from scipy import stats

In [5]:
mean = 98.25
standard_deviation = 0.73
sample_size = 130
sem = standard_deviation / np.sqrt(sample_size)
alpha = 0.01

In [6]:
stats.norm.interval(confidence=0.99, loc=mean, scale=sem)

(98.08508192246582, 98.41491807753418)

In [None]:
# EXERCISE 2. The administrators for a hospital wished to estimate the average number of days 
# required for inpatient treatment of patients between the ages of 25 and 34. 
# A random sample of 500 hospital patients between these ages produced 
# a mean and standard deviation equal to 5.4 and 3.1 days, respectively.
# Construct a 95% confidence interval for the mean length of stay for the population of patients from which the sample was drawn.

In [7]:
mean = 5.4
standard_deviation = 3.1
sample_size = 500
sem = standard_deviation / np.sqrt(sample_size)
alpha = 0.05

In [8]:
stats.norm.interval(confidence=0.95, loc=mean, scale=sem)

(5.12827801242126, 5.67172198757874)

In [None]:
# HYPOTHESIS TESTING

In [None]:
EXERCISE 3. The hourly wages in a particular industry are normally distributed with mean $13.20 and standard deviation $2.50. 
A company in this industry employs 40 workers, paying them an average of $12.20 per hour. 
Can this company be accused of paying substandard wages? 
Use an α = .01 level test. (Wackerly, Ex.10.18)
CHECK: statistic: -2.5298221281347035, pvalue= 0.005706018193000826

In [55]:
alpha = 0.01
zt = (12.20 - 13.20)/(2.5/np.sqrt(40))  # z-statistic for mean
p_value = scipy.stats.norm.sf(abs(zt)) # one-sided pvalue = Prob(abs(z)>zt)
print((zt, pval))  # The result is significant at p < .01.

(-2.5298221281347035, 0.005706018193000826)


In [56]:
if p_value < alpha:
    print('At {} level of significance, we can reject the null hypothesis in favor of alternative hypothesis.'.format(alpha))
else:
    print('At {} level of significance, we fail to reject the null hypothesis.'.format(alpha))

At 0.01 level of significance, we can reject the null hypothesis in favor of alternative hypothesis.


In [None]:
# EXERCISE 4.Shear strength measurements derived from unconfined compression tests for two types of soils gave 
# the results shown in the following document (measurements in tons per square foot). 
# Do the soils appear to differ with respect to average shear strength, at the 1% significance level?
# CHECK: statistic: 5.1681473319343345, pvalue= 2.593228732352821e-06

In [40]:
import pandas as pd
df0 = pd.read_excel('soil.xlsx')

In [41]:
df = df0.copy()

In [42]:
df.head()

Unnamed: 0,Soil1,Soil2
0,1.442,1.364
1,1.943,1.878
2,1.11,1.337
3,1.912,1.828
4,1.553,1.371


In [45]:
df.Soil1.count()

30

In [62]:
tt = stats.ttest_ind_from_stats(mean1=df.Soil1.mean(), 
                           std1=df.Soil1.std(), 
                           nobs1=df.Soil1.count(),
                           mean2=df.Soil2.mean(), 
                           std2=df.Soil2.std(),      
                           nobs2=df.Soil2.count())
tt

Ttest_indResult(statistic=5.168147331934331, pvalue=2.5932287323528494e-06)

In [68]:
alpha = 0.01
p_val = tt[1]
p_val

2.5932287323528494e-06

In [69]:
if p_val < alpha:
    print('At {} level of significance, we can reject the null hypothesis in favor of alternative hypothesis.'.format(alpha))
else:
    print('At {} level of significance, we fail to reject the null hypothesis.'.format(alpha))

At 0.01 level of significance, we can reject the null hypothesis in favor of alternative hypothesis.


In [None]:
# EXERCISE 5. The following dataset is based on data provided by the World Bank 
# (https://datacatalog.worldbank.org/dataset/education-statistics). 
# World Bank Edstats.  2015 PISA Test Dataset

# Get descriptive statistics (the central tendency, dispersion and shape of a dataset’s distribution)
# for each continent group (AS, EU, AF, NA, SA, OC).

# Determine whether there is any difference (on the average) for the math scores among European (EU) and Asian (AS) countries
# (assume normality and equal variances). 
# Draw side-by-side box plots.

# CHECK: statistic=0.870055317967983, pvalue=0.38826888111307345

In [71]:
df00 = pd.read_excel('2015 PISA Test.xlsx')

In [72]:
df1 = df00.copy()

In [73]:
df1.head()

Unnamed: 0,Country Code,Continent_Code,internet_users_per_100,Math,Reading,Science
0,ALB,EU,63.252933,413.157,405.2588,427.225
1,ARE,AS,90.5,427.4827,433.5423,436.7311
2,ARG,SA,68.043064,409.0333,425.3031,432.2262
3,AUS,OC,84.560519,493.8962,502.9006,509.9939
4,AUT,EU,83.940142,496.7423,484.8656,495.0375


In [74]:
df1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
internet_users_per_100,70.0,71.973099,16.390632,21.976068,60.89902,72.99935,85.026763,98.2
Math,70.0,460.971557,53.327205,327.702,417.416075,477.60715,500.482925,564.1897
Reading,70.0,460.997291,49.502679,346.549,426.948625,480.19985,499.687475,535.1002
Science,70.0,465.439093,48.397254,331.6388,425.923375,475.40005,502.43125,555.5747


In [75]:
df1.shape

(70, 6)

In [97]:
n = df1.shape[0]
n

70

In [86]:
df1["Continent_Code"].value_counts()

EU    37
AS    17
SA     7
OC     2
AF     2
Name: Continent_Code, dtype: int64

In [105]:
n1 = df1["Continent_Code"].value_counts().values[0]
n1

37

In [106]:
n2 = df1["Continent_Code"].value_counts().values[1]
n2

17

In [98]:
df2 = df1[df1["Continent_Code"] == "EU"]
mean1 = df2.Math.mean()
mean1

477.98144864864867

In [102]:
std1 = df2.Math.std()
std1

35.150402627270616

In [99]:
df3 = df1[df1["Continent_Code"] == "AS"]
mean2 = df3.Math.mean()
mean2

466.2166470588236

In [103]:
std2 = df3.Math.std()
std2

64.3564901327764

In [101]:
sigma = df1.Math.std()

53.32720548161262

In [109]:
tt = stats.ttest_ind_from_stats(mean1= mean1, 
                           std1=std1, 
                           nobs1=n1,
                           mean2=mean2     , 
                           std2=std2,      
                           nobs2=n2)
tt

Ttest_indResult(statistic=0.8700553179679748, pvalue=0.3882688811130779)

In [112]:
alpha = 0.01
p_val2 = tt[1]
p_val2

0.3882688811130779

In [113]:
if p_val2 < alpha:
    print('At {} level of significance, we can reject the null hypothesis in favor of alternative hypothesis.'.format(alpha))
else:
    print('At {} level of significance, we fail to reject the null hypothesis.'.format(alpha))

At 0.01 level of significance, we fail to reject the null hypothesis.
