In [101]:
from scipy.stats import binom
from scipy.stats import poisson
from scipy.stats import chi, chi2

import scipy.stats as st

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math

## Question 1
The following table indicates the number of 6-point scores in an American rugby match in the 1979 season.

![](table1.png)

Based on these results, we create a Poisson distribution with the sample mean parameter  = 2.435. Is there any reason to believe that at a .05 level the number of scores is a Poisson variable?

In [97]:
alpha = 0.05
mean = 2.435
scores = [0,1,2,3,4,5,6,7]
freq_obs_scores = [35,99,104,110,62,25,10,3]
print('Quick Mean Check: ' + str(sum(np.array(scores)*np.array(freq_obs_scores))/sum(freq_obs_scores)))
print('')
freq_expect_scores = [sum(freq_obs_scores) * (((mean**i) * math.exp(-1*mean))/math.factorial(i)) for i in scores] 
freq_expect_scores = freq_expect_scores[:-1] ### fixing the matter of the last category being "or more"
freq_expect_scores.append(sum(freq_obs_scores) - sum(freq_expect_scores))

chi_squared = np.sum(np.divide(np.square(np.subtract(freq_obs_scores, freq_expect_scores)),freq_expect_scores))
print('Chi-squared Stat: ' + str(chi_squared))

dist = chi2(len(freq_obs_scores)-1)
criticalvalue = dist.ppf(1-alpha)

print('Chi-squared Crit: ' + str(criticalvalue))
print('')

if chi_squared > criticalvalue:
    print('We can reject the null. It does not follow the tested distribution.')
else: 
    print("We cannot reject the null. We cannot exclude the possibility of it following the tested distribution.")
    
### I do not understand how can a test like this reach the same conclusion as one in which I would use a full list of 
### scores instead of frequencies... say a list that would be obtained by:

        # Observed_Scores_List = []
        # for i in range(len(freq_scores)):
        #     j = 1
        #     while j <= freq_obs_scores[i]:
        #         Observed_Scores_List.append(scores[i])
        #         j += 1

        # Expected_Scores_List = []
        # for i in range(len(freq_expect_scores)):
        #     j = 1
        #     while j <= dist_0_7[i]:
        #         Expected_Scores_List.append(scores[i])
        #         j += 1
        
### We do use multiply by the total and we do use different dfs... but ...

Quick Mean Check: 2.435267857142857

Chi-squared Stat: 6.491310681109773
Chi-squared Crit: 14.067140449340169

We cannot reject the null. We cannot exclude the possibility of it following the tested distribution.


## BONUS/OPTIONAL - Question 2
Let's analyze a discrete distribution. To analyze the number of defective items in a factory in the city of Medellín, we took a random sample of n = 60 articles and observed the number of defectives in the following table:

![](table2.png)

A poissón distribution was proposed since it is defined for x = 0,1,2,3, .... using the following model:

![](image1.png)

For some extra insights check the following link: https://online.stat.psu.edu/stat504/node/63/ 

Does the distribution of defective items follow this distribution?

In [98]:
### Exact same as above, but used the mean of the sample for generating the expected instead of a mean provided.

# your code here
alpha = 0.05
scores = [0,1,3,4]
freq_obs_scores = [32,15,9,4]

mean = sum(np.array(scores)*np.array(freq_obs_scores))/sum(freq_obs_scores)
print('Quick Mean Check: ' + str(mean))
print('')
freq_expect_scores = [sum(freq_obs_scores) * (((mean**i) * math.exp(-1*mean))/math.factorial(i)) for i in scores] 
freq_expect_scores = freq_expect_scores[:-1] ### fixing the matter of the last category being "or more"
freq_expect_scores.append(sum(freq_obs_scores) - sum(freq_expect_scores))

chi_squared = np.sum(np.divide(np.square(np.subtract(freq_obs_scores, freq_expect_scores)),freq_expect_scores))
print('Chi-squared Stat: ' + str(chi_squared))

dist = chi2(len(freq_obs_scores)-1)
criticalvalue = dist.ppf(1-alpha)

print('Chi-squared Crit: ' + str(criticalvalue))
print('')

if chi_squared > criticalvalue:
    print('We can reject the null. It does not follow the tested distribution.')
else: 
    print("We cannot reject the null. We cannot exclude the possibility of it following the tested distribution.")

Quick Mean Check: 0.9666666666666667

Chi-squared Stat: 20.01608552988212
Chi-squared Crit: 7.814727903251179

We can reject the null. It does not follow the tested distribution.


## Question 3
A quality control engineer takes a sample of 10 tires that come out of an assembly line, and would like to verify on the basis of the data that follows, if the number of tires with defects observed over 200 days, if it is true that 5% of all tires have defects (that is, if the sample comes from a binomial population with n = 10 and p = 0.05). 

![](table3.png)


In [99]:
# your answer here
alpha = 0.05
scores = [0,1,2]
freq_obs_scores = [138,53,9]
n = 10
p = 0.05
freq_expect_scores = []
k = 0
freq_expect_scores.append(200 * (math.factorial(n)/(math.factorial(k)*math.factorial(n-k)))*(p**k)*((1-p)**(n-k)))
k = 1
freq_expect_scores.append(200 * (math.factorial(n)/(math.factorial(k)*math.factorial(n-k)))*(p**k)*((1-p)**(n-k)))
freq_expect_scores.append(sum(freq_obs_scores) - sum(freq_expect_scores))

chi_squared = np.sum(np.divide(np.square(np.subtract(freq_obs_scores, freq_expect_scores)),freq_expect_scores))
print('Chi-squared Stat: ' + str(chi_squared))

dist = chi2(len(freq_obs_scores)-1)
criticalvalue = dist.ppf(1-alpha)

print('Chi-squared Crit: ' + str(criticalvalue))
print('')

if chi_squared > criticalvalue:
    print('We can reject the null. It does not follow the tested distribution.')
else: 
    print("We cannot reject the null. We cannot exclude the possibility of it following the tested distribution.")

Chi-squared Stat: 8.306179519542825
Chi-squared Crit: 5.991464547107979

We can reject the null. It does not follow the tested distribution.


## Question 4
A researcher gathers information about the patterns of Physical Activity of children in the fifth grade of primary school of a public school. He defines three categories of physical activity (Low, Medium, High). He also inquires about the regular consumption of sugary drinks at school, and defines two categories (Yes = consumed, No = not consumed). We would like to evaluate if there is an association between patterns of physical activity and the consumption of sugary drinks for the children of this school, at a level of 5% significance. The results are in the following table: 

![](table4.png)

In [107]:
#your answer here
chi2, p, dof, ex = st.chi2_contingency([[32,12],[14,22],[6,9]])
if p < 0.05:
    print('We can reject the null. There is an association between the two variables.')
else: 
    print("We cannot reject the null. We cannot exclude the possibility of no association being present.")

We can reject the null. There is an association between the two variables.
