In [49]:
# Задача. Провести дисперсионный анализ для определения того, есть ли различия среднего роста среди взрослых футболистов, 
# хоккеистов и штангистов. Даны значения роста в трех группах случайно выбранных спортсменов:
# Футболисты: 173, 175, 180, 178, 177, 185, 183, 182.
# Хоккеисты: 177, 179, 180, 188, 177, 172, 171, 184, 180.
# Штангисты: 172, 173, 169, 177, 166, 180, 178, 177, 172, 166, 170.

In [50]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

In [51]:
# Критерий Фишера
# Истинный уровень статистической значимости:
alpha = 1 - (1 - 0.05)**3
alpha

0.1426250000000001

In [52]:
football = np.array([173, 175, 180, 178, 177, 185, 183, 182])
hockey = np.array([177, 179, 180, 188, 177, 172, 171, 184, 180])
weightlifters = np.array([172, 173, 169, 177, 166, 180, 178, 177, 172, 166, 170])

In [53]:
k = 3
n = football.shape[0] + hockey.shape[0] + weightlifters.shape[0]
n

28

In [54]:
ftbl_mean = football.mean()
hcky_mean = hockey.mean()
wtlt_mean = weightlifters.mean()
ftbl_mean, hcky_mean, wtlt_mean

(179.125, 178.66666666666666, 172.72727272727272)

In [55]:
total = np.array([173, 175, 180, 178, 177, 185, 183, 182, 177, 179, 180, 188, 177, 172, 171, 184, 180, 172, 173, 169, 177, 166, 180, 178, 177, 172, 166, 170])
# total = np.array([football, hockey, weightlifters])
total_mean = total.mean()
total_mean

176.46428571428572

In [56]:
S_f = np.sum((ftbl_mean - total_mean)**2) * football.shape[0] + np.sum((hcky_mean - total_mean)**2) * hockey.shape[0] + \
      np.sum((wtlt_mean - total_mean)**2) * weightlifters.shape[0]
S_f

253.9074675324678

In [57]:
S_ost = np.sum((football - ftbl_mean)**2) + np.sum((hockey - hcky_mean)**2) + np.sum((weightlifters - wtlt_mean)**2)
S_ost

577.0568181818182

In [58]:
D_f = S_f / k - 1
D_ost = S_ost / (n - k)
D_f, D_ost

(83.63582251082259, 23.08227272727273)

In [59]:
F_n = D_f / D_ost
F_n

3.6233790103347645

In [61]:
# k1 = 3 - 1 = 2
# k2 = 28 - 3 = 25
F_cr = 3.38
# F_cr < F_n => Отклоняем H0: различия в среднем росте есть.

In [62]:
# Проверка
f = stats.f_oneway(football, hockey, weightlifters)
f

F_onewayResult(statistic=5.500053450812596, pvalue=0.010482206918698694)

In [63]:
# p-value < alpha => различия есть

In [64]:
# Post hoc test Tukey
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import pandas as pd

In [65]:
df = pd.DataFrame({'height': [173, 175, 180, 178, 177, 185, 183, 182,
                             177, 179, 180, 188, 177, 172, 171, 184, 180,
                             172, 173, 169, 177, 166, 180, 178, 177, 172, 166, 170],
                  'group': np.repeat(['football', 'hockey', 'weightlifters'],[8,9,11])})
tukey = pairwise_tukeyhsd(endog = df['height'],
                         groups = df['group'],
                         alpha = 0.05)
print(tukey)

     Multiple Comparison of Means - Tukey HSD, FWER=0.05      
 group1      group2    meandiff p-adj   lower    upper  reject
--------------------------------------------------------------
football        hockey  -0.4583  0.979  -6.2732  5.3566  False
football weightlifters  -6.3977 0.0219 -11.9583 -0.8372   True
  hockey weightlifters  -5.9394 0.0284 -11.3181 -0.5607   True
--------------------------------------------------------------


In [66]:
# Отклоняем H0 во второй и третьей строке => есть различия между футболистами и штангистами, и между хоккеистами и штангистами.