## 0. python imports

In [1]:
import pandas as pd
from scipy.stats import ttest_rel, ttest_1samp, ttest_ind

## 1. data loading

In [2]:
# Load the before/after blood-pressure measurements and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a row
# index that was written into the CSV -- consider index_col=0 if it is not needed.
blood_pressure = pd.read_csv('./data/blood_pressure.csv')
blood_pressure.head()

Unnamed: 0,before,after
0,136.713072,92.432965
1,134.735618,105.022643
2,127.529115,82.242766
3,144.527126,93.607172
4,124.21472,103.212223


In [3]:
# Load the A/B test samples (columns 'a' and 'b') and preview the first rows.
ab_test = pd.read_csv('./data/ab_test.csv')
ab_test.head()

Unnamed: 0,a,b
0,0.27,13.61
1,6.08,21.53
2,13.74,9.23
3,9.7,5.36
4,7.0,12.9


## 2. hypothesis test example (related samples)

Test two related (paired) samples: is the difference between them due to chance?

In [4]:
# Related-samples (paired) t-test of the 'before' column against the 'after' column.
ttest_rel(blood_pressure['before'], blood_pressure['after'])

Ttest_relResult(statistic=27.291841767560236, pvalue=7.303035069608042e-48)

Equivalently, test the related samples with a one-sample test of whether the mean of their differences is zero.

In [5]:
# Per-row change (after - before), stored as a new 'diff' column, then previewed.
# NOTE: this adds the column to blood_pressure in place, so the earlier preview
# of the frame no longer shows all of its columns.
blood_pressure['diff'] = blood_pressure['after'] - blood_pressure['before']
blood_pressure.head()

Unnamed: 0,before,after,diff
0,136.713072,92.432965,-44.280107
1,134.735618,105.022643,-29.712975
2,127.529115,82.242766,-45.286349
3,144.527126,93.607172,-50.919953
4,124.21472,103.212223,-21.002497


In [6]:
# One-sample t-test of the paired differences against a population mean of 0.
# The recorded result has the same magnitude and p-value as the ttest_rel call;
# only the sign of the statistic flips, because 'diff' is after - before while
# ttest_rel was called as (before, after).
ttest_1samp(blood_pressure['diff'], 0)

Ttest_1sampResult(statistic=-27.291841767560236, pvalue=7.303035069608042e-48)

Note: for performing a one-tailed t-test with scipy, see https://stackoverflow.com/questions/15984221/how-to-perform-two-sample-one-tailed-t-test-with-numpy-scipy

## 3. hypothesis test example (independent samples)

assuming equal variances

In [7]:
?ttest_ind

Signature: ttest_ind(a, b, axis=0, equal_var=True, nan_policy='propagate')
Docstring:
Calculate the T-test for the means of *two independent* samples of scores.

This is a two-sided test for the null hypothesis that 2 independent samples
have identical average (expected) values. This test assumes that the
populations have identical variances by default.

Parameters
----------
a, b : array_like
    The arrays must have the same shape, except in the dimension
    corresponding to `axis` (the first, by default).
axis : int or None, optional
    Axis along which to compute test. If None, compute over the whole
    arrays, `a`, and `b`.
equal_var : bool, optional
    If True (default), perform a standard independent 2 sample test
    that assumes equal population var

In [10]:
# Independent two-sample t-test of column 'b' against column 'a',
# assuming equal population variances (equal_var=True).
ttest_ind(ab_test['b'], ab_test['a'], equal_var=True)

Ttest_indResult(statistic=2.637533181209767, pvalue=0.009713140852447347)

assuming unequal variances (Welch's)

In [12]:
import numpy as np

In [13]:
# Independent two-sample t-test without the equal-variance assumption (equal_var=False),
# here between a four-element toy array and column 'b'.
# NOTE(review): the equal-variance cell compares 'b' with 'a'; if the same comparison
# was intended here, the first argument should presumably be ab_test['a'] -- confirm.
ttest_ind(np.array([1, 2, 3, 4]), ab_test['b'], equal_var=False)

Ttest_indResult(statistic=-8.880552212361339, pvalue=1.0319889242001474e-08)

## 4. chi squared contingency test

**TODO: if there is enough time, perform a chi-squared test on the renfe dataset.**

In [14]:
from scipy.stats import chi2_contingency

In [15]:
?chi2_contingency

Signature: chi2_contingency(observed, correction=True, lambda_=None)
Docstring:
Chi-square test of independence of variables in a contingency table.

This function computes the chi-square statistic and p-value for the
hypothesis test of independence of the observed frequencies in the
contingency table [1]_ `observed`.  The expected frequencies are computed
based on the marginal sums under the assumption of independence; see
`scipy.stats.contingency.expected_freq`.  The number of degrees of
freedom is (expressed using numpy functions and attributes)::

    dof = observed.size - sum(observed.shape) + observed.ndim - 1


Parameters
----------
observed : array_like
    The contingency table. The table contains the observed frequencies
    (i.e. number of occurrences) in each category.  In the two-dimensional
    case, the ta

In [16]:
# Load the renfe dataset.
# NOTE(review): this path goes through '../hypothesis_testing/' whereas the other
# datasets are read from './data/' -- confirm the notebook's working directory.
renfe = pd.read_csv('../hypothesis_testing/data/renfe.csv')

In [17]:
# List the column names available in the renfe frame.
renfe.columns

Index(['insert_date', 'origin', 'destination', 'start_date', 'end_date',
       'train_type', 'price', 'train_class', 'fare'],
      dtype='object')

In [19]:
# Contingency table: number of rows for each (train_class, fare) combination.
pd.crosstab(renfe['train_class'], renfe['fare'])

fare,4x100,Adulto ida,COD.PROMOCIONAL,Doble Familiar-Flexible,Flexible,Grupos Ida,Individual-Flexible,Mesa,Promo,Promo +
train_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Cama G. Clase,0,0,0,31,0,0,214,0,0,0
Cama Turista,0,0,0,0,6137,0,0,0,0,0
Preferente,0,0,44,0,86889,1,0,61,658880,15
PreferenteSólo plaza H,0,0,0,0,5242,0,0,0,1497,0
Turista,1,479771,3216,0,1282739,20,0,33,4052436,55
Turista Plus,0,0,29,0,13601,0,0,66,526004,1
Turista PlusSólo plaza H,0,0,0,0,22,0,0,0,0,0
Turista con enlace,0,0,0,0,203120,0,0,0,83209,187370
TuristaSólo plaza H,0,0,107,0,53316,0,0,0,537,0


In [43]:
# Chi-squared test of independence on a hard-coded 2x2 table of observed counts,
# run with correction=False. The four returned values are unpacked into the
# statistic, the p-value, the degrees of freedom and the expected frequencies.
# NOTE(review): the notebook does not show where these four counts come from.
chi2, p, dof, ex = chi2_contingency(
    observed=np.array([[4, 61], [108, 627]]),
    correction=False,
)

In [46]:
# Degrees of freedom reported by chi2_contingency for the 2x2 table.
dof

1

In [None]:
# Fourth value returned by chi2_contingency (the expected frequencies, per scipy's docs).
# NOTE(review): this cell has no execution count or recorded output in the saved notebook.
ex

In [20]:
def x():
    """Return the fixed triple (1, 2, 3); used to demonstrate tuple unpacking."""
    first, second, third = 1, 2, 3
    return (first, second, third)

In [21]:
# Unpack the three values returned by x() into separate names.
a, b, c = x()

In [22]:
# First value unpacked from x().
a

1

In [23]:
# Second value unpacked from x().
b

2

In [24]:
# Third value unpacked from x().
c

3