# Notes for Think Stats by Allen B. Downey

In [1]:
from typing import List

import numpy as np
import pandas as pd
import scipy

## Chapter 01

### Glossary

- anecdotal evidence - evidence based on personal experience rather than on a well-designed and scrupulous study.
- cross-sectional study - a study that collects data about a population at a particular point in time.
- longitudinal study - a study that follows the same group repeatedly and collects data over time.

## Chapter 02

#### Mean - central tendency

$$ \overline{x} = \frac{1}{n} \sum_i x_i \ $$

In [2]:
sample = [1, 3, 5, 6]

In [3]:
np.mean(sample)

3.75

In [4]:
pd.DataFrame(sample).mean()

0    3.75
dtype: float64

#### Variance

$$ S^2 = \frac{1}{n} \sum_i (x_i - \overline{x})^2 $$

In [5]:
np.var(sample)

3.6875

In [6]:
# Warning! Pandas normalizes the variance by N-1 by default (ddof=1).
# Passing ddof=0 (delta degrees of freedom) normalizes by N instead,
# which matches the formula above and the np.var result.
pd.DataFrame(sample).var(ddof = 0)

0    3.6875
dtype: float64

#### Standard Deviation

$$ \sigma = \sqrt{S^{2}} $$ 

In [7]:
np.std(sample)

1.920286436967152

In [8]:
# Warning! Pandas computes the standard deviation from the variance
# normalized by N-1 by default (ddof=1).
# Passing ddof=0 (delta degrees of freedom) normalizes by N instead,
# which matches the np.std result.
pd.DataFrame(sample).std(ddof = 0)

0    1.920286
dtype: float64

#### Effect size - Cohen's d

Having groups **G1** and **G2**, with number of elements given as **N1** and **N2**, the effect size is given as:

$$ \text{Cohen's } d = \frac{\overline{G1} - \overline{G2}}{\sqrt{(\sigma^2 (G1) \cdot (N1-1) + \sigma^2 (G2) \cdot (N2-1)) / ((N1-1) + (N2-1))}} $$

In [9]:
def effect_size(g1: pd.DataFrame, g2: pd.DataFrame) -> float:
    """Compute Cohen's d, the difference of group means in pooled-std units.

    The pooled variance weights each group's sample variance (ddof=1)
    by its degrees of freedom, ``len(group) - 1``.

    Args:
        g1: Observations of the first group.
        g2: Observations of the second group.

    Returns:
        ``(mean(g1) - mean(g2)) / sqrt(pooled variance)``. Because the
        pandas reductions are applied as-is, DataFrame inputs produce one
        value per column (a Series) rather than a bare scalar.
    """
    dof_g1 = len(g1) - 1
    dof_g2 = len(g2) - 1

    # Sum of the per-group variances, each weighted by its degrees of freedom.
    weighted_var_sum = g1.var(ddof=1) * dof_g1 + g2.var(ddof=1) * dof_g2
    pooled_std = np.sqrt(weighted_var_sum / (dof_g1 + dof_g2))

    return (g1.mean() - g2.mean()) / pooled_std

Note: the group variances used here are calculated with delta degrees of freedom (ddof) = 1, i.e. normalized by N-1!

In [10]:
effect_size(pd.DataFrame([1, 2, 3, 4]), pd.DataFrame([3, 3, 1, 2]))

0    0.219971
dtype: float64

## Chapter 03

#### Probability Mass Function

A probability mass function (PMF) maps each value to its probability.
The probabilities of all values in a PMF always add up to one.

In [11]:
s = pd.Series([1, 2, 3, 4, 2])

In [12]:
def pmf(series: pd.Series) -> pd.Series:
    """Build the probability mass function of ``series``.

    Args:
        series: Observed values.

    Returns:
        A Series indexed by the distinct values in ascending order, where
        each entry is that value's occurrence count divided by
        ``series.count()``.
    """
    frequencies = series.value_counts().sort_index()
    n_observations = series.count()
    return frequencies / n_observations

In [13]:
pmf(s)

1    0.2
2    0.4
3    0.2
4    0.2
dtype: float64

#### DataFrame Indexing

In [14]:
# Draw a 4x2 array of random values to use as example data.
# NOTE(review): no random seed is set, so the values differ on every run.
array = np.random.randn(4, 2)
array

array([[-1.1048087 , -0.36529431],
       [ 0.71452439, -0.61635144],
       [-0.5191532 ,  0.43554197],
       [-0.02195426, -0.44495545]])

In [15]:
df = pd.DataFrame(array)
df

Unnamed: 0,0,1
0,-1.104809,-0.365294
1,0.714524,-0.616351
2,-0.519153,0.435542
3,-0.021954,-0.444955


In [16]:
columns = ['A', 'B']
df = pd.DataFrame(data=array,
                  columns=columns)
df

Unnamed: 0,A,B
0,-1.104809,-0.365294
1,0.714524,-0.616351
2,-0.519153,0.435542
3,-0.021954,-0.444955


In [17]:
index = ['a', 'b', 'c', 'd']
df = pd.DataFrame(data=array,
                  columns=columns,
                  index=index)
df

Unnamed: 0,A,B
a,-1.104809,-0.365294
b,0.714524,-0.616351
c,-0.519153,0.435542
d,-0.021954,-0.444955


In [18]:
df['A']

a   -1.104809
b    0.714524
c   -0.519153
d   -0.021954
Name: A, dtype: float64

In [19]:
df.loc['a']

A   -1.104809
B   -0.365294
Name: a, dtype: float64

In [20]:
df.iloc[0]

A   -1.104809
B   -0.365294
Name: a, dtype: float64

In [21]:
indices = ['a', 'c']
df.loc[indices]

Unnamed: 0,A,B
a,-1.104809,-0.365294
c,-0.519153,0.435542


In [22]:
# Slicing with index labels selects rows; the end label ('c') is included.
df['a':'c']

Unnamed: 0,A,B
a,-1.104809,-0.365294
b,0.714524,-0.616351
c,-0.519153,0.435542


In [23]:
# Slicing with integers selects rows by position; the end position is excluded.
df[0:2]

Unnamed: 0,A,B
a,-1.104809,-0.365294
b,0.714524,-0.616351


In [24]:
# Omitting the start of the slice is equivalent to starting at position 0.
df[:2]

Unnamed: 0,A,B
a,-1.104809,-0.365294
b,0.714524,-0.616351


In [25]:
# Select column 'A' first, then the row labelled 'a', which yields a single scalar.
df['A'].loc['a']

-1.1048087010998437