# Chapter 1

- population = whole dataset
- sample = subset of population
    - sampling with replacement (each draw is an independent event): `df["col"].sample(5, replace = True)`
    - sampling without replacement (each draw is a dependent event): `df["col"].sample(5, replace = False)`

### Random numbers

```
# Generate 10 uniform random values between 0 and 5
# (the positional arguments are loc and scale: values fall in [loc, loc + scale])
from scipy.stats import uniform
uniform.rvs(0, 5, size=10)

# Generate binomial random values: binom.rvs(n_trials, p_success, size=n_repeats)
from scipy.stats import binom
binom.rvs(1, 0.5, size=8) # 1 coin, flip 8 times, probability of success 50%
binom.rvs(8, 0.5, size=1) # 8 coins, flip 1 time, probability of success 50%
binom.rvs(3, 0.5, size=10) # 3 coins, flip 10 times, probability of success 50%

# Generate 10 random normal values with mean 161 and std of 7
from scipy.stats import norm
norm.rvs(161, 7, size=10)

# Generate 10 random poisson values with lambda of 8
from scipy.stats import poisson
poisson.rvs(8, size=10)

# Generate 10 random values from a t-distribution with 5 degrees of freedom
from scipy.stats import t
t.rvs(df=5, size=10)

# Generate 10 random values from a log-normal distribution with shape s=0.8 and scale=1.5.
# In scipy, s is the standard deviation of the underlying normal and
# scale = exp(mean of the underlying normal), so scale=1.5 is the MEDIAN of the
# log-normal, not its mean. Use scale=np.exp(1.5) if the underlying normal
# should have mean 1.5.
from scipy.stats import lognorm
lognorm.rvs(s=0.8, scale=1.5, size=10)

#### Alternatives ###########
import numpy as np
np.random.seed(42)  # fix the seed so the draws below are reproducible
random_beta = np.random.beta(a=2, b=2, size=5000)
# NOTE(review): size=2 here versus 5000 for the other draws - confirm this is intended
random_normal = np.random.normal(loc=2, scale=1.5, size=2)
random_uniform = np.random.uniform(low=-3, high=3, size=5000)

# Visualize the uniform draws (assumes matplotlib.pyplot has been imported as plt)
# Bin edges run from -3 to 3 in steps of 0.25; the stop of 3.1 keeps the edge at 3
plt.hist(random_uniform, bins=np.arange(-3, 3.1, 0.25))
plt.show()
```

# Chapter 2

- Systematic sampling : Sampling by taking every n-th element in a shuffled dataset
- Simple random sampling : Sampling by taking purely random rows in a dataset
- Stratified sampling : sampling by taking the proportions of subgroups into account
- Weighted sampling : sampling by adding weights to subgroups to adjust relative probability of a row being sampled
- Cluster sampling : First randomly pick subgroups of the dataset, then randomly sample rows from those subgroups

### Sampling

```
# NOTE: these snippets assume `df` is a pandas DataFrame, numpy is imported as np
# and matplotlib.pyplot is imported as plt.

# Visualize the distribution of one column in a random sample of 10 rows
df_sample = df.sample(n=10)
df_sample["col"].hist(bins=np.arange(59, 93, 2))
plt.show()

# Sampling with replacement (a row can be drawn again, so draws are independent events)
df["col"].sample(5, replace=True)
# Sampling without replacement (each draw changes what is left, so draws are dependent events)
df["col"].sample(5, replace=False)

# Simple random sampling
simple_sample = df.sample(n=5, random_state=42)

# Systematic sampling: take every `interval`-th row of the shuffled data
sample_size = 5
pop_size = len(df)
interval = pop_size // sample_size
# Shuffle first so that any ordering in the original rows cannot bias the sample
shuffled_df = df.sample(frac=1)
# The first reset_index drops the old labels; the second one stores the new
# 0..n-1 row position as a regular column (named "index" by default)
shuffled_df = shuffled_df.reset_index(drop=True).reset_index()
# Stop at interval * sample_size so exactly sample_size rows are returned even
# when pop_size is not a multiple of sample_size
systematic_sample = shuffled_df.iloc[:interval * sample_size:interval]

# Stratified sampling
# Proportional: the same fraction from every group keeps the subgroup proportions
prop_stratified_sample = df.groupby("cat_col").sample(frac=0.1, random_state=42)
# Equal counts: the same number of rows from every group
equal_stratified_sample = df.groupby("cat_col").sample(n=15, random_state=42)

# Weighted sampling: rows where cat_col == "Val" get weight 2, all others weight 1
condition = df['cat_col'] == "Val"
df['weight'] = np.where(condition, 2, 1)
weighted_sample = df.sample(frac=0.1, weights="weight")

# Cluster sampling: randomly pick 3 categories, then sample 5 rows from each
import random
category_list = list(df['cat_col'].unique())
random_categories = random.sample(category_list, k=3)
subset_rows = df['cat_col'].isin(random_categories)
# .copy() makes subset_df an independent frame, so the column assignment below
# does not write into a slice of df (avoids pandas' SettingWithCopyWarning)
subset_df = df[subset_rows].copy()
# Drop the categories that were not picked so groupby only sees the chosen ones
# (the .cat accessor requires cat_col to have a categorical dtype)
subset_df['cat_col'] = subset_df['cat_col'].cat.remove_unused_categories()
sample_cluster = subset_df.groupby("cat_col").sample(n=5, random_state=42)

# Visualize the shuffled data: row position against value should look like
# white noise, i.e. no pattern is left for systematic sampling to pick up
shuffled_df.plot(x="index", y="col", kind="scatter")
plt.show()
```