### Importing libraries

In [2]:
import pandas as pd
import numpy as np
from math import ceil
from sklearn.model_selection import train_test_split
from scipy import stats

### Sampling

#### Random Sampling
In this sampling strategy, all population elements have the same probability to be chosen. 

In [6]:
df = pd.read_csv('../data/iris.csv')

In [7]:
df.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [8]:
df.shape

(150, 5)

In [9]:
# definition of the random seed for generating random numbers
# this seed warrants the reproduction of the same results in the next execution
np.random.seed(123)

In [13]:
# generation of 150 random numbers (0 or 1) with replacement and equal probabilities
sample = np.random.choice(a=[0, 1], size=150, replace=True, p=[0.7, 0.3])

In [14]:
len(sample)

150

In [15]:
len(sample[sample==0]) # ~70% dos valores

104

In [16]:
len(sample[sample==1]) # ~30% dos valores

46

In [17]:
df_final = df.loc[sample==0]

In [18]:
df_final.shape

(104, 5)

#### Systematic Sampling
In this sampling strategy, one element is selected at each K elements.

In [19]:
# definition of K
population = 150
sample = 15

K = ceil(population/sample)
K

10

In [20]:
# generation of 1 random number between 1 and K+1
r = np.random.randint(low=1, high=K+1, size=1)
r

array([2])

In [21]:
# systematic selection of elements through the previous defined K
accumulator = r[0]
selected = []

for i in range(sample):
    selected.append(accumulator)
    accumulator += K

selected

[2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 102, 112, 122, 132, 142]

In [23]:
len(selected)

15

In [24]:
dataset = pd.read_csv('../data/iris.csv')
df_final = dataset.loc[selected]
df_final.shape

(15, 5)

In [25]:
df_final.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
2,4.7,3.2,1.3,0.2,Iris-setosa
12,4.8,3.0,1.4,0.1,Iris-setosa
22,4.6,3.6,1.0,0.2,Iris-setosa
32,5.2,4.1,1.5,0.1,Iris-setosa
42,4.4,3.2,1.3,0.2,Iris-setosa


#### Stratified Sampling
In this sampling strategy, the elements of each group or class have the same probability to be chosen. That is, the final sample has the same distribution of elements by groups or classes. 

In [27]:
iris = pd.read_csv('../data/iris.csv')
iris['class'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: class, dtype: int64

In [28]:
# elements selection based on the 'class' feature, where the elements proportion of each class in the population is kept in the sample
X, _, y, _ = train_test_split(iris.drop('class', axis=1), iris['class'], test_size=0.5, stratify=iris['class'])
y.value_counts()

Iris-versicolor    25
Iris-virginica     25
Iris-setosa        25
Name: class, dtype: int64

In [29]:
infert = pd.read_csv('../data/infert.csv')
infert['education'].value_counts()

6-11yrs    120
12+ yrs    116
0-5yrs      12
Name: education, dtype: int64

In [30]:
# elements selection based on the 'class' feature, where the elements proportion of each class in the population is kept in the sample
X, _, y, _ = train_test_split(infert.drop('education', axis=1), infert['education'], test_size=0.6, stratify=infert['education'])
y.value_counts()

6-11yrs    48
12+ yrs    46
0-5yrs      5
Name: education, dtype: int64