In [18]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd

In [19]:
# Synthetic classification data: 500 samples with 4 features each.
# A fixed random_state makes the generated data (and every output derived
# from it) identical on each Restart & Run All.
X, y = datasets.make_classification(n_samples=500, n_features=4, random_state=42)

In [20]:
# Wrap the feature matrix in a DataFrame, giving each of the four features a name.
df = pd.DataFrame(
    data=X,
    columns=['Column_A', 'Column_B', 'Column_C', 'Column_D'],
)

### Simple Random Sampling

###### Say you want to select a subset of a population such that every member of the population has an equal probability of being chosen.

In [21]:
# Simple random sample: 100 rows drawn without replacement, each row equally likely.
# random_state pins the draw so the rows displayed below can be reproduced.
sample_df = df.sample(n=100, random_state=42)

In [22]:
# Display the sampled rows; the index column shows each row's original label in df.
sample_df

Unnamed: 0,Column_A,Column_B,Column_C,Column_D
446,0.131510,-0.135237,-0.301335,0.294642
420,-0.255640,2.112345,0.119149,1.498758
0,-0.326456,2.399035,0.227455,1.579644
436,0.557099,-1.356279,-1.078859,0.370704
329,-1.118798,-0.492959,2.978190,-4.347386
...,...,...,...,...
386,0.780510,-1.379696,-1.642826,1.102342
401,1.456757,-2.835057,-3.000608,1.766248
293,1.095159,-2.131240,-2.255818,1.327935
348,0.206039,-1.039499,-0.263300,-0.465367


### Stratified Sampling


In [23]:
from sklearn.model_selection import train_test_split
# Stratified 75/25 split: stratify=y preserves the class proportions of y in
# both partitions. random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

## Stratifying on y keeps the proportion of class 0 and class 1 (approximately) the same in the train and test data.


### Reservoir Sampling

Let us assume we have to sample 5 objects out of an infinite stream such that each element has an equal probability of getting selected.
To build intuition, consider a stream of only 3 items from which we have to keep 2. We see the first item and hold it in the list, as our reservoir has space. We see the second item and hold it as well, as our reservoir still has space.
We see the third item. Here is where things get interesting: we choose the third item to be in the list with probability 2/3.

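The probability of removing the first item is the probability of the third item getting selected multiplied by the probability of the first item being randomly chosen as the replacement candidate from the 2 elements in the reservoir. That probability is:
$\frac{2}{3} \times \frac{1}{2} = \frac{1}{3}$, so the first item stays in the reservoir with probability $1 - \frac{1}{3} = \frac{2}{3}$.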

The exact same argument applies to the second item, and it extends to streams of any length.
Thus each item has the same probability of being selected: 2/3 here, or in general k/n, where k is the reservoir size and n is the number of items seen so far.


In [47]:
import random

def generator(max_limit):
    """Yield the numbers 1, 2, ..., max_limit one at a time.

    Used to simulate a data stream whose items are produced lazily and can
    only be consumed once.

    Args:
        max_limit: Largest number to yield. With an integer argument this is
            also the number of items produced; values below 1 give an empty
            stream.

    Yields:
        The next number in the sequence, starting at 1.
    """
    num = 1
    while num <= max_limit:
        # Yield before incrementing so the first item (1) is not skipped and
        # exactly max_limit items are produced.
        yield num
        num += 1


## We have a stream of 10000 numbers here
stream = generator(10000)





In [48]:
## Reservoir size: how many elements to keep from the stream
K = 5
reservoir = []

for n_seen, item in enumerate(stream, start=1):
    # Fill phase: the first K items go straight into the reservoir.
    if n_seen <= K:
        reservoir.append(item)
        continue

    # Replacement phase: the n_seen-th item is accepted with probability K / n_seen ...
    if random.random() < K / n_seen:
        # ... and overwrites one of the K slots, each slot being equally likely.
        reservoir[random.randrange(K)] = item

print(reservoir)
        

[141, 553, 7983, 8107, 6492]


### Random Undersampling and Oversampling

All too often we encounter an imbalanced dataset.
A widely adopted technique for dealing with highly imbalanced datasets is called resampling. It consists of removing samples from the majority class (under-sampling) and/or adding more examples from the minority class (over-sampling).

In [49]:
## Create an imbalanced two-class dataset (class weights 0.9 / 0.1)

X, y = datasets.make_classification(
    n_samples=100,
    n_features=20,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.9, 0.1],
    class_sep=1.5,
    random_state=10,
)

In [50]:
# Turn the feature matrix into a DataFrame and attach the labels as a 'target' column.
X = pd.DataFrame(X).assign(target=y)

We can now do random oversampling and undersampling using:

In [51]:
# Count the rows of each class by summing the boolean masks.
class_0 = int((X['target'] == 0).sum())
class_1 = int((X['target'] == 1).sum())

In [52]:
# Show the class sizes as (rows with target == 0, rows with target == 1).
class_0, class_1

(90, 10)

In [64]:
#### Undersample the majority class: keep only as many class-0 rows as there are
#### class-1 rows, then stack them with all class-1 rows.
#### random_state pins which majority rows are kept, so the result is reproducible.
undersample_data = pd.concat([
    X[X['target'] == 0].sample(n=class_1, random_state=42),
    X[X['target'] == 1],
])

In [68]:
#undersample_data

We can now do oversampling: draw from the class with fewer data points, sampling with replacement.

In [72]:
# Oversample the minority class: draw class_0 rows from class 1 with replacement
# (so rows repeat) and stack them with all class-0 rows, giving balanced classes.
# random_state pins the draw so the oversampled frame is reproducible.
oversampled_data = pd.concat([
    X[X['target'] == 0],
    X[X['target'] == 1].sample(n=class_0, replace=True, random_state=42),
])
print(len(oversampled_data))


180


b. Oversampling using SMOTE:
In SMOTE (Synthetic Minority Oversampling Technique) we synthesize elements for the minority class, in the vicinity of already existing elements.