In [2]:
import numpy as np
from scipy import stats

# Simple Sampling

Random sampling is randomly selecting samples from the population.

In [3]:
# Let's create a normal population
population_size = 1000000
population_mean = 3 # mean
population_stdev = 2 # standard deviation

# The population, in a numpy array
population_normal = stats.norm.rvs(size=population_size,loc=population_mean,scale=population_stdev)

In [4]:
# Let's see the first 10 of our normal population 
population_normal[:10]

array([ 4.45078761,  1.67864093,  3.23619755,  4.51179708,  2.10890937,
       -0.63747515,  1.01798538,  2.67280378,  4.5510385 , -2.88205865])

In [5]:
# Randomly sampling from our normal population
import random

def random_sample(population, number_to_sample):
    sample = []
    
    # Going to track of who we've sampled already here
    seen_indices = []
    
    for i in range(0, number_to_sample):
        # We are going to use the random library to randomly choose an index in our population
        random_choice_index = random.randint(0, len(population))
        
        # Making sure that we are double counting the same sample we've chosen in the past
        while random_choice_index in seen_indices:
            random_choice_index = random.randint(0, len(population))
        
        # This is how you get the actual ELEMENT in an array, given an index
        random_choice = population[random_choice_index]
        seen_indices.append(random_choice_index)
        
        # add my random choice to my sample
        sample.append(random_choice)
        
    return sample

**If we rerun the next two cells, we should see the sample mean and stdev change A LOT**

In [11]:
# When my sample is small compared to my population

sample_number = 10
sample = random_sample(population_normal, sample_number)

print(sample)

[1.740250593917616, 5.115930930372182, 4.661753678612627, 3.978086682691522, 2.203407493316655, 3.946291950486846, 1.9869865578636074, 2.20095238405269, 5.118440393280183, 1.1901193724594847]


In [12]:
import statistics # yet another library!

# using f strings to print out things
## You can use the backslash at the end of string to put an arbitrary enter and break up the string
###  it doesn't change the string, but it makes life easier to read for you the Programmer
print(f"The mean of this sample is: {statistics.mean(sample)} \
        but our population mean is {population_mean}.")

print(f"The standard deviation of this sample is: {statistics.stdev(sample)} \
        but our population mean is {population_stdev}.")

The mean of this sample is: 3.2142220037053413         but our population mean is 3.
The standard deviation of this sample is: 1.5011647121242706         but our population mean is 2.


In [13]:
# Also did you know there was a function provided by random that randomly samples?

## But for this function to work, the population_normal object must be converted from a np.array to list
sample_10_from_random_lib = random.sample(population_normal.tolist(), sample_number)

In [14]:
sample_10_from_random_lib

[3.605911034456616,
 -2.364900398156588,
 2.4227537731170528,
 6.50665241052579,
 3.4753200191547506,
 0.20313627227882325,
 4.387064076629358,
 2.692613071008091,
 3.010796904323776,
 3.226877621933227]

# Systemic Sampling

Dark and famous examples Systemic Sampling (Murder, Mayhem Oh my!)

In [15]:
# let's write a function that takes the Kth sample in a population for our dataset

def system_sampling(population, k):
    
    # in this list comprehension, we are using python List Comprehensions and the property of range()
    # range takes in the start integer, the end integer, and an optional 3rd parameter which is
    ### how big of an interval / difference between the numbers it spits out in order?  Default is 1
    
    #Wasn't this code so much easier to read???? 
    return [population[i] for i in range(0, len(population), k)]

In [16]:
decimate = system_sampling(population_normal, 10)

In [17]:
# This is a test proving that our sampling technique gave us a sample that is of size
# the population / k
assert len(decimate) == len(population_normal) // 10

# Stratified Random Sampling

In [19]:
# We need to define classes of some sort for stratified random sampling.  
## To make life easy, the class can be based off the values.  We will use BINS as our classes

## There are plenty of ways to randomly make a class, like randomly assigning a class to each datapoint.

# this // symbol is doing floor division.  I'm using to convert a decimal into an integer
def make_classes(population, num_classes):
    largest_population_val_int = max(population) // 1 + 1 # adding 1 to make sure our bins are larger
    min_population_val_int = min(population) // 1 - 1 # subtracting to make sure our bins are larger
    
    bin_width = (largest_population_val_int - min_population_val_int) // num_classes
    
    return [min_population_val_int + i * bin_width for i in range(0, num_classes)]

In [20]:
# let's start with 5 classes 
my_population_classes = make_classes(population_normal, 5)

In [21]:
# The value in the list is the lower bound of the bin 
my_population_classes

[-8.0, -4.0, 0.0, 4.0, 8.0]

** It doesn't matter how many times I run the above two lines of code, the output will never change**

Try it

In [22]:
# Let's reformat our population into a pandas dataframe,
# because it'll count categorical data for us out of the box

import pandas as pd

population_normal_df = pd.DataFrame(columns = ["value", "class"])
population_normal_df["value"] = population_normal

# convert the value into a class
def get_class_from_value(value, classes_array):
    for i in range(1, len(classes_array)):
        
        if value < classes_array[i]:
            return classes_array[i - 1]

In [23]:
population_normal_df["class"] = [get_class_from_value(val, my_population_classes) 
                                 for val in population_normal]

In [24]:
print(my_population_classes)
population_normal_df.head()

[-8.0, -4.0, 0.0, 4.0, 8.0]


Unnamed: 0,value,class
0,4.450788,4.0
1,1.678641,0.0
2,3.236198,0.0
3,4.511797,4.0
4,2.108909,0.0


In [25]:
# Let's look at the class distributions
# This is what we want to maintain!!!
def _create_class_dist(population_df, class_column):
    return population_df[class_column].value_counts() / len(population_normal)

_create_class_dist(population_normal_df, "class")

 0.0    0.624263
 4.0    0.302932
-4.0    0.066403
-8.0    0.000247
Name: class, dtype: float64

In [27]:
population_normal_df.index

RangeIndex(start=0, stop=1000000, step=1)

In [28]:
# Now we can finally get our stratified sample.  WOW that was a lot of coding!

def _per_class_target_sample_n(target_class, sample_n, class_distribution_np):
    return sample_n * class_distribution_np[target_class] // 1

def stratefied_sampling(population, sample_n):
    sample = []
    
    class_distribution_np = _create_class_dist(population, "class") 
    
    # get the exact number of each class we need for N samples
    target_class_sample_n = class_distribution_np * sample_n // 1
    
    for pop_class in class_distribution_np.index:
        # This is how you sort a pandas df to get ONLY THE ROWS where column "class" == popclass
        ## The sql equivalent is: SELECT * FROM population WHERE class EQUALS pop_class
        
        this_class_df = population.loc[population["class"] == pop_class]
        
        # Doing some class changing to make libraries happy.  This is life
        target_class_sample_n_int = int(target_class_sample_n[pop_class])
        
        if target_class_sample_n_int > 0:
            this_class_values_list = this_class_df["value"].tolist()
            
            class_random_sample = random.sample(this_class_values_list, 
                                                target_class_sample_n_int)
        
            sample = sample + class_random_sample
    
    return sample
    

In [31]:
my_sample = stratefied_sampling(population_normal_df, 100)
my_sample[:10]

[3.8434599462363606,
 3.696462096797834,
 2.40232273759965,
 1.8542682035923883,
 0.4453812663119603,
 3.2745024638846876,
 1.7811975967259954,
 3.9448834321589494,
 0.8933681959633071,
 3.48581511658956]

In [32]:
_create_class_dist(my_sample, my_population_classes)

TypeError: list indices must be integers or slices, not list

## Making a stratified sample of 10

Notice that the final sample only has 9 samples in it!  What gives!!

This has to do with the function we wrote and the small sample we took when related to the number of classes we made.  Some of the classes are going to have smaller probabilities and at a small n, those probabilities are close to zero.

In the code, we are using floor division ruthlessly to make various libraries happy (after all you can't pick 6.5 samples from a population!).  

In the end, the usage of floor division, which will ALWAYS round down, means that cumultively we lose a sample when n is small.  

In [33]:
sample_10 = stratefied_sampling(population_normal_df, 10)

In [34]:
print(sample_10)
print(len(sample_10))

[3.2935651663257133, 3.3571521758088116, 0.9098939972352134, 0.45990637793764133, 3.5800076552583944, 3.0016606831745545, 4.454481089054768, 4.131116834703995, 4.296771894418211]
9
