In [1]:
import numpy as np

## Create random data with categorical distribution, then resample according to a constraint on the categories

In [191]:
from numpy.random import randint, random, random_sample

In [47]:
binary_random_dic = {i:randint(2) for i in range(1000)}

In [118]:
def balance(dic):
    return np.sum(np.array(list(dic.values())))/(len(dic) + 0.000001)

In [119]:
balance(binary_random_dic)

0.491999999508

In [148]:
# constrained stochastic iterative resampling, just indices = selection:
resampled_dic = {}
constraint = 0.2
for idx, val in binary_random_dic.items():
    bl = balance(resampled_dic)
    if bl > constraint and random(1)[0]>0.2:
        if val==0:
            resampled_dic[idx]=val
    else:
        if val==1:
            resampled_dic[idx]=val

In [285]:
resampled_dic_method_2 = {}
constraint = 0.2
# as subgroups 1 and 0 are independent we can subsample each subgroup of data
rands = np.random.random_sample((len(binary_random_dic),))
for idx, val in binary_random_dic.items():
    
    if val == 1:
        # subgroup 2 - accept with probability x*balance = 0.2 => x = 0.4
        if rands[idx]<=(balance(binary_random_dic) - constraint):
            resampled_dic_method_2[idx]=val
    else:
        resampled_dic_method_2[idx]=val
display(balance(resampled_dic_method_2))

0.21362229069098718

In [254]:
balance(binary_random_dic)

0.491999999508

In [286]:
display(balance(resampled_dic))


0.20362903184752212

In [359]:
display(balance(resampled_dic_method_2))

0.21362229069098718

## Create a more complicated example using categorical string variables

In [360]:
import pandas as pd
df = pd.DataFrame(columns=['gender','age', 'suitability_score'])
# generate multivariate uniform random distribution along age, gender and suitability score
gender_map={0:'m', 1:'f'}
for i in range(1000):
    suitascore = 0.2*random_sample()+0.6
    df = df.append({'gender': gender_map[randint(2)], 'age': 1 + randint(100), 'suitability_score': suitascore}, ignore_index=True)
    
df.head(10)

Unnamed: 0,gender,age,suitability_score
0,m,15,0.689838
1,m,44,0.681925
2,m,54,0.633698
3,m,29,0.748334
4,m,41,0.658691
5,f,15,0.77067
6,f,95,0.632892
7,f,28,0.661164
8,f,18,0.620986
9,f,98,0.625067


In [361]:
gender_reverse = {'m':0, 'f':1}

In [362]:
# now define an optimization function and optimize it stochastically

# we want 100 people in the age group of 30 - 50 (distributional constraint? e.g like uniform or gaussian):
# AND those people should have a suitability score in the range of 0.6-0.7 (uniform)
# AND those people should be 30% woman

# one could plot different dimensions of the target distribution

# define an loss function based on the target distribution which can be optimized with stochastic iterative sampling:

def loss_function(genders, ages, suitascores):
    # based on product when AND condition is used, optimizing the loss iteratively means to select a batch of
    # randomly subsampled datapoints it accepts them with probability p without replacement
    # so the loss is measured on the whole distribution obvisiouly and not on a single datapoint
    
    # genders + m/f, measure similarity
    
    gender_ratio = np.sum([gender_reverse[i] for i in genders])
    gender_loss = 1 - np.log(abs(0.3 - gender_ratio))
    # TODO
    # for the age, penalize 
    # for single datapoints for male female there could be a probabilistic loss
    # for something like age there could be a hard constraint

def loss_single_single_datapoint(gender, age, suitascore, current_gender_ratio):
    
    age_low, age_high = 30, 50
    age_loss = 0.05
    if age_low <= age <= age_high:
        age_loss = 0.95
    
    suita_loss = 0.05
    suita_low, suita_high = 0.6, 0.7
    
    if suita_low <= suitascore <=suita_high:
        suita_loss = 0.95
        
    gender_loss = 0.1
    gender_ratio_target = 0.3
    
    # if the current_gender_ratio is bigger than target have a higher probability of accepting if the gender is male
    # else have a higher probability of accepting the female
    
    if current_gender_ratio > gender_ratio_target + random_sample()*0.1 -0.05:
        if gender=='m':
            gender_loss = 0.9
    else:
        if gender=='f':
            gender_loss = 0.9
        
    
    return gender_loss*age_loss*suita_low

In [366]:
def get_gender_ratio(df):
    return np.sum([gender_reverse[i] for i in df.gender.tolist()])/len(df)
    
df_subsampled = pd.DataFrame(columns=['gender','age', 'suitability_score'])
gr = 0.5
for idx, r in df.iterrows():
    p_acc = loss_single_single_datapoint(r.gender, r.age, r.suitability_score, gr)
    #print(p_acc)
    if p_acc > 0.4:
        df_subsampled = df_subsampled.append(r)
        gr = get_gender_ratio(df_subsampled)
    
df_subsampled.head(10)

Unnamed: 0,gender,age,suitability_score
1,m,44,0.681925
23,f,38,0.656266
51,m,45,0.611501
56,m,34,0.788928
66,f,34,0.623615
70,m,32,0.673853
75,m,47,0.734084
80,m,35,0.733243
85,f,41,0.63236
90,m,35,0.690068


In [367]:
len(df_subsampled)

97

In [368]:
get_gender_ratio(df_subsampled)

0.32989690721649484