In [398]:
import numpy as np
import pandas as pd

def select_stratified_groups(data, strat_columns, group_size, weights=None, seed=None):
    """Подбирает стратифицированные группы для эксперимента.

    data - pd.DataFrame, датафрейм с описанием объектов, содержит атрибуты для стратификации.
    strat_columns - List[str], список названий столбцов, по которым нужно стратифицировать.
    group_size - int, размеры групп.
    weights - dict, словарь весов страт {strat: weight}, где strat - tuple значений элементов страт,
        например, для strat_columns=['os', 'gender', 'birth_year'] будет ('ios', 'man', 1992).
        Если None, определить веса пропорционально доле страт в датафрейме data.
    seed - int, исходное состояние генератора случайных чисел для воспроизводимости
        результатов. Если None, то состояние генератора не устанавливается.

    return (data_pilot, data_control) - два датафрейма того же формата что и data
        c пилотной и контрольной группами.
    """
    # YOUR_CODE_HERE
    result = None
    for strat_column in strat_columns:
        if result is None:
            result = data[strat_column].astype(str)
        else:
            result = result + '_' + data[strat_column].astype(str)
    data['strat_key'] = result

    if weights is None:
        weights = data.strat_key.value_counts(normalize=True).to_dict()
    else: 
        new_weights = {}
        if isinstance(list(weights.keys())[0], tuple):
            for weight in weights:
                key = ''
                for value in weight:
                    key = f'{key}_{value}'
                new_weights[key[1:]] = weights[weight]
        else:
            for weight in weights:
                new_weights[str(weight)] = weights[weight]
        weights = new_weights
    
    weights = dict(sorted(weights.items(), key=lambda item: item[1]))
    
    strat_names = list(weights.keys())
    weight_values = np.array(list(weights.values()))
    print(f'strat_names: {strat_names}')
    print(f'weight_values: {weight_values}')
    #stratum_sizes = (np.ceil(weight_values * group_size) * 2).astype(int)
    stratum_sizes = (np.ceil(weight_values * group_size)).astype(int)
    print(f'stratum_sizes: {stratum_sizes}')
    delta = group_size - np.sum(stratum_sizes)
    
    
    print(f'delta: {delta}')
    extra_deltas = delta * weight_values
    print(f'extra_deltas: {extra_deltas}')
    extra_deltas_copy = extra_deltas.copy()

    for i in range(len(extra_deltas)):
        extra_deltas_copy = extra_deltas.copy()
        extra_deltas_copy[:i] = np.floor(extra_deltas_copy[:i])
        extra_deltas_copy[i:] = np.ceil(extra_deltas_copy[i:])
        print(f'extra_deltas_copy sum: {extra_deltas_copy.sum()}')
        if int(extra_deltas_copy.sum()) == delta:
            break
            
    print(f'stratum_sizes: {stratum_sizes}; extra_deltas_copy: {extra_deltas_copy}')
    stratum_sizes += extra_deltas_copy.astype(int)
    print(f'stratum_sizes new: {stratum_sizes}')
    
#     print(f'delta: {delta}')
#     extra_term = weight_values / np.sum(weight_values)
#     deltas = np.floor(delta * weight_values + extra_term).astype(int)
#     delta -= np.sum(deltas)
#     while delta > 1:
#         print(f'\ndelta: {delta}; deltas: {deltas}')
#         extra_deltas = np.floor(delta * weight_values + extra_term).astype(int)
#         print(f'extra_deltas: {extra_deltas}')
#         deltas += extra_deltas
#         delta -= np.sum(extra_deltas)
#     print(f'Adding {delta} to the final category')
#     deltas[-1] += delta
#     print(f'FINAL: delta: {delta}; deltas: {deltas}; total deltas: {np.sum(deltas)}')
#     print(f'stratum_sizes: old: {stratum_sizes}; new: {stratum_sizes+deltas}')
#     stratum_sizes+=deltas
    
    pilot = pd.DataFrame()
    control = pd.DataFrame()
    sizes_count = 0
    for i, k in enumerate(weights):
        strat_size_both = stratum_sizes[i] * 2#2 * np.ceil(group_size * weights[k]).astype(int)
        sizes_count += strat_size_both
        if i == len(weights)-1 and sizes_count != group_size * 2:
            print(f'Updating original strat_size_both of {strat_size_both}')
            strat_size_both += group_size * 2 - sizes_count
        sub_df = data[data['strat_key'] == k]
        sample = sub_df.sample(n=int(strat_size_both), random_state=seed)
        print(f'sample shape for stratum {k}: {sample.shape}')
        pilot = pd.concat((pilot, sample.iloc[:len(sample) // 2]), ignore_index=True)
        control = pd.concat((control, sample.iloc[len(sample) // 2:]), ignore_index=True)
        
    data.drop(columns=['strat_key'], inplace=True)
    pilot.drop(columns=['strat_key'], inplace=True)
    control.drop(columns=['strat_key'], inplace=True)
    
    return (pilot, control)

In [None]:
    
#     # in data not in pilot
#     pilot_control = pd.concat((pilot, control), axis=0).reset_index(drop=True)
#     print(f'pilot_control shape: {pilot_control.shape}')
#     col_names = ['df1_' + i + '_df2_' + j for i, j in zip(data, pilot_control)]
#     data.columns = col_names
#     pilot_control.columns = col_names
#     data_not_pc = pd.concat([pilot_control, data, data], sort=False).drop_duplicates(keep=False)
#     print(f'data_not_pc shape: {data_not_pc.shape}')

In [80]:
size = 999
strat_columns2 = ['c1', 'c2']
df2 = pd.DataFrame({'c0': np.random.choice(['q', 'w'], replace=True, size=size), 
    'c1': np.random.choice(['a', 'b'], replace=True, size=size), 
                   'c2': np.random.choice(['d', 'e'], replace=True, size=size),
                   'c3': np.random.choice(['r', 't', 'y'], replace=True, size=size)
                  })
df2

Unnamed: 0,c0,c1,c2,c3
0,w,b,e,r
1,w,b,d,y
2,w,a,e,y
3,w,a,e,r
4,w,b,d,y
...,...,...,...,...
994,w,b,d,t
995,w,b,d,t
996,w,b,d,y
997,q,a,d,r


In [118]:
df2['c0'].value_counts() / df2['c0'].count()

a    0.547445
b    0.452555
Name: c0, dtype: float64

In [119]:
df2['c0'].value_counts(normalize=True)

a    0.547445
b    0.452555
Name: c0, dtype: float64

In [161]:
df2.astype(df2.dtypes.to_dict())

Unnamed: 0,c0,c1,c11,c2,5
0,b,2,0,e,t
1,a,2,1,e,t
2,a,1,2,e,t
3,a,2,3,e,t
4,b,2,4,e,r
...,...,...,...,...,...
994,b,1,994,e,r
995,b,1,995,e,t
996,b,2,996,d,t
997,a,1,997,e,t


In [324]:
np.random.seed(None)
size = 10001
strat_columns2 = ['c1', 'c2']
df2 = pd.DataFrame({'c0': np.random.choice(['a', 'b'], replace=True, size=size),
                    'c1': np.random.choice([1,2], replace=True, size=size),
                    'c11': np.arange(size),
                    'c2': np.random.choice(['d', 'e'], replace=True, size=size),
                    5: np.random.choice(['r', 't'], replace=True, size=size)
                    })

df2.loc[np.random.randint(0, len(df2), size=size//10), 'c0'] = 'a'
df2.loc[np.random.randint(0, len(df2), size=size//10), 'c2'] = 'e'

In [345]:
df2.groupby(['c0', 'c2'])['c0'].count() / size

c0  c2
a   d     0.243976
    e     0.303770
b   d     0.203880
    e     0.248375
Name: c0, dtype: float64

In [343]:
# weights = []
# num_weights = 4
# cumsum = 1
# for i in range(num_weights-1):
#     w = round(np.random.uniform(0, cumsum),2)
#     cumsum -= w
#     weights.append(w)
# weights.append(round(1 - np.sum(weights), 2))
# weights = sorted(weights)
# weights = np.array(weights)
# print(weights, np.sum(weights))

In [344]:
# delta = 5

# deltas = np.floor(delta * weights + 0.5).astype(int)
# delta -= np.sum(deltas)
# while delta > 1:
#     print(f'\ndelta: {delta}; deltas: {deltas}')
#     extra_deltas = np.floor(delta * weights + 0.5).astype(int)
#     print(f'extra_deltas: {extra_deltas}')
#     deltas += extra_deltas
#     delta -= np.sum(extra_deltas)
# print(f'Adding {delta} to the final category')
# deltas[-1] += delta
# print(f'FINAL: delta: {delta}; deltas: {deltas}; total deltas: {np.sum(deltas)}')

In [399]:
np.random.seed(1)
size = 100001
strat_columns2 = ['c1', 'c2']
df2 = pd.DataFrame({'c0': np.random.choice(['a', 'b'], replace=True, size=size),
                    'c1': np.random.choice([1,2], replace=True, size=size),
                    'c11': np.arange(size),
                    'c2': np.random.choice(['d', 'e'], replace=True, size=size),
                    5: np.random.choice(['r', 't'], replace=True, size=size)
                    })

df2.loc[np.random.randint(0, len(df2), size=size//10), 'c0'] = 'a'
df2.loc[np.random.randint(0, len(df2), size=size//10), 'c2'] = 'e'


p, c = select_stratified_groups(data=df2, strat_columns=['c0', 'c2'], group_size=300
                                ,weights={('a', 'e'): 0.13, ('a', 'd'): 0.29, ('b', 'e'): 0.22, ('b', 'd'): 0.34}
                                ,seed=1)

# p, c = select_stratified_groups(data=df2.copy(), strat_columns=['c0', 'c2'], group_size=100, 
#                                 weights=None, 
#                                 seed=1)
print(f'p shape: {p.shape}')
print(p.head())
p['strat_key'] = p.c0.astype(str) + '_' + p.c2.astype(str)
p.strat_key.value_counts(normalize=True)

strat_names: ['a_e', 'b_e', 'a_d', 'b_d']
weight_values: [0.13 0.22 0.29 0.34]
stratum_sizes: [ 39  66  87 103]
delta: 5
extra_deltas: [0.65 1.1  1.45 1.7 ]
extra_deltas_copy sum: 7.0
extra_deltas_copy sum: 6.0
extra_deltas_copy sum: 5.0
stratum_sizes: [ 39  66  87 103]; extra_deltas_copy: [0. 1. 2. 2.]
stratum_sizes new: [ 40  67  88 105]
sample shape for stratum a_e: (80, 6)
sample shape for stratum b_e: (134, 6)
sample shape for stratum a_d: (176, 6)
sample shape for stratum b_d: (210, 6)
p shape: (300, 5)
  c0  c1    c11 c2  5
0  a   1  63662  e  r
1  a   1  22738  e  r
2  a   1   8837  e  t
3  a   2  70267  e  r
4  a   2  99309  e  r


b_d    0.350000
a_d    0.293333
b_e    0.223333
a_e    0.133333
Name: strat_key, dtype: float64

In [396]:
2 / 300

0.006666666666666667

In [381]:
group_size=115
stratum_sizes = np.array([[13, 22, 29, 34]])
weights = np.array([0.13, 0.22, 0.29, 0.34])

delta = group_size - stratum_sizes.sum()
print(f'delta: {delta}')
extra_deltas = delta * weights
print(f'extra_deltas: {extra_deltas}')
extra_deltas_copy = extra_deltas.copy()

for i in range(len(extra_deltas)):
    extra_deltas_copy = extra_deltas.copy()
    extra_deltas_copy[:i] = np.floor(extra_deltas_copy[:i])
    extra_deltas_copy[i:] = np.ceil(extra_deltas_copy[i:])
    print(f'extra_deltas_copy sum: {extra_deltas_copy.sum()}')
    if int(extra_deltas_copy.sum()) == delta:
        break

# next_int_distance = np.ceil(extra_deltas_copy) - extra_deltas_copy
# print(f'next_int_distance: {next_int_distance}')

delta: 17
extra_deltas: [2.21 3.74 4.93 5.78]
extra_deltas_copy sum: 18.0
extra_deltas_copy sum: 17.0


In [382]:
extra_deltas_copy

array([2., 4., 5., 6.])

In [366]:
10 * 0.25

2.5

In [350]:
2 / 300

0.006666666666666667

In [337]:
np.sum([ 40,  67,  88 ,105])

300

In [342]:
print(f'c shape: {c.shape}')
print(c.head())
c['strat_key'] = c.c0.astype(str) + '_' + c.c2.astype(str)
c.strat_key.value_counts(normalize=True)

c shape: (300, 5)
  c0  c1   c11 c2  5
0  b   2  9263  d  t
1  b   1  9196  d  r
2  b   1  7625  d  t
3  b   1  1021  d  r
4  b   2  3616  d  t


a_e    0.306667
b_e    0.246667
a_d    0.243333
b_d    0.203333
Name: strat_key, dtype: float64

In [9]:
df2.dtypes

c0     object
c1      int32
c11     int32
c2     object
5      object
dtype: object

In [10]:
p, c = select_stratified_groups(data=df2.copy(), strat_columns=['c1'], group_size=100, 
                                weights={(1): 0.2, (2): 0.8}, 
                                seed=None)

print(p.head())
p['strat_key'] = p.c1.astype(str)# + '_' + p.c2.astype(str)
p.strat_key.value_counts(normalize=True)

sample shape: (40, 6)
sample shape: (160, 6)
  c0  c1  c11 c2  5
0  b   1  498  e  r
1  b   1  800  d  r
2  a   1  565  e  t
3  b   1  567  e  t
4  a   1  592  d  t


2    0.8
1    0.2
Name: strat_key, dtype: float64

In [98]:
p, c = select_stratified_groups(data=df2.copy(), strat_columns=['c2'], group_size=100, 
                                weights={('d'): 0.2, ('e'): 0.8}, 
                                seed=None)

print(p.head())
p['strat_key'] = p.c2.astype(str)# + '_' + p.c2.astype(str)
p.strat_key.value_counts(normalize=True)

weights: {'d': 0.2, 'e': 0.8}
  c0  c1  c11 c2 c3
0  b   1    4  d  r
1  b   2    3  d  r
2  b   2    3  d  r
3  a   2    4  d  r
4  a   1    4  d  r


e    0.8
d    0.2
Name: strat_key, dtype: float64

In [83]:
p

Unnamed: 0,c0,c1,c11,c2,c3
0,a,2,4,e,t
1,a,1,3,e,r
2,a,1,4,e,y
3,a,1,3,e,y
4,a,2,3,e,t
5,a,1,4,e,y
6,a,1,4,e,r
7,a,1,4,e,t
8,a,2,3,e,t
9,a,1,4,e,t


In [85]:
p['strat_key'] = p.c1.astype(str) + '_' + p.c2.astype(str)
p.strat_key.value_counts(normalize=True)

1_d    0.46
2_d    0.24
1_e    0.20
2_e    0.10
Name: strat_key, dtype: float64

In [34]:
p.shape

(50, 5)

In [35]:
c['strat_key'] = c.c1 + '_' + c.c2
c.strat_key.value_counts(normalize=True)

b_d    0.4
a_d    0.3
a_e    0.2
b_e    0.1
Name: strat_key, dtype: float64

In [36]:
c.shape

(50, 5)

In [9]:
qwe

NameError: name 'qwe' is not defined

In [None]:
empirical_weights = {}
for strat_column in strat_columns:
    q = df.groupby(strat_column).agg(cnt=(strat_column, 'count'))
    q = q / q.sum()
    empirical_weights[strat_column] = q.to_dict()['cnt']
empirical_weights

In [None]:
result = None
for strat_column in strat_columns:
    if result is None:
        result = df[strat_column].astype(str)
    else:
        result = result + '_' + df[strat_column].astype(str)
df['strat_key'] = result
df

In [None]:
weights = {('a', 'e'): 0.2, ('a', '')}

In [None]:
df.strat_key.value_counts(normalize=True).to_dict()

In [None]:
weights = {('a', 'e'): 0.2, ('a', 'd'): 0.3, ('b', 'e'): 0.1, ('b', 'd'): 0.4}
#weights
{'_'.join(weight): weights[weight] for weight in weights}

In [None]:
seed=None
group_size = 200
chunks = []
pilot = pd.DataFrame()
control = pd.DataFrame()
for weight in weights:
    strat_key = '_'.join(weight)
    strat_size_both = np.ceil(group_size * weights[weight]).astype(int)
    print(weight, strat_size_both)
    sub_df = df[df.strat_key == strat_key]
    print(sub_df)
    sample = sub_df.sample(n=int(strat_size_both), random_state=seed)
    pilot = pd.concat((pilot, sample.iloc[:len(sample)//2]), ignore_index=True)
    control = pd.concat((control, sample.iloc[len(sample)//2:]), ignore_index=True)

In [None]:
pilot.strat_key.value_counts()

In [None]:
control.strat_key.value_counts()

In [None]:
pilot

In [None]:
control