In [10]:
import random
from stochatreat import stochatreat
import pandas as pd


In [5]:

# Percentile labels (in percent) at which the KPI distribution is cut. Each
# user is later labelled with the first entry whose quantile its KPI value
# does not exceed.
percentiles = [75, 80, 85, 88, 90, 91, 92, 93, 94, 95, 96, 97, 98, 98.5, 99, 99.5, 99.8, 99.9, 99.99, 99.999, 100]

# Category id -> highest percentile label that still belongs to that category.
# The name must be `dict_percentile`: that is the name read on the next
# statement and inside `assign_category`. It used to be misspelled
# `dict_precentile`, which raised a NameError on a fresh kernel.
# Category 6 is the top bucket and its bound is 100, so the bounds are
# monotonic. The former bound of 1 could never match a label (labels start at
# 75), so label 100 reached category 6 only through the fallback in
# `assign_category`; the category of every label in `percentiles` is unchanged.
dict_percentile = {0: 75, 1: 90, 2: 98, 3: 99.5, 4: 99.8, 5: 99.999, 6: 100}
dict_precentile = dict_percentile  # backward-compatible alias for the old misspelled name

# (category, upper bound) pairs in ascending order of bound, so the first
# bound that covers a label is also the tightest one.
sorted_percentile = sorted(dict_percentile.items(), key=lambda x: x[1])


def assign_category(percentile):
    """Return the category id whose percentile bound covers `percentile`.

    Scans `sorted_percentile` (pairs of category id and bound) in list order
    and returns the category of the first pair whose bound is greater than or
    equal to `percentile`. When no bound qualifies, the largest key of
    `dict_percentile` is returned instead.
    """
    not_found = object()  # sentinel, so any category value (even None) can be returned
    first_hit = next(
        (cat for cat, upper_bound in sorted_percentile if percentile <= upper_bound),
        not_found,
    )
    if first_hit is not_found:
        # No bound was high enough: fall back to the highest category id.
        return max(dict_percentile.keys())
    return first_hit

def blocks_randomization(df: pd.DataFrame, id_col: str, stratum_cols: list, ngroups: int = 2, prop=None,
                         seed: int = 42) -> tuple:
    """
    Randomly allocate users to ``ngroups`` groups within blocks (strata).

    Users that share the same values in ``stratum_cols`` form a block, and the
    allocation is delegated to ``stochatreat`` so that it is carried out block
    by block. The input frame is not modified; a copy is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataset of users. Must not already contain the columns
        'group', 'treat' or 'stratum_id'.
    id_col : str
        Column name of the user ids; also the key used to merge the
        allocation back onto ``df``.
    stratum_cols : list
        List of column names to be stratified over.
    ngroups : int
        Number of group variations, default 2.
    prop : array_like of floats in interval (0,1), optional
        Proportions of users in each group, forwarded to ``stochatreat`` as
        ``probs``. By default, each group has the same amount of users.
    seed : int, default 42
        Seed for the random state. The function outputs deterministic results
        if called several times with equal inputs and the same seed.

    Returns
    -------
    df : pd.DataFrame
        Copy of the input with an additional 'group' column holding the group
        variation of each user.
    stats : pd.DataFrame
        Number of users per group (columns) within each stratum (rows).

    Raises
    ------
    AssertionError
        If ``df`` already has a column named 'group', 'treat' or 'stratum_id'.
    """

    # These names are produced below ('treat' and 'stratum_id' by stochatreat,
    # 'group' by the rename); a clash would corrupt the merge, so fail early.
    assert ('group' not in df.columns), "You cannot have 'group' as column name."
    assert ('treat' not in df.columns), "You cannot have 'treat' as column name."
    assert ('stratum_id' not in df.columns), "You cannot have 'stratum_id' as column name."

    # Work on a copy so the caller's frame is left untouched.
    df = pd.DataFrame(df).copy()

    # Allocate users to groups within each stratum. `stochatreat` is the
    # function imported at the top of the notebook
    # (`from stochatreat import stochatreat`), so it is called directly;
    # `stochatreat.stochatreat(...)` would look up an attribute on the function.
    treats = stochatreat(data=df, idx_col=id_col, stratum_cols=stratum_cols, treats=ngroups, probs=prop,
                         random_state=seed, misfit_strategy='stratum')

    # Merge the allocation back, drop stochatreat's stratum id and expose the
    # allocation under the name 'group'.
    df = (
        df.merge(treats, how='left', on=id_col)
        .drop(columns=['stratum_id'])
        .rename(columns={"treat": "group"})
    )

    # Group sizes per stratum: one row per stratum, one column per group.
    stats = df.groupby(stratum_cols)['group'].value_counts().unstack()

    return df, stats

In [11]:

# Load the user list to be randomized (path is relative to the working directory).
df = pd.read_csv("last 30 days users list with segments.csv")

# ------------------------------ parameters ------------------------------
# Column whose distribution defines the percentile blocks.
kpi = "purchase_amount"
# NOTE(review): a new seed is drawn on every run, so the allocation differs
# between runs unless this value is recorded.
seed = random.randint(0, 1000000)

# Column names: the user id column, and the two derived columns added to `df`.
col_id = "user_id"
category = "category"
percentile = "percentile"


# KPI value at each percentile label (labels are in percent, hence the / 100).
quantile_values = df[kpi].quantile([label / 100 for label in percentiles])

def assign_percentile(value):
    """Map a KPI value to the percentile label of the first quantile covering it.

    Iterates over `quantile_values` in order and finds the position of the
    first quantile `q` with `value <= q`; the entry of `percentiles` at that
    position is returned. If no quantile satisfies the comparison, 100 is
    returned.
    """
    position = next(
        (idx for idx, cutoff in enumerate(quantile_values) if value <= cutoff),
        None,
    )
    return 100 if position is None else percentiles[position]

# Label every user with a percentile bucket of the KPI, then collapse the
# buckets into the coarser categories that serve as randomization blocks.
# Both columns are addressed through the column-name variables so the names
# are defined in one place only.
df[percentile] = df[kpi].apply(assign_percentile)
df[category] = df[percentile].apply(assign_category)

# Randomize within each category. The stratum list holds the column-name
# variable `category`; it was previously misspelled `categrory`, an undefined
# name that raised a NameError.
df_blocked, stats = blocks_randomization(df, col_id, [category], seed=seed)


In [12]:
# Display the randomized users together with their assigned `group` column.
df_blocked

Unnamed: 0,user_id,is_vip,loyalty_plan,account_age,purchase_amount,total_sc_bet_amount,percentile,category,group
0,254132,1,6. Diamond,more_than_6,6047.91,221188.65,99.80,4,0
1,6267,1,6. Diamond,more_than_6,13677.44,149318.45,99.90,5,0
2,188515,1,6. Diamond,more_than_6,2596.77,16690.40,99.00,3,0
3,137534,1,6. Diamond,more_than_6,17840.00,238352.90,99.99,5,1
4,353255,1,6. Diamond,more_than_6,55213.53,570701.45,99.99,5,0
...,...,...,...,...,...,...,...,...,...
436060,1111816,0,1. No club,month_0,0.00,0.00,75.00,0,1
436061,1114151,0,1. No club,month_0,0.00,0.00,75.00,0,0
436062,1114450,0,1. No club,month_0,0.00,0.00,75.00,0,0
436063,1112666,0,1. No club,month_0,0.00,0.00,75.00,0,0
