In [3]:
import pandas as pd
import numpy as np
from itertools import combinations
from statsmodels.formula.api import ols

In [4]:
# Fix the global NumPy RNG so the random frame and the random assignment below
# are the same on every fresh run.
np.random.seed(0)

# Toy data: 10 rows and two columns ('a', 'b') drawn with np.random.rand.
df = pd.DataFrame(data=np.random.rand(10, 2), columns=['a', 'b'])
# One treatment group: 5 distinct index labels of df, sampled without replacement.
assignment = set(np.random.choice(df.index, size=5, replace=False))

In [5]:
def assignment_indices(df, assignments):
    """Build one boolean row mask per treatment group, plus a remainder mask.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are being split into groups.
    assignments : iterable of collections
        Each element holds the index labels that belong to one group.

    Returns
    -------
    list of numpy.ndarray
        One boolean mask of length ``len(df)`` per element of ``assignments``,
        in the same order. If at least one row belongs to no group, a final
        mask selecting exactly those unassigned rows is appended. With no
        assignments at all, the result is a single all-True remainder mask.
    """
    idxs = [df.index.isin(a) for a in assignments]
    if idxs:
        # Union of all group masks. ``np.add(*idxs)`` was only correct for two
        # masks: a third one was taken as the ufunc's positional ``out``
        # argument and silently overwritten, and four or more raised.
        total_assignments = np.logical_or.reduce(idxs)
    else:
        # No groups given: nothing is assigned, every row is "remainder".
        total_assignments = np.zeros(len(df), dtype=bool)
    # ``all()`` (rather than ``min()``) also copes with an empty frame.
    if not total_assignments.all():
        idxs.append(~total_assignments)
    return idxs

# Example with two groups (the random `assignment` and rows 2 and 3); rows in
# neither group come back as a third, remainder mask.
assignment_indices(df, [assignment, [2, 3]])

[array([False, False, False, False,  True,  True,  True,  True, False,
         True]),
 array([False, False,  True,  True, False, False, False, False, False,
        False]),
 array([ True,  True, False, False, False, False, False, False,  True,
        False])]

In [6]:
def mahalanobis(df, assignments, agg_assign=np.max):
    """Aggregate the pairwise distances between group mean vectors.

    For every pair of groups returned by ``assignment_indices`` the quadratic
    form ``d @ inv(cov) @ d`` is evaluated, where ``d`` is the difference of
    the two group mean vectors and ``cov`` is the covariance of the whole
    frame (no square root is taken).

    Parameters
    ----------
    df : pandas.DataFrame
        Covariates; all columns enter the means and the covariance.
    assignments : list of collections
        Index labels per group, forwarded to ``assignment_indices``.
    agg_assign : callable, default ``np.max``
        Receives the list of pairwise values and returns the final result.

    Returns
    -------
    Whatever ``agg_assign`` returns for the list of pairwise values.
    """
    group_means = []
    for mask in assignment_indices(df, assignments):
        group_means.append(df.loc[mask].mean())

    precision = np.linalg.inv(df.cov())

    distances = []
    for left, right in combinations(group_means, 2):
        delta = left - right
        distances.append(delta @ precision @ delta)
    return agg_assign(distances)

In [7]:
# Identity aggregator: show every pairwise distance instead of only the maximum.
mahalanobis(df, [assignment, [2, 3]], agg_assign=lambda x: x)

[1.1719512986317402, 0.999298988844019, 0.08496754158529106]

In [8]:
def ols_on_treatment(col, df, assignments):
    """Regress one column on an intercept and one dummy per treatment group.

    Parameters
    ----------
    col : str
        Name of the column of ``df`` used as the dependent variable.
    df : pandas.DataFrame
        Data; its index labels are what ``assignments`` refer to.
    assignments : iterable of collections
        Index labels per group. Group ``i`` becomes the boolean regressor
        ``t{i}``.

    Returns
    -------
    The fitted result of ``ols(formula, data=...).fit()``.
    """
    # Build the dummies on df's own index. With the default RangeIndex,
    # pd.concat(axis=1) would align on labels and, for any frame whose index
    # is not 0..n-1, produce extra rows filled with NaN.
    treatment_dummies = pd.DataFrame(
        {'t{}'.format(i): df.index.isin(assignment)
         for i, assignment in enumerate(assignments)},
        index=df.index,
    )
    data = pd.concat((df, treatment_dummies), axis=1)
    # Joining with the leading '1' keeps the formula valid ('col ~ 1') even
    # when there are no treatment groups.
    rhs = ' + '.join(['1'] + list(treatment_dummies.columns))
    return ols('{} ~ {}'.format(col, rhs), data=data).fit()


def pvalues(df, assignment):
    """Collect, per column of ``df``, the p-values of the treatment regressors.

    Each column is regressed via ``ols_on_treatment``; the first entry of the
    fitted ``pvalues`` (presumably the intercept, given the ``1 + ...``
    formula) is dropped and the rest is kept as a NumPy array.

    Parameters
    ----------
    df : pandas.DataFrame
        Every column is used in turn as the dependent variable.
    assignment : list of collections
        Treatment groups, forwarded unchanged to ``ols_on_treatment``.

    Returns
    -------
    dict
        Maps column name to the array of remaining p-values.
    """
    by_column = {}
    for column in df.columns:
        fitted = ols_on_treatment(column, df, assignment)
        by_column[column] = fitted.pvalues.iloc[1:].values
    return by_column

In [9]:
# Tabulate the treatment p-values: one column per column of df, one row per
# treatment dummy.
pv = pd.DataFrame(pvalues(df, [assignment, [2, 3]]))
pv

Unnamed: 0,a,b
0,0.320395,0.523023
1,0.892326,0.790063


In [27]:
# NOTE(review): scratch experiment (first value of a dict); nothing else in the
# visible notebook uses `d`, and the execution count is out of sequence.
# Candidate for removal — confirm no later cell depends on it.
d = {'a':1, 'b':2}
next(iter(d.values()))

1

In [14]:
def count_by_col(col, df, assignments):
    """Count how often each distinct value of ``df[col]`` occurs in each group.

    Parameters
    ----------
    col : str
        Column whose distinct values are counted.
    df : pandas.DataFrame
        Data containing ``col``.
    assignments : list of collections
        Treatment groups, forwarded to ``assignment_indices``.

    Returns
    -------
    pandas.DataFrame
        One row per mask returned by ``assignment_indices`` (labelled ``t0``,
        ``t1``, ...) and one column per distinct value of ``df[col]`` in
        sorted order; cells hold the number of matching rows.
    """
    categories = sorted(set(df[col]))
    masks = assignment_indices(df, assignments)

    rows = []
    for mask in masks:
        in_group = df[col].loc[mask]
        rows.append([sum(in_group == category) for category in categories])

    labels = ['t{}'.format(position) for position, _ in enumerate(masks)]
    return pd.DataFrame(data=rows, columns=categories, index=labels)
    

def relative_block_balance(df_count, agg_assignments=lambda x: np.max(np.abs(x),axis=0)):
    """Summarise how far each group's counts deviate from the column medians.

    Parameters
    ----------
    df_count : pandas.DataFrame
        Count table with one row per group and one column per category.
    agg_assignments : callable
        Applied to the frame of relative deviations. The default takes the
        largest absolute deviation down each column.

    Returns
    -------
    Whatever ``agg_assignments`` returns for the relative deviations
    ``(count - median) / median``, where the median is taken per column.
    """
    column_median = df_count.median()
    deviation = df_count.sub(column_median).div(column_median)
    return agg_assignments(deviation)

# Bin column 'a' into three levels: 0.0 for a < .3, 1.0 for .3 <= a <= .6
# (`between` includes both ends by default) and 2.0 for a > .6.
# NOTE(review): this adds 'cat' to the shared `df` in place, so cells that use
# every column of `df` (mahalanobis, pvalues) will also pick up 'cat' if they
# are re-run after this cell.
df['cat'] = 0. * (df['a'] < .3) + 1. * df['a'].between(.3, .6) + 2. * (df['a'] > .6)
# Counts of each level in the random assignment (t0) and in the remaining rows (t1).
print(count_by_col('cat', df, [assignment]), '\n')
# Per level, the largest relative deviation of a group's count from the median count.
print(relative_block_balance(count_by_col('cat', df, [assignment])))

    0.0  1.0  2.0
t0    1    1    3
t1    1    3    1 

0.0    0.0
1.0    0.5
2.0    0.5
dtype: float64
