In [26]:
import pandas as pd
import numpy as np
from itertools import combinations
from statsmodels.formula.api import ols

In [3]:
# Fix the global NumPy seed so the random draws below are reproducible.
np.random.seed(0)

# Toy data: 10 rows of uniform [0, 1) samples in two columns, `a` and `b`.
df = pd.DataFrame(data=np.random.rand(10, 2), columns=['a', 'b'])
# Draw 5 distinct row labels (no replacement) to form one group.
assignment = set(np.random.choice(df.index, size=5, replace=False))

In [4]:
# Split the rows: df_a keeps the labels listed in `assignment`, df_b the rest.
df_a = df[df.index.isin(assignment)]
df_b = df[~df.index.isin(assignment)]

In [5]:
# Covariance matrix of the columns, computed over all rows.
df.cov()

Unnamed: 0,a,b
a,0.090525,0.006
b,0.006,0.070385


In [6]:
# Invert the covariance matrix; the shape check confirms it is square (columns x columns).
inverse_cov = np.linalg.inv(df.cov())
inverse_cov.shape

(2, 2)

In [7]:
# Inspect the inverse covariance values.
inverse_cov

array([[11.10942969, -0.94700726],
       [-0.94700726, 14.28826789]])

In [8]:
# Per-column difference between the two groups' means.
mean_diff = df_a.mean() - df_b.mean()
mean_diff

a    0.227918
b   -0.167057
dtype: float64

In [9]:
# Apply the inverse covariance to the mean difference (matrix-vector product).
(inverse_cov @ mean_diff)

array([ 2.69023848, -2.60279472])

In [10]:
import types
import collections
import numbers

In [11]:
# Element-wise sum of the two group frames; the list is unpacked into np.add's two operands.
# NOTE(review): the recorded output carries df_a's row labels, so rows were paired by
# position rather than by label. Newer pandas may align on labels instead -- confirm.
np.add(*[df_a, df_b])

Unnamed: 0,a,b
4,1.512476,1.098631
5,1.394488,1.073778
6,0.991699,1.571491
7,0.508623,0.978902
9,0.798375,1.702632


In [12]:
def assignment_indices(df, assignments):
    """Build one boolean row mask per group, plus a mask for unassigned rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are matched against each group.
    assignments : sequence of collections
        Each element holds the index labels belonging to one group.

    Returns
    -------
    list of numpy.ndarray
        One boolean mask per element of ``assignments``, in the same order.
        If at least one row belongs to none of the groups, a final mask
        selecting those leftover rows is appended.
    """
    idxs = [df.index.isin(a) for a in assignments]
    if idxs:
        # OR all masks together. The previous ``np.add(*idxs)`` was only valid
        # for exactly two masks: a third positional argument is interpreted as
        # ``out`` (overwriting that group's mask) and more arguments raise.
        total_assignments = np.logical_or.reduce(idxs, axis=0)
    else:
        # No groups at all: every row counts as unassigned.
        total_assignments = np.zeros(len(df.index), dtype=bool)
    if not total_assignments.all():
        idxs.append(~total_assignments)
    return idxs

# Sanity check: two explicit groups; the third mask is the appended leftover rows.
assignment_indices(df, [assignment, [2, 3]])

[array([False, False, False, False,  True,  True,  True,  True, False,
         True]),
 array([False, False,  True,  True, False, False, False, False, False,
        False]),
 array([ True,  True, False, False, False, False, False, False,  True,
        False])]

In [13]:
def mahalanobis(df, assignments, agg_assign=np.max):
    """Aggregate the pairwise quadratic forms ``d @ inv(cov) @ d`` between group means.

    For every pair of groups returned by ``assignment_indices`` the difference
    ``d`` of their column means is taken and weighted by the inverse covariance
    of the whole frame. No square root is applied to the resulting values.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are compared across groups.
    assignments : sequence of collections
        Index labels of each group, forwarded to ``assignment_indices``.
    agg_assign : callable, default ``np.max``
        Applied to the list of pairwise values to produce the return value.
    """
    centroids = [df.loc[mask].mean() for mask in assignment_indices(df, assignments)]
    precision = np.linalg.inv(df.cov())

    pairwise = []
    for left, right in combinations(centroids, 2):
        delta = left - right
        pairwise.append(delta @ precision @ delta)
    return agg_assign(pairwise)

In [17]:
# Identity aggregator: show every pairwise value instead of only the maximum.
mahalanobis(df, [assignment, [2, 3]], agg_assign=lambda x: x)

[1.1719512986317402, 0.999298988844019, 0.08496754158529106]

In [111]:
# Regress column `a` on an intercept and a boolean membership indicator `t1`.
model = ols(
    'a ~ 1 + t1',
    data=pd.concat(
        (df, pd.DataFrame(df.index.isin(assignment), columns=['t1'])),
        axis=1,
    ),
)

In [112]:
# Fit the regression specified above.
res = model.fit()

In [113]:
# p-value at position 1, i.e. the term after the intercept (presumably the `t1` indicator).
res.pvalues.iloc[1]

0.804341767046624

In [22]:
# One boolean membership mask per group, keyed t0, t1, ...
{'t{}'.format(i): df.index.isin(group) for i, group in enumerate([assignment, [2, 3]])}

{'t0': array([False, False, False, False,  True,  True,  True,  True, False,
         True]),
 't1': array([False, False,  True,  True, False, False, False, False, False,
        False])}

In [27]:
def ols_on_treatment(col, df, assignments):
    """Fit an OLS of one column on an intercept and per-group membership dummies.

    Parameters
    ----------
    col : str
        Name of the column of ``df`` used as the response; it is inserted
        verbatim into the model formula.
    df : pandas.DataFrame
        Data containing ``col``.
    assignments : sequence of collections
        Index labels of each group; group ``i`` becomes the boolean
        regressor ``t{i}``.

    Returns
    -------
    The fitted results object returned by ``ols(...).fit()``.
    """
    # Give the dummies the same index as ``df``. Without it they carry a fresh
    # 0..n-1 index and ``concat(axis=1)`` joins on labels, which misaligns the
    # rows (and introduces NaNs) whenever ``df`` has a non-default index.
    treatment_dummies = pd.DataFrame(
        {'t{}'.format(i): df.index.isin(group) for i, group in enumerate(assignments)},
        index=df.index,
    )
    data = pd.concat((df, treatment_dummies), axis=1)
    formula = '{} ~ 1 + {}'.format(col, ' + '.join(treatment_dummies.columns))
    return ols(formula, data=data).fit()


def pvalues(df, assignment):
    """Collect, for every column of ``df``, the p-values of the non-intercept terms.

    Each column is regressed with ``ols_on_treatment``; the first p-value
    (position 0) is dropped and the remaining ones are kept as a NumPy array.

    Returns
    -------
    dict
        Maps column name to the array of retained p-values.
    """
    per_column = {}
    for col in df.columns:
        fitted = ols_on_treatment(col, df, assignment)
        per_column[col] = fitted.pvalues.iloc[1:].values
    return per_column

In [31]:
# One column per variable of df; each row holds the p-value of one group dummy.
pv = pd.DataFrame(pvalues(df, [assignment, [2, 3]]))
pv

Unnamed: 0,a,b
0,0.320395,0.523023
1,0.892326,0.790063


In [32]:
# axis=0 applies np.max down each column: the largest p-value per variable.
pv.apply(np.max, axis=0)

a    0.892326
b    0.790063
dtype: float64

In [36]:
# Contrast: axis=1 applies np.max across each row instead.
pd.DataFrame({'a':[1], 'b':[2]}).apply(np.max, axis=1)

0    2
dtype: int64

In [156]:
class NumericFunction:
    """Wrap a number-valued callable so it supports pointwise ``+`` and ``*``.

    Operands may be other callables (combined pointwise, both evaluated at the
    same argument) or plain numbers (treated as constants). Every operation
    returns a new wrapper; the operands are left untouched.
    """

    @classmethod
    def numerize(cls, f):
        """Wrap ``f`` in an instance of this class; usable as a decorator."""
        return cls(f)

    def __init__(self, f):
        # The wrapped callable; invoked with a single argument by __call__.
        self.func = f

    def __call__(self, x):
        """Evaluate the wrapped callable at ``x``."""
        return self.func(x)

    def __add__(self, other):
        """Pointwise sum with a callable, or shift by a numeric constant."""
        if isinstance(other, numbers.Number):
            return self.numerize(lambda x: self(x) + other)
        return self.numerize(lambda x: self(x) + other(x))

    def __radd__(self, other):
        # Reflected form (``other + self``); this also makes ``sum([...])``
        # work, since it starts from the integer 0. The previous code passed
        # ``self`` twice to the bound ``__add__`` and raised TypeError.
        return self.__add__(other)

    def __mul__(self, other):
        """Pointwise product with a callable, or scaling by a numeric constant."""
        if isinstance(other, numbers.Number):
            return self.numerize(lambda x: self(x) * other)
        return self.numerize(lambda x: self(x) * other(x))

    def __rmul__(self, other):
        # Reflected form (``other * self``), e.g. ``0.6 * f``.
        return self.__mul__(other)

    def __str__(self):
        # Must be a plain method: wrapped in ``@property`` the attribute
        # resolved to a str and ``str(obj)`` failed trying to call it.
        return 'NumericFunction: number valued function'

    
@NumericFunction.numerize
def f(x):
    """Return 2*first + 3*second for a two-element input."""
    first, second = x
    return 2 * first + 3 * second

def g(x):
    """Return three times the first entry of a two-element input."""
    # Unpack both entries so inputs of any other length are still rejected.
    first, _ = x
    return 3 * first

# h(x) = 0.6*f(x) + g(x): the float on the left goes through f.__rmul__, and the
# resulting NumericFunction's __add__ accepts the plain function g.
h = .6 * f + g

In [157]:
# 0.6 * (2*1 + 3*2) + 3*1 = 7.8
h((1, 2))

7.8

In [182]:
# Categorical toy data: 20 rows with integer values drawn from {0, 1, 2};
# 10 distinct row labels are then sampled as the group.
# NOTE: this rebinds `df` and `assignment` from the earlier cells.
df = pd.DataFrame({'a': np.random.randint(0, 3, 20)})
assignment = set(np.random.choice(df.index, size=10, replace=False))
df, assignment

(    a
 0   0
 1   0
 2   2
 3   2
 4   0
 5   0
 6   0
 7   1
 8   2
 9   0
 10  0
 11  1
 12  0
 13  2
 14  1
 15  1
 16  1
 17  0
 18  0
 19  0, {2, 3, 5, 7, 10, 12, 15, 17, 18, 19})

In [194]:
def count_by_col(col, df, assignments):
    """Count how often each distinct value of ``df[col]`` occurs within every group.

    Groups are the masks produced by ``assignment_indices``. The result has
    one row per group (labelled ``t0``, ``t1``, ...) and one column per
    distinct value of ``df[col]``, in sorted order.
    """
    levels = sorted(set(df[col]))
    masks = assignment_indices(df, assignments)

    rows = []
    for mask in masks:
        in_group = df[col].loc[mask]
        rows.append([sum(in_group == level) for level in levels])

    labels = ['t{}'.format(i) for i in range(len(masks))]
    return pd.DataFrame(data=rows, columns=levels, index=labels)
    

def relative_block_balance(df_count, agg_assignments=lambda x: np.max(np.abs(x),axis=0)):
    """Aggregate each cell's deviation from its column median, relative to that median.

    Parameters
    ----------
    df_count : pandas.DataFrame
        Table of counts; the median is taken per column.
    agg_assignments : callable
        Receives the frame of relative deviations and produces the return
        value. The default takes the largest absolute deviation per column.
    """
    center = df_count.median()
    relative_dev = df_count.sub(center).div(center)
    return agg_assignments(relative_dev)
                           
# Show the per-group counts of each value of `a`, then the largest relative
# deviation from the column median for each value.
print(count_by_col('a', df, [assignment]), '\n')
print(relative_block_balance(count_by_col('a', df, [assignment])))

    0  1  2
t0  6  2  2
t1  5  3  2 

0    0.090909
1    0.200000
2    0.000000
dtype: float64


In [172]:
from itertools import product
# Cartesian product of the two lists, materialised as a list of tuples.
list(product(['a', 'b'], [1, 2]))

[('a', 1), ('a', 2), ('b', 1), ('b', 2)]

In [173]:
# The same pairs written as a nested comprehension (outer loop first).
[(a, b) for a in ['a', 'b'] for b in [1, 2]]

[('a', 1), ('a', 2), ('b', 1), ('b', 2)]