In [2]:
import pandas as pd
import numpy as np

In [16]:
# Seed NumPy's legacy global generator so the random frame below is reproducible.
np.random.seed(0)

# 20 rows of uniform draws in three columns named a, b and c.
df = pd.DataFrame(np.random.rand(20, 3), columns=list('abc'))

In [17]:
assignment = set(np.random.choice(df.index, size=10, replace=False))

In [18]:
# Split the rows into the selected group (df_a) and everything else (df_b),
# based on whether the row label is a member of `assignment`.
selected = df.index.isin(assignment)
df_a = df[selected]
df_b = df[~selected]

In [31]:
df.cov()

Unnamed: 0,a,b,c
a,0.056018,0.004226,0.009859
b,0.004226,0.097207,-0.013605
c,0.009859,-0.013605,0.083588


In [32]:
inverse_cov = np.linalg.inv(df.cov())
inverse_cov.shape

(3, 3)

In [36]:
inverse_cov

array([[18.34946136, -1.12625407, -2.34756231],
       [-1.12625407, 10.59628345,  1.85754421],
       [-2.34756231,  1.85754421, 12.54260151]])

In [27]:
mean_diff = df_a.mean() - df_b.mean()
mean_diff

a    0.027293
b    0.116936
c    0.094978
dtype: float64

In [28]:
# Display the per-column difference of group means.
mean_diff

a    0.027293
b    0.116936
c    0.094978
dtype: float64

In [41]:
(inverse_cov @ mean_diff)

array([0.14614216, 1.38476993, 1.34441687])

In [61]:
import types
import collections
import numbers

In [56]:
from itertools import combinations
isinstance(combinations(['a', 'b', 'c'], 2), collections.abc.Generator)

False

In [69]:
np.add(*[df_a, df_b])

Unnamed: 0,a,b,c
2,0.986401,1.606962,1.566426
4,1.112928,1.349251,0.71693
6,1.161598,1.661737,1.507513
7,0.886288,0.481698,1.613149
8,1.062943,1.161769,0.558015
10,0.946376,1.133742,0.893182
11,1.239072,0.229172,0.746562
12,0.927524,0.980645,1.513945
14,1.136233,1.048599,0.768812
19,0.462168,0.62762,0.897534


In [70]:
min(True, False)

False

In [86]:
def assignment_indices(df, assignments):
    """Build one boolean row mask per assignment, plus a mask for leftover rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are matched against each assignment.
    assignments : iterable of list-likes
        Each element holds the index labels belonging to one group.

    Returns
    -------
    list of numpy.ndarray
        One boolean array of length ``len(df)`` per assignment, in input
        order. If at least one row belongs to none of the assignments, a
        final mask selecting those unassigned rows is appended.
    """
    idxs = [df.index.isin(a) for a in assignments]
    # OR all masks together. np.add(*idxs) is only correct for exactly two
    # masks: a third positional argument is interpreted as `out` and gets
    # overwritten, and more than three raise TypeError. Starting from an
    # all-False mask also handles an empty `assignments`.
    covered = np.zeros(len(df.index), dtype=bool)
    for mask in idxs:
        covered |= mask
    if not covered.all():
        idxs.append(~covered)
    return idxs

assignment_indices(df, [assignment, [3, 5]])

[array([False, False,  True, False,  True, False,  True,  True,  True,
        False,  True,  True,  True, False,  True, False, False, False,
        False,  True]),
 array([False, False, False,  True, False,  True, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False]),
 array([ True,  True, False, False, False, False, False, False, False,
         True, False, False, False,  True, False,  True,  True,  True,
         True, False])]

In [89]:
def mahalanobis(df, assignments, agg_assign=np.max):
    """Aggregate the pairwise quadratic-form distances between group means.

    The rows of ``df`` are split into groups via ``assignment_indices``. For
    every pair of groups the difference ``d`` of their column means is
    computed and scored as ``d @ inv(cov) @ d``, where ``cov`` is the
    covariance matrix of the whole frame. No square root is taken.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns enter both the group means and the covariance.
    assignments : iterable of list-likes
        Index-label collections passed on to ``assignment_indices``.
    agg_assign : callable, default ``np.max``
        Receives the list of pairwise scores and returns the final value.

    Returns
    -------
    Whatever ``agg_assign`` returns for the list of pairwise scores.
    """
    group_means = []
    for mask in assignment_indices(df, assignments):
        group_means.append(df.loc[mask].mean())

    precision = np.linalg.inv(df.cov())

    scores = []
    for left, right in combinations(group_means, 2):
        delta = left - right
        scores.append(delta @ precision @ delta)
    return agg_assign(scores)

In [90]:
mahalanobis(df, [assignment])

0.2936079472756157

In [91]:
from statsmodels.formula.api import ols

In [111]:
# Regress column 'a' on an intercept and a dummy ('t1') that marks the rows
# whose index label is in `assignment`.
t1_dummy = pd.DataFrame(df.index.isin(assignment), columns=['t1'])
model = ols('a ~ 1 + t1', data=pd.concat((df, t1_dummy), axis=1))

In [112]:
res = model.fit()

In [113]:
res.pvalues.iloc[1]

0.804341767046624

In [134]:
def ols_on_treatment(col, df, assignments):
    """Fit an OLS regression of one column on treatment-membership dummies.

    One boolean dummy column ``t0``, ``t1``, ... is created per entry of
    ``assignments``; it is True for rows of ``df`` whose index label is in
    that entry. The model formula is ``"<col> ~ 1 + t0 + t1 + ..."``.

    Parameters
    ----------
    col : str
        Name of the column of ``df`` used as the response in the formula.
    df : pandas.DataFrame
        Data containing ``col``.
    assignments : iterable of list-likes
        Each element holds the index labels of one treatment group.

    Returns
    -------
    The object returned by ``ols(...).fit()``.
    """
    # Give the dummies the same index as `df`: pd.concat(axis=1) aligns on
    # index labels, so a default RangeIndex would misalign (and NaN-pad) the
    # rows of any frame whose index is not 0..n-1.
    treatment_dummies = pd.DataFrame(
        {'t{}'.format(i): df.index.isin(members)
         for i, members in enumerate(assignments)},
        index=df.index,
    )
    data = pd.concat((df, treatment_dummies), axis=1)
    formula = '{} ~ 1 + {}'.format(col, ' + '.join(treatment_dummies.columns))
    return ols(formula, data=data).fit()


def pvalues(df, assignment):
    """Collect, per column of ``df``, the p-values of the treatment dummies.

    Each column is regressed with ``ols_on_treatment`` using ``assignment``
    as the collection of treatment groups. The first entry of the fitted
    ``pvalues`` is dropped (presumably the intercept, given the ``~ 1 + ...``
    formula) and the remaining ones are kept as a plain array.

    Returns
    -------
    dict
        Maps each column name to the array of remaining p-values.
    """
    by_column = {}
    for col in df.columns:
        fitted = ols_on_treatment(col, df, assignment)
        by_column[col] = fitted.pvalues.iloc[1:].values
    return by_column

In [135]:
pvalues(df, [assignment, [3, 5]])

{'a': array([0.75684724, 0.10910286]),
 'b': array([0.5041254 , 0.81092755]),
 'c': array([0.31942912, 0.32077432])}

In [156]:
class NumericFunction:
    """Wrap a number-valued callable so it can be combined with ``+`` and ``*``.

    Instances are callable and forward to the wrapped function. Adding or
    multiplying an instance with another callable (wrapped or not) or with a
    plain number yields a new instance that evaluates both sides pointwise.
    """

    @classmethod
    def numerize(cls, f):
        """Wrap ``f`` in an instance of this class; usable as a decorator."""
        return cls(f)

    def __init__(self, f):
        # The wrapped single-argument callable.
        self.func = f

    def __call__(self, x):
        """Evaluate the wrapped function at ``x``."""
        return self.func(x)

    def __add__(self, other):
        """Pointwise sum with a number or with another callable."""
        # Numbers are treated as constants, mirroring __mul__. This is also
        # what makes sum([...]) work, since sum starts from the integer 0.
        if isinstance(other, numbers.Number):
            return self.numerize(lambda x: self(x) + other)
        return self.numerize(lambda x: self(x) + other(x))

    def __radd__(self, other):
        """Reflected sum; addition is treated as commutative."""
        # __add__ is already bound, so `self` must not be passed again.
        return self.__add__(other)

    def __mul__(self, other):
        """Pointwise product with a number or with another callable."""
        if isinstance(other, numbers.Number):
            return self.numerize(lambda x: self(x) * other)
        return self.numerize(lambda x: self(x) * other(x))

    def __rmul__(self, other):
        """Reflected product; multiplication is treated as commutative."""
        return self.__mul__(other)

    def __str__(self):
        # Must be a plain method: as a property, str(obj) would try to call
        # the returned string and raise TypeError.
        return 'NumericFunction: number valued function'

    
@NumericFunction.numerize
def f(x):
    """Return 2*x1 + 3*x2 for a pair ``x = (x1, x2)``; wrapped as a NumericFunction."""
    first, second = x
    return 2 * first + 3 * second

def g(x):
    """Return 3*x1 for a pair ``x = (x1, x2)``; left as a plain function."""
    first, _ = x
    return 3 * first

# Combine the wrapped f with the plain g: h(x) = 0.6 * f(x) + g(x).
h = 0.6 * f + g

In [157]:
h((1, 2))

7.8

In [182]:
# Toy categorical data: 20 rows with one integer column 'a' drawn from {0, 1, 2}.
df = pd.DataFrame(data=np.random.randint(0, 3, 20), columns=['a'])
# Pick 10 distinct row labels of df as the selected group.
assignment = set(np.random.choice(df.index, size=10, replace=False))
df, assignment

(    a
 0   0
 1   0
 2   2
 3   2
 4   0
 5   0
 6   0
 7   1
 8   2
 9   0
 10  0
 11  1
 12  0
 13  2
 14  1
 15  1
 16  1
 17  0
 18  0
 19  0, {2, 3, 5, 7, 10, 12, 15, 17, 18, 19})

In [194]:
def count_by_col(col, df, assignments):
    """Count, for every assignment group, how often each value of ``col`` occurs.

    Parameters
    ----------
    col : column label
        Column of ``df`` whose distinct values define the table columns.
    df : pandas.DataFrame
        Data to tabulate.
    assignments : iterable of list-likes
        Index-label collections passed on to ``assignment_indices``.

    Returns
    -------
    pandas.DataFrame
        One row per mask returned by ``assignment_indices`` (labelled
        ``t0``, ``t1``, ...) and one column per distinct value of ``col``
        in sorted order; cells hold the number of matching rows.
    """
    column = df[col]
    cat = sorted(set(column))
    idxs = assignment_indices(df, assignments)

    count = []
    for idx in idxs:
        # Slice the group once, then count each value with the vectorized
        # Series.sum() instead of iterating the booleans in Python.
        in_group = column.loc[idx]
        count.append([(in_group == v).sum() for v in cat])

    labels = ['t{}'.format(i) for i in range(len(idxs))]
    return pd.DataFrame(data=count, columns=cat, index=labels)
    

def relative_block_balance(df_count, agg_assignments=lambda x: np.max(np.abs(x),axis=0)):
    """Summarise how far each group's counts deviate from the column median.

    Every cell of ``df_count`` is expressed as a relative deviation from the
    median of its column, ``(count - median) / median``, and the resulting
    frame is handed to ``agg_assignments``.

    Parameters
    ----------
    df_count : pandas.DataFrame
        Table of counts, e.g. the output of ``count_by_col``.
    agg_assignments : callable
        Reduces the frame of relative deviations; the default takes the
        largest absolute deviation down each column.

    Returns
    -------
    Whatever ``agg_assignments`` returns.
    """
    center = df_count.median()
    deviation = df_count.sub(center).div(center)
    return agg_assignments(deviation)
                           
print(count_by_col('a', df, [assignment]), '\n')
print(relative_block_balance(count_by_col('a', df, [assignment])))

    0  1  2
t0  6  2  2
t1  5  3  2 

0    0.090909
1    0.200000
2    0.000000
dtype: float64


In [172]:
from itertools import product
[(a, b) for a, b in product(['a', 'b'], [1, 2])]

[('a', 1), ('a', 2), ('b', 1), ('b', 2)]

In [173]:
[(a, b) for a in ['a', 'b'] for b in [1, 2]]

[('a', 1), ('a', 2), ('b', 1), ('b', 2)]