1
1
from itertools import chain
2
2
from math import sqrt , log2 , pi
3
3
4
+ import pandas as pd
4
5
from scipy import stats
5
6
from scipy .special import binom
6
7
@@ -26,18 +27,19 @@ def __init__(self, a, b, a_vals=None, b_vals=None):
26
27
:param a_vals: All possible values in a. Defaults to `None`.
27
28
:param b_vals: All possible values in b. Defaults to `None`.
28
29
"""
30
+ df = pd .DataFrame ([(x , y ) for x , y in zip (a , b )], columns = ['a' , 'b' ])
31
+
29
32
if a_vals is None :
30
33
a_vals = sorted (list (set (a )))
31
34
else :
32
- a_vals = sorted (list (set ( a_vals )))
35
+ a_vals = sorted (list (df . a . unique ( )))
33
36
34
37
if b_vals is None :
35
38
b_vals = sorted (list (set (b )))
36
39
else :
37
- b_vals = sorted (list (set ( b_vals )))
40
+ b_vals = sorted (list (df . b . unique ( )))
38
41
39
- data = [(x , y ) for x , y in zip (a , b )]
40
- observed = [[data .count ((x , y )) for y in b_vals ] for x in a_vals ]
42
+ observed = [[df .query (f'a=="{ x } " and b=="{ y } "' ).shape [0 ] for y in b_vals ] for x in a_vals ]
41
43
42
44
n_rows = len (a_vals )
43
45
n_cols = len (b_vals )
@@ -47,13 +49,14 @@ def __init__(self, a, b, a_vals=None, b_vals=None):
47
49
48
50
n = sum ([sum (o ) for o in observed ])
49
51
get_expected = lambda r , c : r * c / n
50
- expected = [[get_expected (row_marginals [i ], col_marginals [j ]) for j , _ in enumerate (b_vals )] for i , _ in enumerate (a_vals )]
52
+ expected = [[get_expected (row_marginals [i ], col_marginals [j ]) for j , _ in enumerate (b_vals )] for i , _ in
53
+ enumerate (a_vals )]
51
54
52
55
chisq = sum ([(o - e ) ** 2 / e for o , e in zip (chain (* observed ), chain (* expected ))])
53
56
54
57
self .observed = observed
55
58
self .expected = expected
56
- self ._data = data
59
+ self ._df = df
57
60
self ._chisq = chisq
58
61
self ._n = n
59
62
self ._a_map = {v : i for i , v in enumerate (a_vals )}
@@ -63,6 +66,18 @@ def __init__(self, a, b, a_vals=None, b_vals=None):
63
66
self ._row_marginals = row_marginals
64
67
self ._col_marginals = col_marginals
65
68
69
def _count(self, a=None, b=None):
    """
    Counts the observations matching the given category values.

    Filtering uses boolean masks on the ``a`` and ``b`` columns of
    ``self._df`` instead of a string-built ``DataFrame.query`` expression.
    Values are therefore compared with their real type, so integer or
    boolean categories match, and values containing quote characters
    cannot corrupt the expression.

    :param a: Value to match in column ``a``. ``None`` means no filter on ``a``.
    :param b: Value to match in column ``b``. ``None`` means no filter on ``b``.
    :return: Number of matching rows; the total row count when both are ``None``.
    """
    frame = self._df

    # Apply each filter only when a value was supplied; with no filters the
    # whole frame is counted, as before.
    if a is not None:
        frame = frame[frame['a'] == a]
    if b is not None:
        frame = frame[frame['b'] == b]

    return frame.shape[0]
80
+
66
81
@property
67
82
def chisq (self ):
68
83
"""
@@ -231,9 +246,9 @@ def tanimoto_similarity(self):
231
246
232
247
:return: Tanimoto similarity.
233
248
"""
234
- count_11 = self ._data . count (( self ._a_1 , self ._b_1 ) )
235
- count_01 = self ._data . count (( self ._a_0 , self ._b_1 ) )
236
- count_10 = self ._data . count (( self ._a_1 , self ._b_0 ) )
249
+ count_11 = self ._count ( self ._a_1 , self ._b_1 )
250
+ count_01 = self ._count ( self ._a_0 , self ._b_1 )
251
+ count_10 = self ._count ( self ._a_1 , self ._b_0 )
237
252
s = count_11 / (count_01 + count_10 )
238
253
return s
239
254
@@ -284,10 +299,10 @@ def rand_index(self):
284
299
285
300
:return: Rand index.
286
301
"""
287
- tp = self ._data . count (( self ._a_1 , self ._b_1 ) )
288
- fp = self ._data . count (( self ._a_0 , self ._b_1 ) )
289
- fn = self ._data . count (( self ._a_1 , self ._b_0 ) )
290
- tn = self ._data . count (( self ._a_0 , self ._b_0 ) )
302
+ tp = self ._count ( self ._a_1 , self ._b_1 )
303
+ fp = self ._count ( self ._a_0 , self ._b_1 )
304
+ fn = self ._count ( self ._a_1 , self ._b_0 )
305
+ tn = self ._count ( self ._a_0 , self ._b_0 )
291
306
s = (tp + tn ) / (tp + fp + fn + tn )
292
307
return s
293
308
@@ -298,8 +313,8 @@ def mcnemar_test(self):
298
313
299
314
:return: A tuple. First element is chi-square test statistics. Second element is p-value.
300
315
"""
301
- c = self ._data . count (( self ._a_0 , self ._b_1 ) )
302
- b = self ._data . count (( self ._a_1 , self ._b_0 ) )
316
+ c = self ._count ( self ._a_0 , self ._b_1 )
317
+ b = self ._count ( self ._a_1 , self ._b_0 )
303
318
chisq = (b - c ) ** 2 / (b + c )
304
319
p = 1 - stats .chi2 .cdf (chisq , 1 )
305
320
return chisq , p
@@ -311,10 +326,10 @@ def odds_ratio(self):
311
326
312
327
:return: Odds ratio.
313
328
"""
314
- p_00 = self ._data . count (( self ._a_0 , self ._b_0 ) ) / self ._n
315
- p_01 = self ._data . count (( self ._a_0 , self ._b_1 ) ) / self ._n
316
- p_10 = self ._data . count (( self ._a_1 , self ._b_0 ) ) / self ._n
317
- p_11 = self ._data . count (( self ._a_1 , self ._b_1 ) ) / self ._n
329
+ p_00 = self ._count ( self ._a_0 , self ._b_0 ) / self ._n
330
+ p_01 = self ._count ( self ._a_0 , self ._b_1 ) / self ._n
331
+ p_10 = self ._count ( self ._a_1 , self ._b_0 ) / self ._n
332
+ p_11 = self ._count ( self ._a_1 , self ._b_1 ) / self ._n
318
333
319
334
ratio = (p_11 * p_00 ) / (p_10 * p_01 )
320
335
return ratio
@@ -328,10 +343,10 @@ def tetrachoric_correlation(self):
328
343
329
344
:return: Tetrachoric correlation.
330
345
"""
331
- n_00 = self ._data . count (( self ._a_0 , self ._b_0 ) )
332
- n_01 = self ._data . count (( self ._a_0 , self ._b_1 ) )
333
- n_10 = self ._data . count (( self ._a_1 , self ._b_0 ) )
334
- n_11 = self ._data . count (( self ._a_1 , self ._b_1 ) )
346
+ n_00 = self ._count ( self ._a_0 , self ._b_0 )
347
+ n_01 = self ._count ( self ._a_0 , self ._b_1 )
348
+ n_10 = self ._count ( self ._a_1 , self ._b_0 )
349
+ n_11 = self ._count ( self ._a_1 , self ._b_1 )
335
350
336
351
if n_10 == 0 or n_01 == 0 :
337
352
return 1.0
0 commit comments