Skip to content

Commit b2cf2ed

Browse files
committed
refactored to use dataframe
1 parent fddb0ec commit b2cf2ed

File tree

3 files changed

+43
-25
lines changed

3 files changed

+43
-25
lines changed

pypair/binary.py

+38-23
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from itertools import chain
22
from math import sqrt, log2, pi
33

4+
import pandas as pd
45
from scipy import stats
56
from scipy.special import binom
67

@@ -26,18 +27,19 @@ def __init__(self, a, b, a_vals=None, b_vals=None):
2627
:param a_vals: All possible values in a. Defaults to `None`.
2728
:param b_vals: All possible values in b. Defaults to `None`.
2829
"""
30+
df = pd.DataFrame([(x, y) for x, y in zip(a, b)], columns=['a', 'b'])
31+
2932
if a_vals is None:
3033
a_vals = sorted(list(set(a)))
3134
else:
32-
a_vals = sorted(list(set(a_vals)))
35+
a_vals = sorted(list(df.a.unique()))
3336

3437
if b_vals is None:
3538
b_vals = sorted(list(set(b)))
3639
else:
37-
b_vals = sorted(list(set(b_vals)))
40+
b_vals = sorted(list(df.b.unique()))
3841

39-
data = [(x, y) for x, y in zip(a, b)]
40-
observed = [[data.count((x, y)) for y in b_vals] for x in a_vals]
42+
observed = [[df.query(f'a=="{x}" and b=="{y}"').shape[0] for y in b_vals] for x in a_vals]
4143

4244
n_rows = len(a_vals)
4345
n_cols = len(b_vals)
@@ -47,13 +49,14 @@ def __init__(self, a, b, a_vals=None, b_vals=None):
4749

4850
n = sum([sum(o) for o in observed])
4951
get_expected = lambda r, c: r * c / n
50-
expected = [[get_expected(row_marginals[i], col_marginals[j]) for j, _ in enumerate(b_vals)] for i, _ in enumerate(a_vals)]
52+
expected = [[get_expected(row_marginals[i], col_marginals[j]) for j, _ in enumerate(b_vals)] for i, _ in
53+
enumerate(a_vals)]
5154

5255
chisq = sum([(o - e) ** 2 / e for o, e in zip(chain(*observed), chain(*expected))])
5356

5457
self.observed = observed
5558
self.expected = expected
56-
self._data = data
59+
self._df = df
5760
self._chisq = chisq
5861
self._n = n
5962
self._a_map = {v: i for i, v in enumerate(a_vals)}
@@ -63,6 +66,18 @@ def __init__(self, a, b, a_vals=None, b_vals=None):
6366
self._row_marginals = row_marginals
6467
self._col_marginals = col_marginals
6568

69+
def _count(self, a=None, b=None):
    """
    Counts the rows of the internal data frame that match the given values.

    :param a: Value to match in column `a`. `None` (the default) places no constraint on `a`.
    :param b: Value to match in column `b`. `None` (the default) places no constraint on `b`.
    :return: Number of matching rows. Total number of rows when both `a` and `b` are `None`.
    """
    df = self._df

    if a is None and b is None:
        return df.shape[0]

    # Compare by value with boolean masks rather than a generated query string.
    # A query of the form a=="{a}" always wraps the value in double quotes, so
    # non-string values (e.g. integer 0/1 labels) never match and values that
    # contain a double quote produce an invalid expression.
    mask = None
    if a is not None:
        mask = df['a'] == a
    if b is not None:
        b_mask = df['b'] == b
        mask = b_mask if mask is None else mask & b_mask

    # Summing a boolean mask yields a numpy integer; return a plain int, as shape[0] does.
    return int(mask.sum())
80+
6681
@property
6782
def chisq(self):
6883
"""
@@ -231,9 +246,9 @@ def tanimoto_similarity(self):
231246
232247
:return: Tanimoto similarity.
233248
"""
234-
count_11 = self._data.count((self._a_1, self._b_1))
235-
count_01 = self._data.count((self._a_0, self._b_1))
236-
count_10 = self._data.count((self._a_1, self._b_0))
249+
count_11 = self._count(self._a_1, self._b_1)
250+
count_01 = self._count(self._a_0, self._b_1)
251+
count_10 = self._count(self._a_1, self._b_0)
237252
s = count_11 / (count_01 + count_10)
238253
return s
239254

@@ -284,10 +299,10 @@ def rand_index(self):
284299
285300
:return: Rand index.
286301
"""
287-
tp = self._data.count((self._a_1, self._b_1))
288-
fp = self._data.count((self._a_0, self._b_1))
289-
fn = self._data.count((self._a_1, self._b_0))
290-
tn = self._data.count((self._a_0, self._b_0))
302+
tp = self._count(self._a_1, self._b_1)
303+
fp = self._count(self._a_0, self._b_1)
304+
fn = self._count(self._a_1, self._b_0)
305+
tn = self._count(self._a_0, self._b_0)
291306
s = (tp + tn) / (tp + fp + fn + tn)
292307
return s
293308

@@ -298,8 +313,8 @@ def mcnemar_test(self):
298313
299314
:return: A tuple. First element is chi-square test statistics. Second element is p-value.
300315
"""
301-
c = self._data.count((self._a_0, self._b_1))
302-
b = self._data.count((self._a_1, self._b_0))
316+
c = self._count(self._a_0, self._b_1)
317+
b = self._count(self._a_1, self._b_0)
303318
chisq = (b - c) ** 2 / (b + c)
304319
p = 1 - stats.chi2.cdf(chisq, 1)
305320
return chisq, p
@@ -311,10 +326,10 @@ def odds_ratio(self):
311326
312327
:return: Odds ratio.
313328
"""
314-
p_00 = self._data.count((self._a_0, self._b_0)) / self._n
315-
p_01 = self._data.count((self._a_0, self._b_1)) / self._n
316-
p_10 = self._data.count((self._a_1, self._b_0)) / self._n
317-
p_11 = self._data.count((self._a_1, self._b_1)) / self._n
329+
p_00 = self._count(self._a_0, self._b_0) / self._n
330+
p_01 = self._count(self._a_0, self._b_1) / self._n
331+
p_10 = self._count(self._a_1, self._b_0) / self._n
332+
p_11 = self._count(self._a_1, self._b_1) / self._n
318333

319334
ratio = (p_11 * p_00) / (p_10 * p_01)
320335
return ratio
@@ -328,10 +343,10 @@ def tetrachoric_correlation(self):
328343
329344
:return: Tetrachoric correlation.
330345
"""
331-
n_00 = self._data.count((self._a_0, self._b_0))
332-
n_01 = self._data.count((self._a_0, self._b_1))
333-
n_10 = self._data.count((self._a_1, self._b_0))
334-
n_11 = self._data.count((self._a_1, self._b_1))
346+
n_00 = self._count(self._a_0, self._b_0)
347+
n_01 = self._count(self._a_0, self._b_1)
348+
n_10 = self._count(self._a_1, self._b_0)
349+
n_11 = self._count(self._a_1, self._b_1)
335350

336351
if n_10 == 0 or n_01 == 0:
337352
return 1.0

requirements.txt

+4-1
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,11 @@ pep8
77
pyflakes
88
# LIBS
99
numpy
10+
scipy
11+
pandas
1012
# DOCUMENTATION
1113
sphinx
1214
sphinx_rtd_theme
1315
# PUBLISHING
14-
twine
16+
twine
17+
setuptools

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
long_description_content_type='text/markdown',
1515
url='https://github.com/vangj/py-bbn',
1616
keywords=' '.join(['statistics', 'pairwise', 'association', 'correlation', 'measurement', 'strength']),
17-
install_requires=[],
17+
install_requires=['scipy', 'numpy', 'pandas'],
1818
classifiers=[
1919
'Programming Language :: Python :: 3',
2020
'License :: OSI Approved :: Apache Software License',

0 commit comments

Comments
 (0)