Skip to content

Commit

Permalink
BUG: Fixes scipygh-12218, TypeError converting int to float inside st…
Browse files Browse the repository at this point in the history
…ats.ks_2samp (scipy#12280)

A product of large integers passed to np.sqrt was too large to be converted
to float, generating a TypeError inside stats.ks_2samp() for large samples.
If both sample sizes were larger than about 2^21 (~cube root of 2^64, or
about 2 million), this condition would be triggered.
  • Loading branch information
pvanmulbregt authored and tylerjereddy committed Jul 4, 2020
1 parent 98d21ba commit 0a1b9a8
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 6 deletions.
15 changes: 9 additions & 6 deletions scipy/stats/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -6702,7 +6702,7 @@ def ks_2samp(data1, data2, alternative='two-sided', mode='auto'):
cdf1 = np.searchsorted(data1, data_all, side='right') / n1
cdf2 = np.searchsorted(data2, data_all, side='right') / n2
cddiffs = cdf1 - cdf2
minS = -np.min(cddiffs)
minS = np.clip(-np.min(cddiffs), 0, 1) # Ensure sign of minS is not negative.
maxS = np.max(cddiffs)
alt2Dvalue = {'less': minS, 'greater': maxS, 'two-sided': max(minS, maxS)}
d = alt2Dvalue[alternative]
Expand All @@ -6718,8 +6718,8 @@ def ks_2samp(data1, data2, alternative='two-sided', mode='auto'):
if n1g >= np.iinfo(np.int).max / n2g:
mode = 'asymp'
warnings.warn(
"Exact ks_2samp calculation not possible with samples sizes "
"%d and %d. Switching to 'asymp' " % (n1, n2), RuntimeWarning)
f"Exact ks_2samp calculation not possible with samples sizes "
f"{n1} and {n2}. Switching to 'asymp'.", RuntimeWarning)

if mode == 'exact':
success, d, prob = _attempt_exact_2kssamp(n1, n2, g, d, alternative)
Expand All @@ -6731,13 +6731,16 @@ def ks_2samp(data1, data2, alternative='two-sided', mode='auto'):

if mode == 'asymp':
# The product n1*n2 is large. Use Smirnov's asymptotic formula.
# Ensure float to avoid overflow in multiplication
# sorted because the one-sided formula is not symmetric in n1, n2
m, n = sorted([float(n1), float(n2)], reverse=True)
en = m * n / (m + n)
if alternative == 'two-sided':
en = n1 * n2 / (n1 + n2)
prob = distributions.kstwo.sf(d, np.round(en))
else:
m, n = max(n1, n2), min(n1, n2)
z = np.sqrt(m*n/(m+n)) * d
z = np.sqrt(en) * d
# Use Hodges' suggested approximation Eqn 5.3
# Requires m to be the larger of (n1, n2)
expt = -2 * z**2 - 2 * z * (m + 2*n)/np.sqrt(m*n*(m+n))/3.0
prob = np.exp(expt)

Expand Down
12 changes: 12 additions & 0 deletions scipy/stats/tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -3265,6 +3265,18 @@ def test_argument_checking(self):
assert_raises(ValueError, stats.ks_2samp, [1], [])
assert_raises(ValueError, stats.ks_2samp, [], [])

def test_gh12218(self):
"""Ensure gh-12218 is fixed."""
# gh-12218 triggered a TypeError calculating sqrt(n1*n2*(n1+n2)).
# With n1 and n2 both large integers, the product went past the 64-bit
# integer range and could no longer be converted to float by np.sqrt.
# Seed the global NumPy RNG so the generated sample is reproducible.
np.random.seed(12345678)
# Smallest size used to hit the problem: with n1 == n2 == 2**21 the
# product n1*n2*(n1+n2) equals 2**64.
n1 = 2097152 # 2**21
rvs1 = stats.uniform.rvs(size=n1, loc=0., scale=1)
# Second sample has the same length n1; only its size matters here.
rvs2 = rvs1 + 1 # Exact value of rvs2 doesn't matter.
# No assertions on the results: the test passes as long as none of the
# three asymptotic-mode calls raises.
stats.ks_2samp(rvs1, rvs2, alternative='greater', mode='asymp')
stats.ks_2samp(rvs1, rvs2, alternative='less', mode='asymp')
stats.ks_2samp(rvs1, rvs2, alternative='two-sided', mode='asymp')


def test_ttest_rel():
# regression test
Expand Down

0 comments on commit 0a1b9a8

Please sign in to comment.