Permalink
Browse files

Use efficient O(n) median and selection

Adds dependency on XS module, but speeds up things for
large numbers of timings considerably.
  • Loading branch information...
tsee committed Sep 23, 2010
1 parent ce9e1d0 commit 2559ebeabadbb097d64af58fd8866163696a62c5
Showing with 28 additions and 50 deletions.
  1. +1 −0 Makefile.PL
  2. +13 −48 lib/Dumbbench/Stats.pm
  3. +14 −2 t/010_stats.t
View
@@ -13,6 +13,7 @@ WriteMakefile(
'Capture::Tiny' => '0',
'Params::Util' => '0',
'parent' => '0',
+ 'Statistics::CaseResampling' => '0.04',
}, # e.g., Module::Name => 1.1
($] >= 5.005 ? ## Add these new keywords supported since 5.005
(ABSTRACT_FROM => 'lib/Dumbbench.pm', # retrieve abstract from module
View
@@ -2,16 +2,17 @@ package Dumbbench::Stats;
use strict;
use warnings;
use List::Util ();
+use Statistics::CaseResampling ();
use Class::XSAccessor {
constructor => 'new',
accessors => [qw/data name/],
};
-# note: This is entirely unoptimized. There is a lot of unnecessary
-# sorting going on. This is to allow the user to modify the data
+# Note: This is entirely unoptimized. There is a lot of unnecessary
+# stuff going on. This is to allow the user to modify the data
# set in flight. If this comes back to haunt us at some point,
-# we can still optimize.
+# we can still optimize, but at this point, convenience still wins.
sub sorted_data {
my $self = shift;
@@ -21,47 +22,21 @@ sub sorted_data {
sub first_quartile {
my $self = shift;
- my @data = sort { $a <=> $b } @{$self->data}; # would be much faster to cache the order...
- # inlined median to avoid really silly re-sorting.
- return() if not @data;
-
- my $n = @data;
- splice(@data, int($n/2));
- #splice(@data, 0, int($n/2));
- $n = @data;
- if ($n % 2) { # odd
- return $data[int($n/2)];
- }
- else {
- my $half = $n/2;
- return 0.5*($data[$half]+$data[$half-1]);
- }
+ my $n = $self->n;
+ my $k = int($n/4) + 1;
+ return Statistics::CaseResampling::select_kth($self->data, $k);
}
sub second_quartile { return $_[0]->median }
sub third_quartile {
my $self = shift;
- my @data = sort { $a <=> $b } @{$self->data}; # would be much faster to cache the order...
- # inlined median to avoid really silly re-sorting.
- return() if not @data;
-
- my $n = @data;
- #splice(@data, int($n/2));
- splice(@data, 0, int($n/2));
- $n = @data;
- if ($n % 2) { # odd
- return $data[int($n/2)];
- }
- else {
- my $half = $n/2;
- return 0.5*($data[$half]+$data[$half-1]);
- }
+ my $n = $self->n;
+ my $k = int($n*3/4) + 1;
+ return Statistics::CaseResampling::select_kth($self->data, $k);
}
-sub n {
- return scalar(@{$_[0]->data});
-}
+sub n { scalar(@{$_[0]->data}) }
sub sum {
my $self = shift;
@@ -85,17 +60,7 @@ sub mean {
sub median {
my $self = shift;
- my @data = sort { $a <=> $b } @{$self->data}; # would be much faster to cache the order...
- #@$data = sort { $a <=> $b } @$data;
- return() if not @data;
- my $n = @data;
- if ($n % 2) { # odd
- return $data[int($n/2)];
- }
- else {
- my $half = $n/2;
- return 0.5*($data[$half]+$data[$half-1]);
- }
+ return Statistics::CaseResampling::median($self->data); # O(n)
}
sub mad {
@@ -113,7 +78,7 @@ sub mad_dev {
sub std_dev {
my $self = shift;
my $data = $self->data;
- my $mean = $self->mean();
+ my $mean = $self->mean;
my $var = 0;
$var += ($_-$mean)**2 for @$data;
$var /= @$data - 1;
View
@@ -1,6 +1,6 @@
use strict;
use warnings;
-use Test::More tests => 15;
+use Test::More tests => 21;
use Dumbbench;
use Dumbbench::Stats;
@@ -15,7 +15,19 @@ is_approx($s->median, 3);
push @$data, 12;
is_approx($s->mean, (1+2+3+4+5+12)/6);
-is_approx($s->median, 3.5);
+is_approx($s->median, 3);
+
+push @$data, 13, 14, 15, 16, 17;
+my @sorted = sort { $a <=> $b } @$data;
+is_approx($s->second_quartile, $s->median);
+is_approx($s->first_quartile, $sorted[@sorted/4]);
+is_approx($s->third_quartile, $sorted[@sorted*3/4]);
+
+push @$data, 0.5;
+@sorted = sort { $a <=> $b } @$data;
+is_approx($s->second_quartile, $s->median);
+is_approx($s->first_quartile, $sorted[@sorted/4]);
+is_approx($s->third_quartile, $sorted[@sorted*3/4]);
my $mean = $s->mean;
my $variance = 0;

0 comments on commit 2559ebe

Please sign in to comment.