
Lots of MS work towards loo kernel size selection working with Fisher distributions - it does not, but many bugs squashed. At least two more to go though...

Conflicts:
	utils_gui/tile_value.py
thaines committed Feb 27, 2016
1 parent 7d53890 commit fa58f68
Showing 8 changed files with 456 additions and 20 deletions.
48 changes: 35 additions & 13 deletions ms/kernels.c
@@ -450,7 +450,7 @@ float Cosine_norm(int dims, KernelConfig config)
float mult = Uniform_norm(dims, NULL) / dims;

int i;
- for (i=2; i<dims; i++) mult /= i;
+ for (i=2; i<dims; i++) mult /= i;

int k;
float sum = 0.0;
@@ -460,10 +460,18 @@ float Cosine_norm(int dims, KernelConfig config)
float fact = 1.0;
for (i=2; i<(dims-2*k); i++) fact *= i;

-  sum += dir * pow(0.5*M_PI, 1+2*k) / fact;
+  sum += dir * pow(0.5*M_PI, -(1+2*k)) / fact;

dir *= -1;
}

+ if ((dims>=2)&&((dims&1)==0)) // The extra term that appears because 0^0==1, but only when dims is even and 2 or greater.
+ {
+  if (((dims-2)/2)&1) dir = -1;
+  else dir = 1;
+
+  sum -= dir * pow(0.5*M_PI, -dims);
+ }

return mult / sum;
}
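Note: the two Cosine_norm fixes (the negated exponent and the extra even-dimension term from 0^0==1) can be sanity-checked against quadrature. A minimal sketch, assuming the kernel profile is cos(0.5*pi*r) on the unit ball, which the alternating series above is consistent with; cosine_norm_quadrature is a hypothetical helper, not part of this codebase:

import math

def cosine_norm_quadrature(dims, samples = 1 << 16):
  """Midpoint-rule estimate of the normalising constant
  1 / integral of cos(0.5*pi*|x|) over the unit ball."""
  # Surface area of the unit (dims-1)-sphere...
  surface = 2.0 * math.pi ** (0.5 * dims) / math.gamma(0.5 * dims)

  step = 1.0 / samples
  integral = 0.0
  for i in range(samples):
    r = (i + 0.5) * step
    integral += math.cos(0.5 * math.pi * r) * r ** (dims - 1) * step

  return 1.0 / (surface * integral)

# Should agree with Cosine_norm, for both odd and even dims, if the
# above reconstruction of the profile is right...
for d in [2, 3, 4, 5]:
  print('%i: %f' % (d, cosine_norm_quadrature(d)))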
@@ -649,16 +657,17 @@ float Cauchy_norm(int dims, KernelConfig config)
{
float ret = 0.0;

- // Can't integrate out analytically, so numerical integration it is...
+ // Can't integrate out analytically, so numerical integration it is (match the range of high quality)...
int i;
const int samples = 1024;
+ const float step = 6.0 / samples;
for (i=0; i<samples; i++)
{
-  float r = (i+0.5) / samples;
-  ret += pow(r, dims-1) / ((1.0+r*r) * samples);
+  float r = (i+0.5) * step;
+  ret += pow(r, dims-1) * step / (1.0+r*r);
}

- return ret * Uniform_norm(dims, NULL) / dims;
+ return Uniform_norm(dims, NULL) / (dims * ret);
}

float Cauchy_range(int dims, KernelConfig config, float quality)
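Note: two distinct bugs are fixed in Cauchy_norm - the midpoints were drawn from [0,1) instead of the kernel's actual support (6.0 matches the range at high quality), and the normalising constant should divide by the integral rather than multiply. A standalone sketch of the corrected midpoint rule, for reference only:

def cauchy_radial_integral(dims, samples = 1024):
  # Midpoint rule for the integral of r^(dims-1) / (1 + r^2) over [0, 6],
  # mirroring the corrected loop above.
  step = 6.0 / samples
  total = 0.0
  for i in range(samples):
    r = (i + 0.5) * step
    total += r ** (dims - 1) * step / (1.0 + r * r)
  return total

# The normalising constant is then Uniform_norm(dims) / (dims * integral);
# the old code multiplied by the integral, so the constant grew when it
# should have shrunk.
print(cauchy_radial_integral(3))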
@@ -773,9 +782,15 @@ KernelConfig Fisher_config_new(int dims, const char * config)

// Basic value storage...
ret->ref_count = 1;
- ret->alpha = atof(config+1); // +1 to skip the '('.

- int approximate = ret->alpha > CONC_SWITCH; // Should add override option; will require updating verify method. ************************************************
+ char * end;
+ ret->alpha = strtof(config+1, &end); // +1 to skip the '('.

+ int approximate = ret->alpha > CONC_SWITCH;
+ if (end!=NULL)
+ {
+  if (*end=='c') approximate = 0;
+  if (*end=='a') approximate = 1;
+ }

// Record the log of the normalising constant - we return normalised values for this distribution for reasons of numerical stability...
const float log_2PI = log(2 * M_PI);
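Note: replacing atof with strtof is what enables the new mode-forcing suffix - strtof reports where the number ends, so a trailing 'a' (approximate) or 'c' (correct) can be read. A rough Python equivalent of the parse, for illustration only; the real CONC_SWITCH value lives in the C source and is assumed here:

import re

def parse_fisher_config(config, conc_switch = 64.0):
  """Parse 'fisher(alpha)' with an optional 'a'/'c' mode forcer,
  e.g. 'fisher(64.0a)' - a sketch of the C logic above; conc_switch
  is an assumed stand-in for the real CONC_SWITCH constant."""
  m = re.match(r'fisher\(([0-9.eE+-]+)([ac]?)\)$', config)
  if m is None:
    raise ValueError('von-Mises Fisher configuration did not end with a ).')

  alpha = float(m.group(1))
  if alpha < 0.0:
    raise ValueError('Negative concentration parameter.')

  approximate = alpha > conc_switch
  if m.group(2) == 'c': approximate = False
  if m.group(2) == 'a': approximate = True

  return alpha, approximate

print(parse_fisher_config('fisher(64.0a)'))  # (64.0, True)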
@@ -892,6 +907,11 @@ const char * Fisher_config_verify(int dims, const char * config, int * length)
if (end==config) return "No concentration parameter given to von-Mises Fisher distribution.";
if (conc<0.0) return "Negative concentration parameter given to von-Mises Fisher distribution.";

+ if ((end!=NULL)&&((*end=='a')||(*end=='c')))
+ {
+  ++end; // Skip a mode forcer.
+ }

if ((end==NULL)||(*end!=')')) return "von-Mises Fisher configuration did not end with a ).";

if (length!=NULL)
@@ -967,6 +987,8 @@ float Fisher_range(int dims, KernelConfig config, float quality)
{
FisherConfig * self = (FisherConfig*)config;

+ return 2.0; // ******************************************************

if (self->inv_culm==NULL)
{
float sd = 1.0 + 2.0 * quality;
@@ -1138,8 +1160,8 @@ void Fisher_mult_draw(int dims, KernelConfig config, int terms, const float ** f
const Kernel Fisher =
{
"fisher",
"A kernel for dealing with directional data, using the von-Mises Fisher distribution as a kernel (Fisher is technically 3 dimensions only, but I like short names! - this will do anything from 2 dimensions upwards). Requires that all feature vectors be on the unit-hypersphere, plus it uses the alpha parameter provided to the kernel as the concentration parameter of the distribution. Be careful with the merge_range parameter when using this kernel - unlike the other kernels it won't default to anything sensible, and will have to be manually set. Suffers from the problem that you must use the same kernel for multiplication, so you can only multiply distributions with the same concentration value.",
"Specified as fisher(alpha), e.g. fisher(10), where alpha is the concentration parameter",
"A kernel for dealing with directional data, using the von-Mises Fisher distribution as a kernel (Fisher is technically 3 dimensions only, but I like short names! - this will do anything from 2 dimensions upwards. Dimensions is the embeding space, as in the length of the unit vector, to match the rest of the system - the degrees of freedom is one less.). Requires that all feature vectors be on the unit-hypersphere (does not check this - gigo), plus it uses the alpha parameter provided to the kernel as the concentration parameter of the distribution. Be careful with the merge_range parameter when using this kernel - unlike the other kernels it won't default to anything sensible, and will have to be manually set. Suffers from the problem that you must use the same kernel for multiplication, so you can only multiply distributions with the same concentration value.",
"Specified as fisher(alpha), e.g. fisher(10), where alpha is the concentration parameter. Can optionally include a letter immediatly after the alpha value - either 'a' to force approximate mode or 'c' to force correct mode, e.g. 'fisher(64.0a)'. Note that this is generally not a good idea - the approximation breaks down for low concentration and the correct approach numerically explodes for high concentration - the default behaviour automatically takes this into account and selects the right one.",
Fisher_config_new,
Fisher_config_verify,
Fisher_config_acquire,
@@ -1170,7 +1192,7 @@ float MirrorFisher_weight(int dims, KernelConfig config, float * offset)
if (self->inv_culm==NULL)
{
// Amusingly the approximation is actually marginally simpler in the mirrored case!..
-  return exp(-0.5 * self->alpha * (1.0 - cos_ang*cos_ang) + self->log_norm);
+  return 0.5 * exp(-0.5 * self->alpha * (1.0 - cos_ang*cos_ang) + self->log_norm);
}
else
{
@@ -1250,7 +1272,7 @@ const Kernel MirrorFisher =
{
"mirror_fisher",
"A kernel for dealing with directional data where a 180 degree rotation is meaningless; one specific use case is for rotations expressed with unit quaternions. It wraps the Fisher distribution such that it is an even mixture of two of them - one with the correct unit vector, one with its negation. All other behaviours and requirements are basically the same as a Fisher distribution however. Suffers from the problem that you must use the same kernel for multiplication, so you can only multiply distributions with the same concentration value.",
"Specified as mirror_fisher(alpha), e.g. mirror_fisher(10), where alpha is the concentration parameter used for both of the von-Mises Fisher distributions.",
"Specified as mirror_fisher(alpha), e.g. mirror_fisher(10), where alpha is the concentration parameter used for both of the von-Mises Fisher distributions. Same as fisher you can immediatly postcede the concentration with 'a' to force it to be approximate or 'c' to force it to be correct.",
Fisher_config_new,
Fisher_config_verify,
Fisher_config_acquire,
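Note: taken together, the kernels.c changes make the Fisher kernels usable end to end - fixed normalisation, an explicit approximate/correct switch, and the mirrored variant's missing 0.5 mixture weight. A hypothetical usage of the new suffix syntax, assuming the set_data/set_kernel conventions described in the ms_c.c docstrings below:

import numpy as np
from ms import MeanShift

# Unit vectors on the 3-sphere, e.g. rotations as quaternions...
data = np.random.normal(size = (1000, 4)).astype(np.float32)
data /= np.linalg.norm(data, axis = 1)[:, None]

mds = MeanShift()
mds.set_data(data, 'df')
mds.set_kernel('mirror_fisher(64.0c)') # force 'correct' mode, despite the high concentration.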
9 changes: 4 additions & 5 deletions ms/mean_shift.c
@@ -65,9 +65,8 @@ float prob(Spatial spatial, const Kernel * kernel, KernelConfig config, const fl

float w;
float * loc = DataMatrix_fv(dm, targ, &w);

- int i;
- for (i=0; i<feats; i++) loc[i] -= fv[i];

+ kernel->to_offset(feats, config, loc, fv);
w *= kernel->weight(feats, config, loc);

w *= norm;
@@ -133,8 +132,8 @@ float loo_nll(Spatial spatial, const Kernel * kernel, KernelConfig config, float

float w;
float * fv = DataMatrix_fv(dm, targ, &w);
- for (j=0; j<features; j++) fv[j] -= fvi[j];

+ kernel->to_offset(features, config, fv, fvi);
w *= kernel->weight(features, config, fv);

w *= norm;
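Note: both prob and loo_nll previously hardcoded the offset as a Euclidean subtraction, which is meaningless for directional kernels where the weight depends on the angle between unit vectors; routing through the kernel's to_offset hook lets each kernel define its own offset. A toy illustration of the dispatch idea - the class and method shapes here are hypothetical, not the library's actual interface:

import numpy as np

class Gaussian:
  def to_offset(self, loc, fv):
    return loc - fv # plain difference, exactly what the old code hardcoded.

  def weight(self, offset):
    return float(np.exp(-0.5 * np.dot(offset, offset)))

class Fisher:
  def __init__(self, alpha):
    self.alpha = alpha

  def to_offset(self, loc, fv):
    # Hypothetical: a directional kernel only needs the dot product, as the
    # von-Mises Fisher density depends on the angle between unit vectors.
    return np.array([np.dot(loc, fv)])

  def weight(self, offset):
    return float(np.exp(self.alpha * (offset[0] - 1.0))) # unnormalised.

def point_weight(kernel, loc, fv):
  # The pattern after the fix: offset computation is delegated to the kernel.
  return kernel.weight(kernel.to_offset(loc, fv))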
29 changes: 29 additions & 0 deletions ms/ms.py
@@ -35,6 +35,7 @@ def scale_loo_nll(self, low = 0.01, high = 2.0, steps = 64, callback = None):
# Select values for low and high as needed...
if isinstance(low, float) or isinstance(high, float):
_, silverman = self.stats()
+ silverman[silverman<1e-6] = 1e-6
silverman = 1.0 / (silverman * (self.weight() * (silverman.shape[0] + 2.0) / 4.0) ** (-1.0 / (silverman.shape[0] + 4.0)))

if isinstance(low, float):
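Note: the new clamp guards against zero-variance features, which would otherwise turn into a division by zero and an infinite inverse bandwidth. A self-contained sketch of the rule as written above, with the guard; silverman_scale is a hypothetical name:

import numpy as np

def silverman_scale(data, weight):
  """Inverse-bandwidth form of Silverman's rule, mirroring the
  expression above; weight is the total exemplar weight."""
  sd = data.std(axis = 0)
  sd[sd < 1e-6] = 1e-6 # the commit's guard against degenerate features.

  d = sd.shape[0]
  return 1.0 / (sd * (weight * (d + 2.0) / 4.0) ** (-1.0 / (d + 4.0)))

data = np.random.normal(size = (500, 3))
data[:, 2] = 0.0 # a constant feature - previously gave an infinite scale.
print(silverman_scale(data, float(data.shape[0])))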
@@ -70,12 +71,40 @@ def scale_loo_nll(self, low = 0.01, high = 2.0, steps = 64, callback = None):
return best_score


+ def scale_loo_nll_array(self, choices, callback = None):
+   """Given an array of MS objects this copies the configuration of each object in turn into this object and finds the one that minimises the leave-one-out error. Quite simple really - mostly for use in cases where the kernel type doesn't support scale in the usual way, i.e. the directional kernels. For copying across it uses a call to copy_all and a call to copy_scale, the latter only if it actually has a scale (data is set!), which between them get near enough everything."""
+   best_choice = None
+   best_score = None
+
+   for i, choice in enumerate(choices):
+     if callback!=None:
+       callback(i, len(choices))
+
+     self.copy_all(choice)
+     if choice.get_dm()!=None:
+       self.copy_scale(choice)
+
+     score = self.loo_nll()
+
+     print choice.get_kernel(), score #####################################
+
+     if best_score==None or score < best_score:
+       best_choice = choice
+       best_score = score
+
+   self.copy_all(best_choice)
+   if best_choice.get_dm()!=None:
+     self.copy_scale(best_choice)
+   return best_score


def hierarchy(self, low = 1.0, high = 512.0, steps = 64, callback = None):
"""Does a sweep of scale, exactly like scale_loo_nll (same behaviour for low and high with vector vs single value), except it clusters the data at each level and builds a hierarchy of clusters, noting which cluster at a lower level ends up in which cluster at the next level. Note that low and high are inverted before use so that they equate to the typical mean shift parameters. Return is a list indexed by level, with index 0 representing the original data (where every data point is its own segment). Each level is represented by a tuple: (modes - array of [segment, feature], parents - array of [segment], giving the index of its parent segment in the next level. None in the highest level, sizes - array of [segment] giving the total weight of all exemplars in that segment.)"""

# Select values for low and high as needed...
if isinstance(low, float) or isinstance(high, float):
_, silverman = self.stats()
+ silverman[silverman<1e-6] = 1e-6
silverman = 1.0 / (silverman * (self.weight() * (silverman.shape[0] + 2.0) / 4.0) ** (-1.0 / (silverman.shape[0] + 4.0)))

if isinstance(low, float):
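Note: scale_loo_nll_array exists because for the directional kernels the concentration is baked into the kernel configuration rather than the scale vector, so the usual scale sweep cannot tune it; instead one candidate MS object is built per configuration and the method keeps whichever minimises the leave-one-out NLL. A hypothetical sweep over Fisher concentrations, with data and imports as in the earlier sketch:

choices = []
for conc in [8.0, 16.0, 32.0, 64.0, 128.0]:
  opt = MeanShift()
  opt.set_data(data, 'df')
  opt.set_kernel('fisher(%.1f)' % conc)
  choices.append(opt)

mds = MeanShift()
mds.set_data(data, 'df')
score = mds.scale_loo_nll_array(choices)
print(mds.get_kernel()) # whichever concentration won.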
51 changes: 49 additions & 2 deletions ms/ms_c.c
@@ -109,6 +109,14 @@ static PyObject * MeanShift_get_kernel_py(MeanShift * self, PyObject * args)
}


+static PyObject * MeanShift_get_range_py(MeanShift * self, PyObject * args)
+{
+ int dims = DataMatrix_features(&self->dm);
+ float range = self->kernel->range(dims, self->config, self->quality);
+ return Py_BuildValue("f", range);
+}


static PyObject * MeanShift_set_kernel_py(MeanShift * self, PyObject * args)
{
// Parse the parameters...
@@ -768,6 +776,39 @@ static PyObject * MeanShift_get_weight_scale_py(MeanShift * self, PyObject * arg
}


+static PyObject * MeanShift_copy_scale_py(MeanShift * self, PyObject * args)
+{
+ // Get the parameters - another mean shift object...
+ MeanShift * other;
+ if (!PyArg_ParseTuple(args, "O!", &MeanShiftType, &other)) return NULL;
+
+ // Copy over the scale...
+ DataMatrix_set_scale(&self->dm, other->dm.mult, other->dm.weight_scale);
+
+ // Trash the spatial - changing either of the above invalidates it...
+ if (self->spatial!=NULL)
+ {
+  Spatial_delete(self->spatial);
+  self->spatial = NULL;
+ }
+
+ // Trash the cluster centers...
+ if (self->balls!=NULL)
+ {
+  Balls_delete(self->balls);
+  self->balls = NULL;
+ }
+
+ // Trash the weight record...
+ self->weight = -1.0;
+ self->norm = -1.0;
+
+ // Return None...
+ Py_INCREF(Py_None);
+ return Py_None;
+}



static PyObject * MeanShift_exemplars_py(MeanShift * self, PyObject * args)
{
@@ -889,6 +930,7 @@ static PyObject * MeanShift_scale_silverman_py(MeanShift * self, PyObject * args
for (i=0; i<features; i++)
{
sd[i] = sqrt(sd[i] / weight);
+  if (sd[i]<1e-6) sd[i] = 1e-6;
}

// Convert standard deviations into a bandwidth via applying the rule...
@@ -968,6 +1010,7 @@ static PyObject * MeanShift_scale_scott_py(MeanShift * self, PyObject * args)
for (i=0; i<features; i++)
{
sd[i] = sqrt(sd[i] / weight);
+  if (sd[i]<1e-6) sd[i] = 1e-6;
}

// Convert standard deviations into a bandwidth via applying the rule...
@@ -2155,8 +2198,11 @@ static PyMethodDef MeanShift_methods[] =
{
{"kernels", (PyCFunction)MeanShift_kernels_py, METH_NOARGS | METH_STATIC, "A static method that returns a list of kernel types, as strings."},
{"get_kernel", (PyCFunction)MeanShift_get_kernel_py, METH_NOARGS, "Returns the string that identifies the current kernel; for complex kernels this may be a complex string containing parameters etc."},
{"get_range", (PyCFunction)MeanShift_get_range_py, METH_NOARGS, "Returns the range of the current kernel, taking into account the current quality value - this is how far out it searches for relevant exemplars from a point in space, eucledian distance. Note that the range is the raw internal value - you need to divide by the scale vector to get the true scale for each dimension. Provided for diagnostic purposes."},
{"set_kernel", (PyCFunction)MeanShift_set_kernel_py, METH_VARARGS, "Sets the current kernel, as identified by a string. For complex kernels this will probably need to include extra information - e.g. the fisher kernel is given as fisher(alpha) where alpha is a floating point concentration parameter. Note that some kernels (e.g. fisher) take into account the number of features in the data when set - in such cases you must set the kernel type after calling set_data."},
{"copy_kernel", (PyCFunction)MeanShift_copy_kernel_py, METH_VARARGS, "Given another MeanShift object this copies the settings from it. This is highly recomended when speed matters and you have lots of kernels, as it copies pointers to the internal configuration object and reference counts - for objects with complex configurations this can be an order of magnitude faster. It can also save a lot of memory, via shared caches."},


{"link_rng", (PyCFunction)MeanShift_link_rng_py, METH_VARARGS, "Links this MeanShift object to use the same random number generator as the given MeanShift object, or unlinks the RNG if no argument given. Will do path shortening if you attempt to chain a bunch together. Will ignore attempts to create loops. The other MeanShift does not even have to be initialised - they don't need to share any common characteristics. Setting the rng* values of a linked MeanShift object acheives nothing."},

{"spatials", (PyCFunction)MeanShift_spatials_py, METH_NOARGS | METH_STATIC, "A static method that returns a list of spatial indexing structures you can use, as strings."},
@@ -2184,6 +2230,7 @@ static PyMethodDef MeanShift_methods[] =
{"set_scale", (PyCFunction)MeanShift_set_scale_py, METH_VARARGS, "Given two parameters. First is an array indexed by feature to get a multiplier that is applied before the kernel (Which is always of radius 1, or some approximation of.) is considered - effectivly an inverse bandwidth in kernel density estimation terms or the inverse standard deviation if you are using the Gaussian kernel. Second is an optional scale for the weight assigned to each feature vector via the set_data method (In the event that no weight is assigned this parameter is the weight of each feature vector, as the default is 1)."},
{"get_scale", (PyCFunction)MeanShift_get_scale_py, METH_NOARGS, "Returns a copy of the scale array (Inverse bandwidth)."},
{"get_weight_scale", (PyCFunction)MeanShift_get_weight_scale_py, METH_NOARGS, "Returns the scalar for the weight of each sample - typically left as 1."},
{"copy_scale", (PyCFunction)MeanShift_copy_scale_py, METH_VARARGS, "Given anotehr MeanShift object this copies its scale parameters - a touch faster than using get then set as it avoids an intermediate matrix."},

{"exemplars", (PyCFunction)MeanShift_exemplars_py, METH_NOARGS, "Returns how many exemplars are in the hallucinated data matrix."},
{"features", (PyCFunction)MeanShift_features_py, METH_NOARGS, "Returns how many features are in the hallucinated data matrix."},
@@ -2198,8 +2245,8 @@ static PyMethodDef MeanShift_methods[] =
{"probs", (PyCFunction)MeanShift_probs_py, METH_VARARGS, "Given a data matrix returns an array (1D) containing the probability of each feature, as calculated by the kernel density estimate that is defined by the data and kernel. Be warned that the return value can be zero."},

{"draw", (PyCFunction)MeanShift_draw_py, METH_NOARGS, "Allows you to draw from the distribution represented by the kernel density estimate. Returns a vector and makes use of the internal RNG."},
{"draws", (PyCFunction)MeanShift_draws_py, METH_VARARGS, "Allows you to draw from the distribution represented by the kernel density estimate. Same as draw except it returns a matrix - you provide a single argument of how many draws to make. Returns an array, <# draws>X<# exemplars> and makes use of the internal RNG."},
{"bootstrap", (PyCFunction)MeanShift_bootstrap_py, METH_VARARGS, "Does a bootstrap draw from the samples - essentially the same as draws but assuming a Dirac delta function for the kernel. You provide the number of draws; it returns an array, <# draws>X<# exemplars>. Makes use of the contained Philox RNG."},
{"draws", (PyCFunction)MeanShift_draws_py, METH_VARARGS, "Allows you to draw from the distribution represented by the kernel density estimate. Same as draw except it returns a matrix - you provide a single argument of how many draws to make. Returns an array, <# draws>X<# features> and makes use of the internal RNG."},
{"bootstrap", (PyCFunction)MeanShift_bootstrap_py, METH_VARARGS, "Does a bootstrap draw from the samples - essentially the same as draws but assuming a Dirac delta function for the kernel. You provide the number of draws; it returns an array, <# draws>X<# features>. Makes use of the contained Philox RNG."},

{"mode", (PyCFunction)MeanShift_mode_py, METH_VARARGS, "Given a feature vector returns its mode as calculated using mean shift - essentially the maxima in the kernel density estimate to which you converge by climbing the gradient."},
{"modes", (PyCFunction)MeanShift_modes_py, METH_VARARGS, "Given a data matrix [exemplar, feature] returns a matrix of the same size, where each feature has been replaced by its mode, as calculated using mean shift."},
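Note: the two new Python-level methods are a diagnostic and plumbing respectively - get_range exposes how far the kernel searches for relevant exemplars at the current quality, while copy_scale transfers a fitted scale between objects without a numpy round trip. Hypothetical usage, continuing the sketches above and following the docstrings:

r = mds.get_range() # raw internal search radius at the current quality.
per_feature = r / mds.get_scale() # divide by the scale vector for the true per-dimension range.

other = MeanShift()
other.set_data(data, 'df')
other.copy_kernel(mds) # shares the kernel configuration by reference.
other.copy_scale(mds) # copies scale + weight scale; invalidates cached structures.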
