LIBSVM Plus 2.9

commit 9b87d3dd372cf3b7cb7064a55bbb656edee2f773 0 parents
@vincenzo authored
30 COPYRIGHT
@@ -0,0 +1,30 @@
+Copyright (c) 2000-2009 Chih-Chung Chang and Chih-Jen Lin
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither name of copyright holders nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19 Makefile
@@ -0,0 +1,19 @@
+CXX ?= g++
+CFLAGS = -Wall -Wconversion -O3 -fPIC
+SHVER = 1
+
+all: svm-train svm-predict svm-scale
+
+lib: svm.o
+ $(CXX) -shared svm.o -o libsvm.so.$(SHVER)
+
+svm-predict: svm-predict.c svm.o
+ $(CXX) $(CFLAGS) svm-predict.c svm.o -o svm-predict -lm
+svm-train: svm-train.c svm.o
+ $(CXX) $(CFLAGS) svm-train.c svm.o -o svm-train -lm
+svm-scale: svm-scale.c
+ $(CXX) $(CFLAGS) svm-scale.c -o svm-scale
+svm.o: svm.cpp svm.h
+ $(CXX) $(CFLAGS) -c svm.cpp
+clean:
+ rm -f *~ svm.o svm-train svm-predict svm-scale
652 README
@@ -0,0 +1,652 @@
+Libsvm is a simple, easy-to-use, and efficient software for SVM
+classification and regression. It solves C-SVM classification, nu-SVM
+classification, one-class-SVM, epsilon-SVM regression, and nu-SVM
+regression. It also provides an automatic model selection tool for
+C-SVM classification. This document explains the use of libsvm.
+
+Libsvm is available at
+http://www.csie.ntu.edu.tw/~cjlin/libsvm
+Please read the COPYRIGHT file before using libsvm.
+
+Table of Contents
+=================
+
+- Quick Start
+- Installation and Data Format
+- `svm-train' Usage
+- `svm-predict' Usage
+- `svm-scale' Usage
+- Tips on Practical Use
+- Examples
+- Precomputed Kernels
+- Library Usage
+- Java Version
+- Building Windows Binaries
+- Additional Tools: Sub-sampling, Parameter Selection, Format checking, etc.
+- Python Interface
+- Additional Information
+
+Quick Start
+===========
+
+If you are new to SVM and the data is not large, please go to the
+`tools' directory and use easy.py after installation. It does
+everything automatically -- from data scaling to parameter selection.
+
+Usage: easy.py training_file [testing_file]
+
+More information about parameter selection can be found in
+`tools/README.'
+
+Installation and Data Format
+============================
+
+On Unix systems, type `make' to build the `svm-train', `svm-predict',
+and `svm-scale' programs. Run them without arguments to show their usage.
+
+On other systems, consult `Makefile' to build them (e.g., see
+'Building Windows binaries' in this file) or use the pre-built
+binaries (Windows binaries are in the directory `windows').
+
+The format of training and testing data files is:
+
+<label> <index1>:<value1> <index2>:<value2> ...
+.
+.
+.
+
+Each line contains an instance and is ended by a '\n' character. For
+classification, <label> is an integer indicating the class label
+(multi-class is supported). For regression, <label> is the target
+value, which can be any real number. For one-class SVM, it is not
+used, so it can be any number. Except when using precomputed kernels
+(explained in another section), each <index>:<value> pair gives a
+feature (attribute) value: <index> is an integer starting from 1 and
+<value> is a real number. Indices must be in ASCENDING order. Labels
+in the testing file are only used to calculate accuracy or errors. If
+they are unknown, just fill the first column with any numbers.
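+
+For instance, two hypothetical lines of a two-class training file with
+at most three features could look like:
+
+ +1 1:0.708 2:1 3:-0.32
+ -1 1:0.583 3:0.334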
+
+A sample classification data set included in this package is
+`heart_scale'. To check whether your data are in the correct format,
+use `tools/checkdata.py' (details in `tools/README').
+
+Type `svm-train heart_scale', and the program will read the training
+data and output the model file `heart_scale.model'. If you have a test
+set called heart_scale.t, then type `svm-predict heart_scale.t
+heart_scale.model output' to see the prediction accuracy. The `output'
+file contains the predicted class labels.
+
+There are some other useful programs in this package.
+
+svm-scale:
+
+ This is a tool for scaling input data files.
+
+svm-toy:
+
+ This is a simple graphical interface which shows how SVM
+ separates data in a plane. You can click in the window to
+ draw data points. Use "change" button to choose class
+ 1, 2 or 3 (i.e., up to three classes are supported), "load"
+ button to load data from a file, "save" button to save data to
+ a file, "run" button to obtain an SVM model, and "clear"
+ button to clear the window.
+
+ You can enter options at the bottom of the window; the syntax
+ of options is the same as for `svm-train'.
+
+ Note that "load" and "save" consider data in the
+ classification but not the regression case. Each data point
+ has one label (the color) which must be 1, 2, or 3 and two
+ attributes (x-axis and y-axis values) in [0,1].
+
+ Type `make' in respective directories to build them.
+
+ You need Qt library to build the Qt version.
+ (available from http://www.trolltech.com)
+
+ You need GTK+ library to build the GTK version.
+ (available from http://www.gtk.org)
+
+ The pre-built Windows binaries are in the `windows'
+ directory. We use Visual C++ on a 32-bit machine, so the
+ maximal cache size is 2GB.
+
+`svm-train' Usage
+=================
+
+Usage: svm-train [options] training_set_file [model_file]
+options:
+-s svm_type : set type of SVM (default 0)
+ 0 -- C-SVC
+ 1 -- nu-SVC
+ 2 -- one-class SVM
+ 3 -- epsilon-SVR
+ 4 -- nu-SVR
+-t kernel_type : set type of kernel function (default 2)
+ 0 -- linear: u'*v
+ 1 -- polynomial: (gamma*u'*v + coef0)^degree
+ 2 -- radial basis function: exp(-gamma*|u-v|^2)
+ 3 -- sigmoid: tanh(gamma*u'*v + coef0)
+ 4 -- precomputed kernel (kernel values in training_set_file)
+-d degree : set degree in kernel function (default 3)
+-g gamma : set gamma in kernel function (default 1/num_features)
+-r coef0 : set coef0 in kernel function (default 0)
+-c cost : set the parameter C of C-SVC, epsilon-SVR, and nu-SVR (default 1)
+-n nu : set the parameter nu of nu-SVC, one-class SVM, and nu-SVR (default 0.5)
+-p epsilon : set the epsilon in loss function of epsilon-SVR (default 0.1)
+-m cachesize : set cache memory size in MB (default 100)
+-e epsilon : set tolerance of termination criterion (default 0.001)
+-h shrinking : whether to use the shrinking heuristics, 0 or 1 (default 1)
+-b probability_estimates : whether to train an SVC or SVR model for probability estimates, 0 or 1 (default 0)
+-wi weight : set the parameter C of class i to weight*C, for C-SVC (default 1)
+-v n: n-fold cross validation mode
+-q : quiet mode (no outputs)
+
+
+In the -g option, num_features means the number of attributes in the input data.
+
+The -v option randomly splits the data into n parts and calculates
+cross validation accuracy/mean squared error on them.
+
+See the libsvm FAQ for the meaning of the outputs.
+
+`svm-predict' Usage
+===================
+
+Usage: svm-predict [options] test_file model_file output_file
+options:
+-b probability_estimates: whether to predict probability estimates, 0 or 1 (default 0); for one-class SVM only 0 is supported
+
+model_file is the model file generated by svm-train.
+test_file is the test data you want to predict.
+svm-predict will produce output in the output_file.
+
+`svm-scale' Usage
+=================
+
+Usage: svm-scale [options] data_filename
+options:
+-l lower : x scaling lower limit (default -1)
+-u upper : x scaling upper limit (default +1)
+-y y_lower y_upper : y scaling limits (default: no y scaling)
+-s save_filename : save scaling parameters to save_filename
+-r restore_filename : restore scaling parameters from restore_filename
+
+See 'Examples' in this file for examples.
+
+Tips on Practical Use
+=====================
+
+* Scale your data. For example, scale each attribute to [0,1] or [-1,+1].
+* For C-SVC, consider using the model selection tool in the tools directory.
+* nu in nu-SVC/one-class-SVM/nu-SVR approximates the fraction of training
+ errors and support vectors.
+* If data for classification are unbalanced (e.g. many positive and
+ few negative), try different penalty parameters C by -wi (see
+ examples below).
+* Specify larger cache size (i.e., larger -m) for huge problems.
+
+Examples
+========
+
+> svm-scale -l -1 -u 1 -s range train > train.scale
+> svm-scale -r range test > test.scale
+
+Scale each feature of the training data to be in [-1,1]. Scaling
+factors are stored in the file range and then used for scaling the
+test data.
+
+> svm-train -s 0 -c 5 -t 2 -g 0.5 -e 0.1 data_file
+
+Train a classifier with RBF kernel exp(-0.5|u-v|^2), C=5, and
+stopping tolerance 0.1.
+
+> svm-train -s 3 -p 0.1 -t 0 data_file
+
+Solve SVM regression with linear kernel u'v and epsilon=0.1
+in the loss function.
+
+> svm-train -c 10 -w1 1 -w-1 5 data_file
+
+Train a classifier with penalty 10 = 1 * 10 for class 1 and penalty
+50 = 5 * 10 for class -1.
+
+> svm-train -s 0 -c 100 -g 0.1 -v 5 data_file
+
+Do five-fold cross validation for the classifier using
+the parameters C = 100 and gamma = 0.1.
+
+> svm-train -s 0 -b 1 data_file
+> svm-predict -b 1 test_file data_file.model output_file
+
+Obtain a model with probability information and predict test data
+with probability estimates.
+
+Precomputed Kernels
+===================
+
+Users may precompute kernel values and input them as training and
+testing files. Then libsvm does not need the original
+training/testing sets.
+
+Assume there are L training instances x1, ..., xL.
+Let K(x, y) be the kernel
+value of two instances x and y. The input formats
+are:
+
+New training instance for xi:
+
+<label> 0:i 1:K(xi,x1) ... L:K(xi,xL)
+
+New testing instance for any x:
+
+<label> 0:? 1:K(x,x1) ... L:K(x,xL)
+
+That is, in the training file the first column must be the "ID" of
+xi. In testing, ? can be any value.
+
+All kernel values including ZEROs must be explicitly provided. Any
+permutation or random subsets of the training/testing files are also
+valid (see examples below).
+
+Note: the format is slightly different from the precomputed kernel
+package released in libsvmtools earlier.
+
+Examples:
+
+ Assume the original training data has three four-feature
+ instances and testing data has one instance:
+
+ 15 1:1 2:1 3:1 4:1
+ 45 2:3 4:3
+ 25 3:1
+
+ 15 1:1 3:1
+
+ If the linear kernel is used, we have the following new
+ training/testing sets:
+
+ 15 0:1 1:4 2:6 3:1
+ 45 0:2 1:6 2:18 3:0
+ 25 0:3 1:1 2:0 3:1
+
+ 15 0:? 1:2 2:0 3:1
+
+ ? can be any value.
+
+ Any subset of the above training file is also valid. For example,
+
+ 25 0:3 1:1 2:0 3:1
+ 45 0:2 1:6 2:18 3:0
+
+ implies that the kernel matrix is
+
+ [K(2,2) K(2,3)] = [18 0]
+ [K(3,2) K(3,3)] = [0 1]
+
+Library Usage
+=============
+
+These functions and structures are declared in the header file
+`svm.h'. You need to #include "svm.h" in your C/C++ source files and
+link your program with `svm.cpp'. You can see `svm-train.c' and
+`svm-predict.c' for examples showing how to use them. We define
+LIBSVM_VERSION and declare `extern int libsvm_version; ' in svm.h, so
+you can check the version number.
+
+Before you classify test data, you need to construct an SVM model
+(`svm_model') using training data. A model can also be saved in
+a file for later use. Once an SVM model is available, you can use it
+to classify new data.
+
+- Function: struct svm_model *svm_train(const struct svm_problem *prob,
+ const struct svm_parameter *param);
+
+ This function constructs and returns an SVM model according to
+ the given training data and parameters.
+
+ struct svm_problem describes the problem:
+
+ struct svm_problem
+ {
+ int l;
+ double *y;
+ struct svm_node **x;
+ };
+
+ where `l' is the number of training data, and `y' is an array
+ containing their target values (integers in classification, real
+ numbers in regression). `x' is an array of pointers, each of which
+ points to a sparse representation (an array of svm_node) of one
+ training vector.
+
+ For example, if we have the following training data:
+
+ LABEL ATTR1 ATTR2 ATTR3 ATTR4 ATTR5
+ ----- ----- ----- ----- ----- -----
+ 1 0 0.1 0.2 0 0
+ 2 0 0.1 0.3 -1.2 0
+ 1 0.4 0 0 0 0
+ 2 0 0.1 0 1.4 0.5
+ 3 -0.1 -0.2 0.1 1.1 0.1
+
+ then the components of svm_problem are:
+
+ l = 5
+
+ y -> 1 2 1 2 3
+
+ x -> [ ] -> (2,0.1) (3,0.2) (-1,?)
+ [ ] -> (2,0.1) (3,0.3) (4,-1.2) (-1,?)
+ [ ] -> (1,0.4) (-1,?)
+ [ ] -> (2,0.1) (4,1.4) (5,0.5) (-1,?)
+ [ ] -> (1,-0.1) (2,-0.2) (3,0.1) (4,1.1) (5,0.1) (-1,?)
+
+ where (index,value) is stored in the structure `svm_node':
+
+ struct svm_node
+ {
+ int index;
+ double value;
+ };
+
+ index = -1 indicates the end of one vector. Note that indices must
+ be in ASCENDING order.
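+
+ For illustration, a minimal, untested sketch that builds the first
+ training vector of the example above (the remaining rows are
+ analogous):
+
+	struct svm_node row0[3];        /* (2,0.1) (3,0.2) + terminator */
+	row0[0].index = 2;  row0[0].value = 0.1;
+	row0[1].index = 3;  row0[1].value = 0.2;
+	row0[2].index = -1;             /* index = -1 ends the vector */
+
+	double y[5] = {1, 2, 1, 2, 3};
+	struct svm_node *x[5];          /* x[1]..x[4] built the same way */
+	x[0] = row0;
+
+	struct svm_problem prob;
+	prob.l = 5;  prob.y = y;  prob.x = x;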
+
+ struct svm_parameter describes the parameters of an SVM model:
+
+ struct svm_parameter
+ {
+ int svm_type;
+ int kernel_type;
+ int degree; /* for poly */
+ double gamma; /* for poly/rbf/sigmoid */
+ double coef0; /* for poly/sigmoid */
+
+ /* these are for training only */
+ double cache_size; /* in MB */
+ double eps; /* stopping criteria */
+ double C; /* for C_SVC, EPSILON_SVR, and NU_SVR */
+ int nr_weight; /* for C_SVC */
+ int *weight_label; /* for C_SVC */
+ double* weight; /* for C_SVC */
+ double nu; /* for NU_SVC, ONE_CLASS, and NU_SVR */
+ double p; /* for EPSILON_SVR */
+ int shrinking; /* use the shrinking heuristics */
+ int probability; /* do probability estimates */
+ };
+
+ svm_type can be one of C_SVC, NU_SVC, ONE_CLASS, EPSILON_SVR, NU_SVR.
+
+ C_SVC: C-SVM classification
+ NU_SVC: nu-SVM classification
+ ONE_CLASS: one-class-SVM
+ EPSILON_SVR: epsilon-SVM regression
+ NU_SVR: nu-SVM regression
+
+ kernel_type can be one of LINEAR, POLY, RBF, SIGMOID, PRECOMPUTED.
+
+ LINEAR: u'*v
+ POLY: (gamma*u'*v + coef0)^degree
+ RBF: exp(-gamma*|u-v|^2)
+ SIGMOID: tanh(gamma*u'*v + coef0)
+ PRECOMPUTED: kernel values in training_set_file
+
+ cache_size is the size of the kernel cache, specified in megabytes.
+ C is the cost of constraint violation.
+ eps is the stopping criterion (we usually use 0.00001 in nu-SVC,
+ 0.001 in others). nu is the parameter in nu-SVM, nu-SVR, and
+ one-class-SVM. p is the epsilon in the epsilon-insensitive loss
+ function of epsilon-SVM regression. shrinking = 1 means shrinking is
+ conducted; = 0 otherwise. probability = 1 means a model with
+ probability information is obtained; = 0 otherwise.
+
+ nr_weight, weight_label, and weight are used to change the penalty
+ for some classes (if the weight for a class is not changed, it is
+ set to 1). This is useful for training a classifier on unbalanced
+ input data or with asymmetric misclassification costs.
+
+ nr_weight is the number of elements in the array weight_label and
+ weight. Each weight[i] corresponds to weight_label[i], meaning that
+ the penalty of class weight_label[i] is scaled by a factor of weight[i].
+
+ If you do not want to change the penalty for any of the classes,
+ just set nr_weight to 0.
+
+ *NOTE* Because svm_model contains pointers to svm_problem, you
+ cannot free the memory used by svm_problem if you are still using
+ the svm_model produced by svm_train().
+
+ *NOTE* To avoid wrong parameters, svm_check_parameter() should be
+ called before svm_train().
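+
+ For illustration, a minimal, untested training sketch that follows
+ the two notes above (the parameter values are arbitrary examples,
+ and "data.model" is a hypothetical file name):
+
+	struct svm_parameter param;
+	param.svm_type = C_SVC;      param.kernel_type = RBF;
+	param.degree = 3;            param.gamma = 0.5;
+	param.coef0 = 0;             param.cache_size = 100;
+	param.eps = 1e-3;            param.C = 1;
+	param.nr_weight = 0;         param.weight_label = NULL;
+	param.weight = NULL;         param.nu = 0.5;
+	param.p = 0.1;               param.shrinking = 1;
+	param.probability = 0;
+
+	const char *error_msg = svm_check_parameter(&prob, &param);
+	if (error_msg) { fprintf(stderr, "Error: %s\n", error_msg); exit(1); }
+
+	struct svm_model *model = svm_train(&prob, &param);
+	svm_save_model("data.model", model);  /* hypothetical file name */
+	/* keep prob alive while model is in use (see the NOTE above) */
+	svm_destroy_model(model);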
+
+- Function: double svm_predict(const struct svm_model *model,
+ const struct svm_node *x);
+
+ This function does classification or regression on a test vector x
+ given a model.
+
+ For a classification model, the predicted class for x is returned.
+ For a regression model, the function value of x calculated using
+ the model is returned. For a one-class model, +1 or -1 is
+ returned.
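+
+ For example, a sparse test vector with the single feature (1,0.4),
+ terminated by index = -1, could be classified as follows (sketch):
+
+	struct svm_node test[2];
+	test[0].index = 1;  test[0].value = 0.4;
+	test[1].index = -1;             /* end of vector */
+	double predicted = svm_predict(model, test);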
+
+- Function: void svm_cross_validation(const struct svm_problem *prob,
+ const struct svm_parameter *param, int nr_fold, double *target);
+
+ This function conducts cross validation. Data are separated into
+ nr_fold folds. Under the given parameters, each fold is in turn
+ validated using the model obtained by training on the remaining
+ folds. The predicted labels (of all prob's instances) obtained in
+ the validation process are stored in the array target.
+
+ The format of prob is the same as that for svm_train().
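+
+ As a sketch, classification CV accuracy can be computed from target
+ much as svm-train does with the -v option (for regression one would
+ accumulate squared errors instead):
+
+	int i, correct = 0;
+	double *target = (double *) malloc(prob.l * sizeof(double));
+	svm_cross_validation(&prob, &param, 5, target);   /* 5-fold */
+	for (i = 0; i < prob.l; i++)
+		if (target[i] == prob.y[i]) ++correct;
+	printf("Cross Validation Accuracy = %g%%\n", 100.0 * correct / prob.l);
+	free(target);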
+
+- Function: int svm_get_svm_type(const struct svm_model *model);
+
+ This function gives svm_type of the model. Possible values of
+ svm_type are defined in svm.h.
+
+- Function: int svm_get_nr_class(const svm_model *model);
+
+ For a classification model, this function gives the number of
+ classes. For a regression or a one-class model, 2 is returned.
+
+- Function: void svm_get_labels(const svm_model *model, int* label)
+
+ For a classification model, this function outputs the class
+ labels into the array label. For regression and one-class
+ models, label is unchanged.
+
+- Function: double svm_get_svr_probability(const struct svm_model *model);
+
+ For a regression model with probability information, this function
+ outputs a value sigma > 0. For test data, we consider the
+ probability model: target value = predicted value + z, where z
+ follows a Laplace distribution e^(-|z|/sigma)/(2sigma).
+
+ If the model is not for SVR or does not contain the required
+ information, 0 is returned.
+
+- Function: void svm_predict_values(const svm_model *model,
+ const svm_node *x, double* dec_values)
+
+ This function gives decision values on a test vector x given a
+ model.
+
+ For a classification model with nr_class classes, this function
+ gives nr_class*(nr_class-1)/2 decision values in the array
+ dec_values, where nr_class can be obtained from the function
+ svm_get_nr_class. The order is label[0] vs. label[1], ...,
+ label[0] vs. label[nr_class-1], label[1] vs. label[2], ...,
+ label[nr_class-2] vs. label[nr_class-1], where label can be
+ obtained from the function svm_get_labels.
+
+ For a regression model, dec_values[0] is the function value of x
+ calculated using the model. For a one-class model, dec_values[0]
+ is +1 or -1.
+
+- Function: double svm_predict_probability(const struct svm_model *model,
+ const struct svm_node *x, double* prob_estimates);
+
+ This function does classification or regression on a test vector x
+ given a model with probability information.
+
+ For a classification model with probability information, this
+ function gives nr_class probability estimates in the array
+ prob_estimates. nr_class can be obtained from the function
+ svm_get_nr_class. The class with the highest probability is
+ returned. For regression/one-class SVM, the array prob_estimates
+ is unchanged and the returned value is the same as that of
+ svm_predict.
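+
+ A sketch of the calling sequence (assuming the model was trained
+ with probability = 1, and test is a terminated svm_node array):
+
+	int nr_class = svm_get_nr_class(model);
+	double *pe = (double *) malloc(nr_class * sizeof(double));
+	double predicted = svm_predict_probability(model, test, pe);
+	/* pe[i] is the probability estimate for label[i],
+	   where label[] comes from svm_get_labels() */
+	free(pe);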
+
+- Function: const char *svm_check_parameter(const struct svm_problem *prob,
+ const struct svm_parameter *param);
+
+ This function checks whether the parameters are within the feasible
+ range of the problem. This function should be called before calling
+ svm_train() and svm_cross_validation(). It returns NULL if the
+ parameters are feasible, otherwise an error message is returned.
+
+- Function: int svm_check_probability_model(const struct svm_model *model);
+
+ This function checks whether the model contains required
+ information to do probability estimates. If so, it returns
+ +1. Otherwise, 0 is returned. This function should be called
+ before calling svm_get_svr_probability and
+ svm_predict_probability.
+
+- Function: int svm_save_model(const char *model_file_name,
+ const struct svm_model *model);
+
+ This function saves a model to a file; returns 0 on success, or -1
+ if an error occurs.
+
+- Function: struct svm_model *svm_load_model(const char *model_file_name);
+
+ This function returns a pointer to the model read from the file,
+ or a null pointer if the model could not be loaded.
+
+- Function: void svm_destroy_model(struct svm_model *model);
+
+ This function frees the memory used by a model.
+
+- Function: void svm_destroy_param(struct svm_parameter *param);
+
+ This function frees the memory used by a parameter set.
+
+- Variable: extern void (*svm_print_string) (const char *);
+
+ Users can specify their output format by
+ svm_print_string = &your_print_function;
+
+Java Version
+============
+
+The pre-compiled java class archive `libsvm.jar' and its source files are
+in the java directory. To run the programs, use
+
+java -classpath libsvm.jar svm_train <arguments>
+java -classpath libsvm.jar svm_predict <arguments>
+java -classpath libsvm.jar svm_toy
+java -classpath libsvm.jar svm_scale <arguments>
+
+Note that you need Java 1.5 (5.0) or above to run it.
+
+You may need to add the Java runtime library (like classes.zip) to the
+classpath. You may need to increase the maximum Java heap size.
+
+Library usage is similar to the C version. These functions are available:
+
+public class svm {
+ public static final int LIBSVM_VERSION=290;
+ public static svm_print_interface svm_print_string;
+ public static svm_model svm_train(svm_problem prob, svm_parameter param);
+ public static void svm_cross_validation(svm_problem prob, svm_parameter param, int nr_fold, double[] target);
+ public static int svm_get_svm_type(svm_model model);
+ public static int svm_get_nr_class(svm_model model);
+ public static void svm_get_labels(svm_model model, int[] label);
+ public static double svm_get_svr_probability(svm_model model);
+ public static void svm_predict_values(svm_model model, svm_node[] x, double[] dec_values);
+ public static double svm_predict(svm_model model, svm_node[] x);
+ public static double svm_predict_probability(svm_model model, svm_node[] x, double[] prob_estimates);
+ public static void svm_save_model(String model_file_name, svm_model model) throws IOException
+ public static svm_model svm_load_model(String model_file_name) throws IOException
+ public static String svm_check_parameter(svm_problem prob, svm_parameter param);
+ public static int svm_check_probability_model(svm_model model);
+}
+
+The library is in the "libsvm" package.
+Note that in the Java version, svm_node[] is not ended with a node whose index = -1.
+
+Users can specify their output format by
+
+ svm.svm_print_string = new svm_print_interface()
+ {
+ public void print(String s)
+ {
+ // your own format
+ }
+ };
+
+Building Windows Binaries
+=========================
+
+Windows binaries are in the directory `windows'. To build them via
+Visual C++, use the following steps:
+
+1. Open a DOS command box (or Visual Studio Command Prompt) and change
+to the libsvm directory. If the environment variables of VC++ have not
+been set, type
+
+"C:\Program Files\Microsoft Visual Studio 8\VC\bin\vcvars32.bat"
+
+You may have to modify the above command according to which version of
+VC++ you have and where it is installed.
+
+2. Type
+
+nmake -f Makefile.win clean all
+
+3. (optional) To build the Python interface, download and install
+Python. Edit Makefile.win and change PYTHON_INC and PYTHON_LIB to match
+your Python installation. Type
+
+nmake -f Makefile.win python
+
+and then copy windows\python\svmc.pyd to the python directory.
+
+Another way is to build them from within the Visual C++ environment.
+See details in the libsvm FAQ.
+
+Additional Tools: Sub-sampling, Parameter Selection, Format checking, etc.
+============================================================================
+
+See the README file in the tools directory.
+
+Python Interface
+================
+
+See the README file in the python directory.
+
+Additional Information
+======================
+
+If you find LIBSVM helpful, please cite it as
+
+Chih-Chung Chang and Chih-Jen Lin, LIBSVM: a library for
+support vector machines, 2001.
+Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
+
+LIBSVM implementation document is available at
+http://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.pdf
+
+For any questions and comments, please email cjlin@csie.ntu.edu.tw
+
+Acknowledgments:
+This work was supported in part by the National Science
+Council of Taiwan via the grant NSC 89-2213-E-002-013.
+The authors thank their group members and users
+for many helpful discussions and comments. They are listed in
+http://www.csie.ntu.edu.tw/~cjlin/libsvm/acknowledgements
+
72 README.plus
@@ -0,0 +1,72 @@
+Libsvm Plus is a straightforward improvement of the official
+Libsvm library (http://www.csie.ntu.edu.tw/~cjlin/libsvm).
+
+Author: Vincenzo Russo (http://neminis.org)
+Download: http://neminis.org/software/libsvm-plus
+Version: 2.90
+
+What are the differences?
+=========================
+
+1. Only the C++ code is supported and maintained, due to lack of time.
+ No Java code is provided. Other language interfaces (like Python, etc.)
+ should work, but only with the original features of LIBSVM.
+ Anyway, no tests were made.
+
+2. Only Unix: for the same reason stated above, I have only tested on
+ Linux and Mac OS X, which makes LIBSVM Plus likely to work on other
+ modern Unix systems as well. Anyway, you can try the Makefile.win
+ included in the official LIBSVM package to compile and test LIBSVM
+ Plus on Windows platforms;
+
+3. Four additional kernels: Stump, Perceptron, Laplacian, Exponential.
+ Such kernels might be called "infinite ensemble kernels" because a
+ nonlinear SVM which uses them corresponds to an infinite ensemble
+ classifier. See the publications of Hsuan-Tien Lin for more
+ theoretical explanation:
+
+ http://www.work.caltech.edu/~htlin/publication/
+
+ The code for realizing the above kernels was back-ported from his LIBSVM fork
+
+ http://www.work.caltech.edu/~htlin/program/libsvm/#infensemble
+
+ based on the older 2.8 version;
+
+4. Three additional SVM models: Classification (C-SVM) via L2SVM,
+ Support Vector Domain Description (SVDD) via L1SVM and via L2SVM.
+ The code was back-ported from a LIBSVM tool:
+
+ Calculating the radius of the smallest sphere containing all training data.
+ http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/#18
+
+ The SVDD can be used as a One-Class SVM alternative. More theoretical
+ explanation of SVDD can be found in David M.J. Tax's PhD thesis and
+ other papers:
+
+ http://www-ict.ewi.tudelft.nl/~davidt/papers.html
+
+
+Minor changes
+=============
+
+Some additional comments on the source code are provided, and some C
+structures (svm_model and decision_function) were moved from svm.cpp to
+svm.h to allow third-party software to access them more easily.
+Moreover, the svm_model structure now provides three new members:
+SV_idx (indices of the SVs in the original dataset), BSV_idx (indices
+of the BSVs in the original dataset) and lbsv (the number of BSVs).
+Finally, the enumeration element RBF (which in the original LIBSVM
+refers to the Gaussian kernel) was renamed GAUSSIAN, because several
+kernels belong to the RBF class, not only the Gaussian one.
+
+
+License
+=======
+
+For this first release of LIBSVM Plus we chose to use the same license
+as the original LIBSVM library.
+
+
+Version number
+==============
+
+As long as LIBSVM Plus remains a straightforwardly augmented version of
+the official LIBSVM, it will have the same version number as the
+original LIBSVM code used for making the release.
147 py-tools/README
@@ -0,0 +1,147 @@
+This directory includes some useful code:
+
+1. subset selection tools
+2. parameter selection tools
+3. LIBSVM format checking tools
+
+Part I: Subset selection tools
+
+Introduction
+============
+
+Training large data is time consuming. Sometimes one should work on a
+smaller subset first. The python script subset.py randomly selects a
+specified number of samples. For classification data, we provide a
+stratified selection to ensure the same class distribution in the
+subset.
+
+Usage: subset.py [options] dataset number [output1] [output2]
+
+This script selects a subset of the given data set.
+
+options:
+-s method : method of selection (default 0)
+ 0 -- stratified selection (classification only)
+ 1 -- random selection
+
+output1 : the subset (optional)
+output2 : the rest of data (optional)
+
+If output1 is omitted, the subset will be printed on the screen.
+
+Example
+=======
+
+> python subset.py heart_scale 100 file1 file2
+
+From heart_scale, 100 samples are randomly selected and stored in
+file1. All remaining instances are stored in file2.
+
+
+Part II: Parameter Selection Tools
+
+Introduction
+============
+
+grid.py is a parameter selection tool for C-SVM classification using
+the RBF (radial basis function) kernel. It uses the cross validation
+(CV) technique to estimate the accuracy of each parameter combination
+in the specified range and helps you decide the best parameters for
+your problem.
+
+grid.py directly executes libsvm binaries (so no python binding is
+needed) for cross validation and then draws the contour of CV accuracy
+using gnuplot. You must have libsvm and gnuplot installed before using
+it. The gnuplot package is available at http://www.gnuplot.info/
+
+On Mac OS X, the precompiled gnuplot binary needs the AquaTerm library,
+which thus must be installed as well. In addition, this version of
+gnuplot does not support png, so you need to change "set term png
+transparent small" to another image format. For example, you may
+use "set term pbm small color".
+
+Usage: grid.py [-log2c begin,end,step] [-log2g begin,end,step] [-v fold]
+ [-svmtrain pathname] [-gnuplot pathname] [-out pathname] [-png pathname]
+ [additional parameters for svm-train] dataset
+
+The program conducts v-fold cross validation using parameter C (and gamma)
+= 2^begin, 2^(begin+step), ..., 2^end.
+
+You can specify where the libsvm executable and gnuplot are using the
+-svmtrain and -gnuplot parameters.
+
+For Windows users, please use pgnuplot.exe. If you are using gnuplot
+3.7.1, please upgrade to version 3.7.3 or higher, as version 3.7.1
+has a bug. If you use cygwin on Windows, please use gnuplot-x11.
+
+Example
+=======
+
+> python grid.py -log2c -5,5,1 -log2g -4,0,1 -v 5 -m 300 heart_scale
+
+Users (in particular MS Windows users) may need to specify the path of
+executable files. You can either change paths in the beginning of
+grid.py or specify them in the command line. For example,
+
+> grid.py -log2c -5,5,1 -svmtrain c:\libsvm\windows\svm-train.exe -gnuplot c:\tmp\gnuplot\bin\pgnuplot.exe -v 10 heart_scale
+
+Output: two files
+dataset.png: the CV accuracy contour plot generated by gnuplot
+dataset.out: the CV accuracy at each (log2(C),log2(gamma))
+
+Parallel grid search
+====================
+
+You can conduct a parallel grid search by dispatching jobs to a
+cluster of computers which share the same file system. First, you add
+machine names in grid.py:
+
+ssh_workers = ["linux1", "linux5", "linux5"]
+
+and then set up your ssh so that authentication works without
+asking for a password.
+
+The same machine (e.g., linux5 here) can be listed more than once if
+it has multiple CPUs or more RAM. If the local machine is the
+best, you can also increase nr_local_worker. For example:
+
+nr_local_worker = 2
+
+Example:
+
+> python grid.py heart_scale
+[local] -1 -1 78.8889 (best c=0.5, g=0.5, rate=78.8889)
+[linux5] -1 -7 83.3333 (best c=0.5, g=0.0078125, rate=83.3333)
+[linux5] 5 -1 77.037 (best c=0.5, g=0.0078125, rate=83.3333)
+[linux1] 5 -7 83.3333 (best c=0.5, g=0.0078125, rate=83.3333)
+.
+.
+.
+
+If -log2c, -log2g, or -v is not specified, default values are used.
+
+If your system uses telnet instead of ssh, list the computer names
+in telnet_workers instead.
+
+Part III: LIBSVM format checking tools
+
+Introduction
+============
+
+`svm-train' conducts only a simple check of the input data. To do a
+detailed check, we provide the python script `checkdata.py'.
+
+Usage: checkdata.py dataset
+
+This tool is written by Rong-En Fan at National Taiwan University.
+
+Example
+=======
+
+> cat bad_data
+1 3:1 2:4
+> python checkdata.py bad_data
+line 1: feature indices must be in an ascending order, previous/current features 3:1 2:4
+Found 1 lines with error.
+
+
27 py-tools/README.plus
@@ -0,0 +1,27 @@
+This directory includes some useful code:
+
+1. subset selection tools.
+2. parameter selection tools.
+3. LIBSVM format checking tools
+4. a converter from sparse LIBSVM file format to a classic "dense" file format
+5. a converter from a classic "dense" file format to the sparse LIBSVM file format
+
+The first three tools are inherited from the official LIBSVM
+distribution, with differences in the parameter selection tools:
+
+ a. easy.py now accepts a third parameter (numeric, integer) which
+ represents the kernel to use in the process
+ b. grid.py explicitly handles the kernel given in input via the
+ switch "-t <kernel_number>"
+ c. both of the above scripts no longer depend on the presence of
+ gnuplot: they test for gnuplot's existence and, if it is not
+ installed, simply do not use it.
+
+Tools 4 and 5 were originally developed by Hsuan-Tien Lin; I got them
+from
+
+ http://www.work.caltech.edu/~htlin/program/libsvm/#dense
+
+and included them as they are.
+
+
+--
+Vincenzo Russo
106 py-tools/checkdata.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+
+#
+# A format checker for LIBSVM
+#
+
+#
+# Copyright (c) 2007, Rong-En Fan
+#
+# All rights reserved.
+#
+# This program is distributed under the same license of the LIBSVM package.
+#
+
+from sys import argv, exit
+import os.path
+
+def err(line_no, msg):
+ print "line %d: %s" % (line_no, msg)
+
+# works like float() but does not accept nan and inf
+def my_float(x):
+ if x.lower().find("nan") != -1 or x.lower().find("inf") != -1:
+ raise ValueError
+
+ return float(x)
+
+def main():
+ if len(argv) != 2:
+ print "Usage: %s dataset" % (argv[0])
+ exit(1)
+
+ dataset = argv[1]
+
+ if not os.path.exists(dataset):
+ print "dataset %s not found" % (dataset)
+ exit(1)
+
+ line_no = 1
+ error_line_count = 0
+ for line in open(dataset, 'r'):
+ line_error = False
+
+ # each line must end with a newline character
+ if line[-1] != '\n':
+ err(line_no, "missing a newline character in the end")
+ line_error = True
+
+ nodes = line.split()
+
+ # check label
+ try:
+ label = nodes.pop(0)
+
+ if label.find(',') != -1:
+ # multi-label format
+ try:
+ for l in label.split(','):
+ l = my_float(l)
+ except:
+ err(line_no, "label %s is not a valid multi-label form" % label)
+ line_error = True
+ else:
+ try:
+ label = my_float(label)
+ except:
+ err(line_no, "label %s is not a number" % label)
+ line_error = True
+ except:
+ err(line_no, "missing label, perhaps an empty line?")
+ line_error = True
+
+ # check features
+ prev_index = -1
+ for i in range(len(nodes)):
+ try:
+ (index, value) = nodes[i].split(':')
+
+ index = int(index)
+ value = my_float(value)
+
+ # precomputed kernel's index starts from 0 and LIBSVM
+ # checks it. Hence, don't treat index 0 as an error.
+ if index < 0:
+ err(line_no, "feature index must be positive; wrong feature %s" % nodes[i])
+ line_error = True
+ elif index < prev_index:
+ err(line_no, "feature indices must be in an ascending order, previous/current features %s %s" % (nodes[i-1], nodes[i]))
+ line_error = True
+ prev_index = index
+ except:
+ err(line_no, "feature '%s' not an <index>:<value> pair, <index> integer, <value> real number " % nodes[i])
+ line_error = True
+
+ line_no += 1
+
+ if line_error:
+ error_line_count += 1
+
+ if error_line_count > 0:
+ print("Found %d lines with error." % (error_line_count))
+ else:
+ print("No error.")
+
+main()
+
19 py-tools/dense2sparse.py
@@ -0,0 +1,19 @@
+#!/usr/local/bin/python2.0
+
+import os, sys
+
+from string import *
+
+argv=sys.argv
+argc=len(argv)
+
+raw = map(split, open(argv[1]).readlines())
+
+for line in raw:
+ print line[-1],
+ m=1
+ for token in line[:-1]:
+ if atof(token) != 0:
+ print "%d:%s"%(m,token),
+ m=m+1
+ print
82 py-tools/easy.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+import sys
+import os
+from subprocess import *
+
+if len(sys.argv) <= 1:
+ print('Usage: %s training_file [testing_file]' % sys.argv[0])
+ raise SystemExit
+
+# svm, grid, and gnuplot executable files
+
+is_win32 = (sys.platform == 'win32')
+if not is_win32:
+ svmscale_exe = "../svm-scale"
+ svmtrain_exe = "../svm-train"
+ svmpredict_exe = "../svm-predict"
+ grid_py = "./grid.py"
+ gnuplot_exe = "/usr/bin/gnuplot"
+else:
+ # example for windows
+ svmscale_exe = r"..\windows\svm-scale.exe"
+ svmtrain_exe = r"..\windows\svm-train.exe"
+ svmpredict_exe = r"..\windows\svm-predict.exe"
+ gnuplot_exe = r"c:\tmp\gnuplot\bin\pgnuplot.exe"
+ grid_py = r".\grid.py"
+
+assert os.path.exists(svmscale_exe),"svm-scale executable not found"
+assert os.path.exists(svmtrain_exe),"svm-train executable not found"
+assert os.path.exists(svmpredict_exe),"svm-predict executable not found"
+
+# gnuplot is not necessary for the process
+# assert os.path.exists(gnuplot_exe),"gnuplot executable not found"
+
+assert os.path.exists(grid_py),"grid.py not found"
+
+train_pathname = sys.argv[1]
+assert os.path.exists(train_pathname),"training file not found"
+file_name = os.path.split(train_pathname)[1]
+scaled_file = file_name + ".scale"
+model_file = file_name + ".model"
+range_file = file_name + ".range"
+
+if len(sys.argv) > 2:
+ test_pathname = sys.argv[2]
+ file_name = os.path.split(test_pathname)[1]
+ assert os.path.exists(test_pathname),"testing file not found"
+ scaled_test_file = file_name + ".scale"
+ predict_test_file = file_name + ".predict"
+
+cmd = '%s -s "%s" "%s" > "%s"' % (svmscale_exe, range_file, train_pathname, scaled_file)
+print('Scaling training data...')
+call(cmd, shell = True)
+
+cmd = '%s -svmtrain "%s" -gnuplot "%s" "%s"' % (grid_py, svmtrain_exe, gnuplot_exe, scaled_file)
+print('Cross validation...')
+f = Popen(cmd, shell = True, stdout = PIPE).stdout
+
+line = ''
+while True:
+ last_line = line
+ line = f.readline()
+ if not line: break
+c,g,rate = map(float,last_line.split())
+
+print('Best c=%s, g=%s CV rate=%s' % (c,g,rate))
+
+cmd = '%s -c %s -g %s "%s" "%s"' % (svmtrain_exe,c,g,scaled_file,model_file)
+print('Training...')
+Popen(cmd, shell = True, stdout = PIPE).communicate()
+
+print('Output model: %s' % model_file)
+if len(sys.argv) > 2:
+ cmd = '%s -r "%s" "%s" > "%s"' % (svmscale_exe, range_file, test_pathname, scaled_test_file)
+ print('Scaling testing data...')
+ Popen(cmd, shell = True, stdout = PIPE).communicate()
+
+ cmd = '%s "%s" "%s" "%s"' % (svmpredict_exe, scaled_test_file, model_file, predict_test_file)
+ print('Testing...')
+ Popen(cmd, shell = True).communicate()
+
+ print('Output prediction: %s' % predict_test_file)
371 py-tools/grid.py
@@ -0,0 +1,371 @@
+#!/usr/bin/env python
+
+
+
+import os, sys, traceback
+import getpass
+from threading import Thread
+from subprocess import *
+
+if(sys.hexversion < 0x03000000):
+ import Queue
+else:
+ import queue as Queue
+
+
+# svmtrain and gnuplot executable
+
+is_win32 = (sys.platform == 'win32')
+if not is_win32:
+ svmtrain_exe = "../svm-train"
+ gnuplot_exe = "/usr/bin/gnuplot"
+else:
+ # example for windows
+ svmtrain_exe = r"..\windows\svm-train.exe"
+ gnuplot_exe = r"c:\tmp\gnuplot\bin\pgnuplot.exe"
+
+# global parameters and their default values
+
+fold = 5
+c_begin, c_end, c_step = -5, 15, 2
+g_begin, g_end, g_step = 3, -15, -2
+global dataset_pathname, dataset_title, pass_through_string
+global out_filename, png_filename
+
+# experimental
+
+telnet_workers = []
+ssh_workers = []
+nr_local_worker = 1
+
+# process command line options, set global parameters
+def process_options(argv=sys.argv):
+
+ global fold
+ global c_begin, c_end, c_step
+ global g_begin, g_end, g_step
+ global dataset_pathname, dataset_title, pass_through_string
+ global svmtrain_exe, gnuplot_exe, gnuplot, out_filename, png_filename
+
+ usage = """\
+Usage: grid.py [-log2c begin,end,step] [-log2g begin,end,step] [-v fold]
+[-svmtrain pathname] [-gnuplot pathname] [-out pathname] [-png pathname]
+[additional parameters for svm-train] dataset"""
+
+ if len(argv) < 2:
+ print(usage)
+ sys.exit(1)
+
+ dataset_pathname = argv[-1]
+ dataset_title = os.path.split(dataset_pathname)[1]
+ out_filename = '%s.out' % dataset_title
+ png_filename = '%s.png' % dataset_title
+ pass_through_options = []
+
+ i = 1
+ while i < len(argv) - 1:
+ if argv[i] == "-log2c":
+ i = i + 1
+ (c_begin,c_end,c_step) = map(float,split(argv[i],","))
+ elif argv[i] == "-log2g":
+ i = i + 1
+ (g_begin,g_end,g_step) = map(float,split(argv[i],","))
+ elif argv[i] == "-v":
+ i = i + 1
+ fold = argv[i]
+ elif argv[i] in ('-c','-g'):
+ print("Option -c and -g are renamed.")
+ print(usage)
+ sys.exit(1)
+ elif argv[i] == '-svmtrain':
+ i = i + 1
+ svmtrain_exe = argv[i]
+ elif argv[i] == '-gnuplot':
+ i = i + 1
+ gnuplot_exe = argv[i]
+ elif argv[i] == '-out':
+ i = i + 1
+ out_filename = argv[i]
+ elif argv[i] == '-png':
+ i = i + 1
+ png_filename = argv[i]
+ else:
+ pass_through_options.append(argv[i])
+ i = i + 1
+
+ pass_through_string = " ".join(pass_through_options)
+ assert os.path.exists(svmtrain_exe),"svm-train executable not found"
+
+ # gnuplot is not necessary for the process
+ #assert os.path.exists(gnuplot_exe),"gnuplot executable not found"
+
+ assert os.path.exists(dataset_pathname),"dataset not found"
+ gnuplot = None
+ if os.path.exists(gnuplot_exe):
+ gnuplot = Popen(gnuplot_exe,stdin = PIPE).stdin
+
+
+def range_f(begin,end,step):
+ # like range, but works on non-integer too
+ seq = []
+ while True:
+ if step > 0 and begin > end: break
+ if step < 0 and begin < end: break
+ seq.append(begin)
+ begin = begin + step
+ return seq
+
+def permute_sequence(seq):
+ n = len(seq)
+ if n <= 1: return seq
+
+ mid = int(n/2)
+ left = permute_sequence(seq[:mid])
+ right = permute_sequence(seq[mid+1:])
+
+ ret = [seq[mid]]
+ while left or right:
+ if left: ret.append(left.pop(0))
+ if right: ret.append(right.pop(0))
+
+ return ret
+
+def redraw(db,best_param,tofile=False):
+ if len(db) == 0: return
+ begin_level = round(max(x[2] for x in db)) - 3
+ step_size = 0.5
+
+ best_log2c,best_log2g,best_rate = best_param
+
+ if tofile:
+ if gnuplot != None:
+ gnuplot.write("set term png transparent small\n")
+ gnuplot.write("set output \"%s\"\n" % png_filename.replace('\\','\\\\'))
+ #gnuplot.write("set term postscript color solid\n")
+ #gnuplot.write("set output \"%s.ps\"\n" % dataset_title)
+ elif is_win32:
+ if gnuplot != None:
+ gnuplot.write("set term windows\n")
+ else:
+ if gnuplot != None:
+ gnuplot.write("set term x11\n")
+ gnuplot.write("set xlabel \"log2(C)\"\n")
+ gnuplot.write("set ylabel \"log2(gamma)\"\n")
+ gnuplot.write("set xrange [%s:%s]\n" % (c_begin,c_end))
+ gnuplot.write("set yrange [%s:%s]\n" % (g_begin,g_end))
+ gnuplot.write("set contour\n")
+ gnuplot.write("set cntrparam levels incremental %s,%s,100\n" % (begin_level,step_size))
+ gnuplot.write("unset surface\n")
+ gnuplot.write("unset ztics\n")
+ gnuplot.write("set view 0,0\n")
+ gnuplot.write("set title \"%s\"\n" % dataset_title)
+ gnuplot.write("unset label\n")
+ gnuplot.write("set label \"Best log2(C) = %s log2(gamma) = %s accuracy = %s%%\" \
+ at screen 0.5,0.85 center\n" % \
+ (best_log2c, best_log2g, best_rate))
+ gnuplot.write("set label \"C = %s gamma = %s\""
+ " at screen 0.5,0.8 center\n" % (2**best_log2c, 2**best_log2g))
+ gnuplot.write("splot \"-\" with lines\n")
+ def cmp (x,y):
+ if x[0] < y[0]: return -1
+ if x[0] > y[0]: return 1
+ if x[1] > y[1]: return -1
+ if x[1] < y[1]: return 1
+ return 0
+ db.sort(cmp)
+ prevc = db[0][0]
+ for line in db:
+ if prevc != line[0]:
+ prevc = line[0]
+ if gnuplot != None:
+ gnuplot.write("\n")
+ if gnuplot != None:
+ gnuplot.write("%s %s %s\n" % line)
+ if gnuplot != None:
+ gnuplot.write("e\n")
+ gnuplot.write("\n") # force gnuplot back to prompt when term set failure
+ gnuplot.flush()
+
+
+def calculate_jobs():
+ c_seq = permute_sequence(range_f(c_begin,c_end,c_step))
+ g_seq = permute_sequence(range_f(g_begin,g_end,g_step))
+ nr_c = float(len(c_seq))
+ nr_g = float(len(g_seq))
+ i = 0
+ j = 0
+ jobs = []
+
+ while i < nr_c or j < nr_g:
+ if i/nr_c < j/nr_g:
+ # increase C resolution
+ line = []
+ for k in range(0,j):
+ line.append((c_seq[i],g_seq[k]))
+ i = i + 1
+ jobs.append(line)
+ else:
+ # increase g resolution
+ line = []
+ for k in range(0,i):
+ line.append((c_seq[k],g_seq[j]))
+ j = j + 1
+ jobs.append(line)
+ return jobs
+
+class WorkerStopToken: # used to notify the worker to stop
+ pass
+
+class Worker(Thread):
+ def __init__(self,name,job_queue,result_queue):
+ Thread.__init__(self)
+ self.name = name
+ self.job_queue = job_queue
+ self.result_queue = result_queue
+ def run(self):
+ while True:
+ (cexp,gexp) = self.job_queue.get()
+ if cexp is WorkerStopToken:
+ self.job_queue.put((cexp,gexp))
+ # print 'worker %s stop.' % self.name
+ break
+ try:
+ rate = self.run_one(2.0**cexp,2.0**gexp)
+ if rate is None: raise RuntimeError("get no rate")
+ except:
+ # we failed, let others do that and we just quit
+
+ traceback.print_exception(sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2])
+
+ self.job_queue.put((cexp,gexp))
+ print('worker %s quit.' % self.name)
+ break
+ else:
+ self.result_queue.put((self.name,cexp,gexp,rate))
+
+class LocalWorker(Worker):
+ def run_one(self,c,g):
+ cmdline = '%s -c %s -g %s -v %s %s %s' % \
+ (svmtrain_exe,c,g,fold,pass_through_string,dataset_pathname)
+ result = Popen(cmdline,shell=True,stdout=PIPE).stdout
+ for line in result.readlines():
+ if str(line).find("Cross") != -1:
+ return float(line.split()[-1][0:-1])
+
+class SSHWorker(Worker):
+ def __init__(self,name,job_queue,result_queue,host):
+ Worker.__init__(self,name,job_queue,result_queue)
+ self.host = host
+ self.cwd = os.getcwd()
+ def run_one(self,c,g):
+ cmdline = 'ssh -x %s "cd %s; %s -c %s -g %s -v %s %s %s"' % \
+ (self.host,self.cwd,
+ svmtrain_exe,c,g,fold,pass_through_string,dataset_pathname)
+ result = Popen(cmdline,shell=True,stdout=PIPE).stdout
+ for line in result.readlines():
+ if str(line).find("Cross") != -1:
+ return float(line.split()[-1][0:-1])
+
+class TelnetWorker(Worker):
+ def __init__(self,name,job_queue,result_queue,host,username,password):
+ Worker.__init__(self,name,job_queue,result_queue)
+ self.host = host
+ self.username = username
+ self.password = password
+ def run(self):
+ import telnetlib
+ self.tn = tn = telnetlib.Telnet(self.host)
+ tn.read_until("login: ")
+ tn.write(self.username + "\n")
+ tn.read_until("Password: ")
+ tn.write(self.password + "\n")
+
+ # XXX: how to know whether login is successful?
+ tn.read_until(self.username)
+ #
+ print('login ok', self.host)
+ tn.write("cd "+os.getcwd()+"\n")
+ Worker.run(self)
+ tn.write("exit\n")
+ def run_one(self,c,g):
+ cmdline = '%s -c %s -g %s -v %s %s %s' % \
+ (svmtrain_exe,c,g,fold,pass_through_string,dataset_pathname)
+ result = self.tn.write(cmdline+'\n')
+ (idx,matchm,output) = self.tn.expect(['Cross.*\n'])
+ for line in output.split('\n'):
+ if str(line).find("Cross") != -1:
+ return float(line.split()[-1][0:-1])
+
+def main():
+
+ # set parameters
+
+ process_options()
+
+ # put jobs in queue
+
+ jobs = calculate_jobs()
+ job_queue = Queue.Queue(0)
+ result_queue = Queue.Queue(0)
+
+ for line in jobs:
+ for (c,g) in line:
+ job_queue.put((c,g))
+
+ job_queue._put = job_queue.queue.appendleft
+
+
+ # fire telnet workers
+
+ if telnet_workers:
+ nr_telnet_worker = len(telnet_workers)
+ username = getpass.getuser()
+ password = getpass.getpass()
+ for host in telnet_workers:
+ TelnetWorker(host,job_queue,result_queue,
+ host,username,password).start()
+
+ # fire ssh workers
+
+ if ssh_workers:
+ for host in ssh_workers:
+ SSHWorker(host,job_queue,result_queue,host).start()
+
+ # fire local workers
+
+ for i in range(nr_local_worker):
+ LocalWorker('local',job_queue,result_queue).start()
+
+ # gather results
+
+ done_jobs = {}
+
+
+ result_file = open(out_filename, 'w')
+
+
+ db = []
+ best_rate = -1
+ best_c1,best_g1 = None,None
+
+ for line in jobs:
+ for (c,g) in line:
+ while (c, g) not in done_jobs:
+ (worker,c1,g1,rate) = result_queue.get()
+ done_jobs[(c1,g1)] = rate
+ result_file.write('%s %s %s\n' %(c1,g1,rate))
+ result_file.flush()
+ if (rate > best_rate) or (rate==best_rate and g1==best_g1 and c1<best_c1):
+ best_rate = rate
+ best_c1,best_g1=c1,g1
+ best_c = 2.0**c1
+ best_g = 2.0**g1
+ print("[%s] %s %s %s (best c=%s, g=%s, rate=%s)" % \
+ (worker,c1,g1,rate, best_c, best_g, best_rate))
+ db.append((c,g,done_jobs[(c,g)]))
+ redraw(db,[best_c1, best_g1, best_rate])
+ redraw(db,[best_c1, best_g1, best_rate],True)
+
+ job_queue.put((WorkerStopToken,None))
+ print "%s %s %s" % (best_c, best_g, best_rate)
+main()
35 py-tools/sparse2dense.py
@@ -0,0 +1,35 @@
+#!/usr/local/bin/python2.0
+
+import os, sys
+
+from string import *
+
+argv=sys.argv
+argc=len(argv)
+
+raw = map(split, open(argv[1]).readlines())
+
+m=-1
+data=[]
+for line in raw:
+ dline = [line[0]]
+ begin=1
+ for token in line[1:]:
+ both=split(token, ":")
+ next=atoi(both[0])
+ if next>m:
+ m=next
+ for i in range(begin, next):
+ dline.append("0")
+ dline.append(both[1])
+ begin=next+1
+ data.append(dline)
+
+for dline in data:
+ for token in dline[1:]:
+ print token,
+ for i in range(len(dline), m+1):
+ print 0,
+ print dline[0]
+
+
146 py-tools/subset.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+from sys import argv, exit, stdout, stderr
+from random import randint
+
+method = 0
+global n
+global dataset_filename
+subset_filename = ""
+rest_filename = ""
+
+def exit_with_help():
+ print("""\
+Usage: %s [options] dataset number [output1] [output2]
+
+This script selects a subset of the given dataset.
+
+options:
+-s method : method of selection (default 0)
+ 0 -- stratified selection (classification only)
+ 1 -- random selection
+
+output1 : the subset (optional)
+output2 : rest of the data (optional)
+If output1 is omitted, the subset will be printed on the screen.""" % argv[0])
+ exit(1)
+
+def process_options():
+ global method, n
+ global dataset_filename, subset_filename, rest_filename
+
+ argc = len(argv)
+ if argc < 3:
+ exit_with_help()
+
+ i = 1
+ while i < len(argv):
+ if argv[i][0] != "-":
+ break
+ if argv[i] == "-s":
+ i = i + 1
+ method = int(argv[i])
+ if method < 0 or method > 1:
+ print("Unknown selection method %d" % (method))
+ exit_with_help()
+ i = i + 1
+
+ dataset_filename = argv[i]
+ n = int(argv[i+1])
+ if i+2 < argc:
+ subset_filename = argv[i+2]
+ if i+3 < argc:
+ rest_filename = argv[i+3]
+
+def main():
+ class Label:
+ def __init__(self, label, index, selected):
+ self.label = label
+ self.index = index
+ self.selected = selected
+
+ process_options()
+
+ # get labels
+ i = 0
+ labels = []
+ f = open(dataset_filename, 'r')
+ for line in f:
+ labels.append(Label(float((line.split())[0]), i, 0))
+ i = i + 1
+ f.close()
+ l = i
+
+ # determine where to output
+ if subset_filename != "":
+ file1 = open(subset_filename, 'w')
+ else:
+ file1 = stdout
+ split = 0
+ if rest_filename != "":
+ split = 1
+ file2 = open(rest_filename, 'w')
+
+ # select the subset
+ warning = 0
+ if method == 0: # stratified
+ labels.sort(key = lambda x: x.label)
+
+ label_end = labels[l-1].label + 1
+ labels.append(Label(label_end, l, 0))
+
+ begin = 0
+ label = labels[begin].label
+ for i in range(l+1):
+ new_label = labels[i].label
+ if new_label != label:
+ nr_class = i - begin
+ k = i*n//l - begin*n//l
+ # at least one instance per class
+ if k == 0:
+ k = 1
+ warning = warning + 1
+ for j in range(nr_class):
+ if randint(0, nr_class-j-1) < k:
+ labels[begin+j].selected = 1
+ k = k - 1
+ begin = i
+ label = new_label
+ elif method == 1: # random
+ k = n
+ for i in range(l):
+ if randint(0,l-i-1) < k:
+ labels[i].selected = 1
+ k = k - 1
+ i = i + 1
+
+ # output
+ i = 0
+ if method == 0:
+ labels.sort(key = lambda x: int(x.index))
+
+ f = open(dataset_filename, 'r')
+ for line in f:
+ if labels[i].selected == 1:
+ file1.write(line)
+ else:
+ if split == 1:
+ file2.write(line)
+ i = i + 1
+
+ if warning > 0:
+ stderr.write("""\
+Warning:
+1. You may have regression data. Please use -s 1.
+2. Classification data unbalanced or too small. We select at least 1 per class.
+ The subset thus contains %d instances.
+""" % (n+warning))
+
+ # cleanup
+ f.close()
+
+ file1.close()
+
+ if split == 1:
+ file2.close()
+
+main()
226 svm-predict.c
@@ -0,0 +1,226 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include "svm.h"
+
+struct svm_node *x;
+int max_nr_attr = 64;
+
+struct svm_model* model;
+int predict_probability=0;
+
+static char *line = NULL;
+static int max_line_len;
+
+static char* readline(FILE *input)
+{
+ int len;
+
+ if(fgets(line,max_line_len,input) == NULL)
+ return NULL;
+
+ while(strrchr(line,'\n') == NULL)
+ {
+ max_line_len *= 2;
+ line = (char *) realloc(line,max_line_len);
+ len = (int) strlen(line);
+ if(fgets(line+len,max_line_len-len,input) == NULL)
+ break;
+ }
+ return line;
+}
+
+void exit_input_error(int line_num)
+{
+ fprintf(stderr,"Wrong input format at line %d\n", line_num);
+ exit(1);
+}
+
+void predict(FILE *input, FILE *output)
+{
+ int correct = 0;
+ int total = 0;
+ double error = 0;
+ double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0;
+
+ int svm_type=svm_get_svm_type(model);
+ int nr_class=svm_get_nr_class(model);
+ double *prob_estimates=NULL;
+ int j;
+
+ if(predict_probability)
+ {
+ if (svm_type==NU_SVR || svm_type==EPSILON_SVR)
+ printf("Prob. model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma=%g\n",svm_get_svr_probability(model));
+ else
+ {
+ int *labels=(int *) malloc(nr_class*sizeof(int));
+ svm_get_labels(model,labels);
+ prob_estimates = (double *) malloc(nr_class*sizeof(double));
+ fprintf(output,"labels");
+ for(j=0;j<nr_class;j++)
+ fprintf(output," %d",labels[j]);
+ fprintf(output,"\n");
+ free(labels);
+ }
+ }
+
+ max_line_len = 1024;
+ line = (char *)malloc(max_line_len*sizeof(char));
+ while(readline(input) != NULL)
+ {
+ int i = 0;
+ double target_label, predict_label;
+ char *idx, *val, *label, *endptr;
+ int inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0
+
+ label = strtok(line," \t");
+ target_label = strtod(label,&endptr);
+ if(endptr == label)
+ exit_input_error(total+1);
+
+ while(1)
+ {
+ if(i>=max_nr_attr-1) // need one more for index = -1
+ {
+ max_nr_attr *= 2;
+ x = (struct svm_node *) realloc(x,max_nr_attr*sizeof(struct svm_node));
+ }
+
+ idx = strtok(NULL,":");
+ val = strtok(NULL," \t");
+
+ if(val == NULL)
+ break;
+ errno = 0;
+ x[i].index = (int) strtol(idx,&endptr,10);
+ if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index)
+ exit_input_error(total+1);
+ else
+ inst_max_index = x[i].index;
+
+ errno = 0;
+ x[i].value = strtod(val,&endptr);
+ if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
+ exit_input_error(total+1);
+
+ ++i;
+ }
+ x[i].index = -1;
+
+ if (predict_probability && (svm_type==C_SVC || svm_type==NU_SVC))
+ {
+ predict_label = svm_predict_probability(model,x,prob_estimates);
+ fprintf(output,"%g",predict_label);
+ for(j=0;j<nr_class;j++)
+ fprintf(output," %g",prob_estimates[j]);
+ fprintf(output,"\n");
+ }
+ else
+ {
+ predict_label = svm_predict(model,x);
+ fprintf(output,"%g\n",predict_label);
+ }
+
+ if(predict_label == target_label)
+ ++correct;
+ error += (predict_label-target_label)*(predict_label-target_label);
+ sump += predict_label;
+ sumt += target_label;
+ sumpp += predict_label*predict_label;
+ sumtt += target_label*target_label;
+ sumpt += predict_label*target_label;
+ ++total;
+ }
+ if (svm_type==NU_SVR || svm_type==EPSILON_SVR)
+ {
+ printf("Mean squared error = %g (regression)\n",error/total);
+ printf("Squared correlation coefficient = %g (regression)\n",
+ ((total*sumpt-sump*sumt)*(total*sumpt-sump*sumt))/
+ ((total*sumpp-sump*sump)*(total*sumtt-sumt*sumt))
+ );
+ }
+ else
+ printf("Accuracy = %g%% (%d/%d) (classification)\n",
+ (double)correct/total*100,correct,total);
+ if(predict_probability)
+ free(prob_estimates);
+}
+
+void exit_with_help()
+{
+ printf(
+ "Usage: svm-predict [options] test_file model_file output_file\n"
+ "options:\n"
+ "-b probability_estimates: whether to predict probability estimates, 0 or 1 (default 0); for one-class SVM only 0 is supported\n"
+ );
+ exit(1);
+}
+
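+// Example invocation (illustrative):
+//   svm-predict -b 1 test_file model_file output_file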
+int main(int argc, char **argv)
+{
+ FILE *input, *output;
+ int i;
+
+ // parse options
+ for(i=1;i<argc;i++)
+ {
+ if(argv[i][0] != '-') break;
+ if(++i>=argc)
+ exit_with_help();
+ switch(argv[i-1][1])
+ {
+ case 'b':
+ predict_probability = atoi(argv[i]);
+ break;
+ default:
+ fprintf(stderr,"Unknown option: -%c\n", argv[i-1][1]);
+ exit_with_help();
+ }
+ }
+ if(i>=argc-2)
+ exit_with_help();
+
+ input = fopen(argv[i],"r");
+ if(input == NULL)
+ {
+ fprintf(stderr,"can't open input file %s\n",argv[i]);
+ exit(1);
+ }
+
+ output = fopen(argv[i+2],"w");
+ if(output == NULL)
+ {
+ fprintf(stderr,"can't open output file %s\n",argv[i+2]);
+ exit(1);
+ }
+
+ if((model=svm_load_model(argv[i+1]))==0)
+ {
+ fprintf(stderr,"can't open model file %s\n",argv[i+1]);
+ exit(1);
+ }
+
+ x = (struct svm_node *) malloc(max_nr_attr*sizeof(struct svm_node));
+ if(predict_probability)
+ {
+ if(svm_check_probability_model(model)==0)
+ {
+ fprintf(stderr,"Model does not support probabiliy estimates\n");
+ exit(1);
+ }
+ }
+ else
+ {
+ if(svm_check_probability_model(model)!=0)
+ printf("Model supports probability estimates, but disabled in prediction.\n");
+ }
+ predict(input,output);
+ svm_destroy_model(model);
+ free(x);
+ free(line);
+ fclose(input);
+ fclose(output);
+ return 0;
+}
353 svm-scale.c
@@ -0,0 +1,353 @@
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+
+void exit_with_help()
+{
+ printf(
+ "Usage: svm-scale [options] data_filename\n"
+ "options:\n"
+ "-l lower : x scaling lower limit (default -1)\n"
+ "-u upper : x scaling upper limit (default +1)\n"
+ "-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
+ "-s save_filename : save scaling parameters to save_filename\n"
+ "-r restore_filename : restore scaling parameters from restore_filename\n"
+ );
+ exit(1);
+}
+
+char *line = NULL;
+int max_line_len = 1024;
+double lower=-1.0,upper=1.0,y_lower,y_upper;
+int y_scaling = 0;
+double *feature_max;
+double *feature_min;
+double y_max = -DBL_MAX;
+double y_min = DBL_MAX;
+int max_index;
+long int num_nonzeros = 0;
+long int new_num_nonzeros = 0;
+
+#define max(x,y) (((x)>(y))?(x):(y))
+#define min(x,y) (((x)<(y))?(x):(y))
+
+void output_target(double value);
+void output(int index, double value);
+char* readline(FILE *input);
+
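+/* svm-scale makes three passes over the data file: pass 1 finds the max
+   feature index, pass 2 records per-feature min/max (or restores them with
+   -r), and pass 3 writes the scaled data to stdout */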
+int main(int argc,char **argv)
+{
+ int i,index;
+ FILE *fp, *fp_restore = NULL;
+ char *save_filename = NULL;
+ char *restore_filename = NULL;
+
+ for(i=1;i<argc;i++)
+ {
+ if(argv[i][0] != '-') break;
+ ++i;
+ switch(argv[i-1][1])
+ {
+ case 'l': lower = atof(argv[i]); break;
+ case 'u': upper = atof(argv[i]); break;
+ case 'y':
+ y_lower = atof(argv[i]);
+ ++i;
+ y_upper = atof(argv[i]);
+ y_scaling = 1;
+ break;
+ case 's': save_filename = argv[i]; break;
+ case 'r': restore_filename = argv[i]; break;
+ default:
+ fprintf(stderr,"unknown option\n");
+ exit_with_help();
+ }
+ }
+
+ if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))
+ {
+ fprintf(stderr,"inconsistent lower/upper specification\n");
+ exit(1);
+ }
+
+ if(restore_filename && save_filename)
+ {
+ fprintf(stderr,"cannot use -r and -s simultaneously\n");
+ exit(1);
+ }
+
+ if(argc != i+1)
+ exit_with_help();
+
+ fp=fopen(argv[i],"r");
+
+ if(fp==NULL)
+ {
+ fprintf(stderr,"can't open file %s\n", argv[i]);
+ exit(1);
+ }
+
+ line = (char *) malloc(max_line_len*sizeof(char));
+
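+/* SKIP_TARGET advances p past leading whitespace and the target label;
+   SKIP_ELEMENT advances p past one "index:value" element */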
+#define SKIP_TARGET\
+ while(isspace(*p)) ++p;\
+ while(!isspace(*p)) ++p;
+
+#define SKIP_ELEMENT\
+ while(*p!=':') ++p;\
+ ++p;\
+ while(isspace(*p)) ++p;\
+ while(*p && !isspace(*p)) ++p;
+
+ /* assumption: min index of attributes is 1 */
+ /* pass 1: find out max index of attributes */
+ max_index = 0;
+
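+ /* when restoring, scan the restore file first so that max_index also
+    covers indices that appear only there */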
+ if(restore_filename)
+ {
+ int idx, c;
+
+ fp_restore = fopen(restore_filename,"r");
+ if(fp_restore==NULL)
+ {
+ fprintf(stderr,"can't open file %s\n", restore_filename);
+ exit(1);
+ }
+
+ c = fgetc(fp_restore);
+ if(c == 'y')
+ {
+ readline(fp_restore);
+ readline(fp_restore);
+ readline(fp_restore);
+ }
+ readline(fp_restore);
+ readline(fp_restore);
+
+ while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
+ max_index = max(idx,max_index);
+ rewind(fp_restore);
+ }
+
+ while(readline(fp)!=NULL)
+ {
+ char *p=line;
+
+ SKIP_TARGET
+
+ while(sscanf(p,"%d:%*f",&index)==1)
+ {
+ max_index = max(max_index, index);
+ SKIP_ELEMENT
+ num_nonzeros++;
+ }
+ }
+ rewind(fp);
+
+ feature_max = (double *)malloc((max_index+1)* sizeof(double));
+ feature_min = (double *)malloc((max_index+1)* sizeof(double));
+
+ if(feature_max == NULL || feature_min == NULL)
+ {
+ fprintf(stderr,"can't allocate enough memory\n");
+ exit(1);
+ }
+
+ for(i=0;i<=max_index;i++)
+ {
+ feature_max[i]=-DBL_MAX;
+ feature_min[i]=DBL_MAX;
+ }
+
+ /* pass 2: find out min/max value */
+ while(readline(fp)!=NULL)
+ {
+ char *p=line;
+ int next_index=1;
+ double target;
+ double value;
+
+ sscanf(p,"%lf",&target);
+ y_max = max(y_max,target);
+ y_min = min(y_min,target);
+
+ SKIP_TARGET
+
+ while(sscanf(p,"%d:%lf",&index,&value)==2)
+ {
+ for(i=next_index;i<index;i++)
+ {
+ feature_max[i]=max(feature_max[i],0);
+ feature_min[i]=min(feature_min[i],0);
+ }
+
+ feature_max[index]=max(feature_max[index],value);
+ feature_min[index]=min(feature_min[index],value);
+
+ SKIP_ELEMENT
+ next_index=index+1;
+ }
+
+ for(i=next_index;i<=max_index;i++)
+ {
+ feature_max[i]=max(feature_max[i],0);
+ feature_min[i]=min(feature_min[i],0);
+ }
+ }
+
+ rewind(fp);
+
+ /* pass 2.5: save/restore feature_min/feature_max */
+
+ if(restore_filename)
+ {
+ /* fp_restore was rewound while finding max_index */
+ int idx, c;
+ double fmin, fmax;
+
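+ /* restore file format: an optional "y" block (y_lower y_upper, then
+    y_min y_max) followed by an "x" block (lower upper, then one
+    "index min max" line per feature) */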
+ if((c = fgetc(fp_restore)) == 'y')
+ {
+ fscanf(fp_restore, "%lf %lf\n", &y_lower, &y_upper);
+ fscanf(fp_restore, "%lf %lf\n", &y_min, &y_max);
+ y_scaling = 1;
+ }
+ else
+ ungetc(c, fp_restore);
+
+ if (fgetc(fp_restore) == 'x') {
+ fscanf(fp_restore, "%lf %lf\n", &lower, &upper);
+ while(fscanf(fp_restore,"%d %lf %lf\n",&idx,&fmin,&fmax)==3)
+ {
+ if(idx<=max_index)
+ {
+ feature_min[idx] = fmin;
+ feature_max[idx] = fmax;
+ }
+ }
+ }
+ fclose(fp_restore);
+ }
+
+ if(save_filename)
+ {
+ FILE *fp_save = fopen(save_filename,"w");
+ if(fp_save==NULL)
+ {
+ fprintf(stderr,"can't open file %s\n", save_filename);
+ exit(1);
+ }
+ if(y_scaling)
+ {
+ fprintf(fp_save, "y\n");
+ fprintf(fp_save, "%.16g %.16g\n", y_lower, y_upper);
+ fprintf(fp_save, "%.16g %.16g\n", y_min, y_max);
+ }
+ fprintf(fp_save, "x\n");
+ fprintf(fp_save, "%.16g %.16g\n", lower, upper);
+ for(i=1;i<=max_index;i++)
+ {
+ if(feature_min[i]!=feature_max[i])
+ fprintf(fp_save,"%d %.16g %.16g\n",i,feature_min[i],feature_max[i]);
+ }
+ fclose(fp_save);
+ }
+
+ /* pass 3: scale */
+ while(readline(fp)!=NULL)
+ {
+ char *p=line;
+ int next_index=1;
+ double target;
+ double value;
+
+ sscanf(p,"%lf",&target);
+ output_target(target);
+
+ SKIP_TARGET
+
+ while(sscanf(p,"%d:%lf",&index,&value)==2)
+ {
+ for(i=next_index;i<index;i++)
+ output(i,0);
+
+ output(index,value);
+
+ SKIP_ELEMENT
+ next_index=index+1;
+ }
+
+ for(i=next_index;i<=max_index;i++)
+ output(i,0);
+
+ printf("\n");
+ }
+
+ if (new_num_nonzeros > num_nonzeros)
+ fprintf(stderr,
+ "Warning: original #nonzeros %ld\n"
+ " new #nonzeros %ld\n"
+ "Use -l 0 if many original feature values are zeros\n",
+ num_nonzeros, new_num_nonzeros);
+
+ free(line);
+ free(feature_max);
+ free(feature_min);
+ fclose(fp);
+ return 0;
+}
+
+char* readline(FILE *input)
+{
+ int len;
+
+ if(fgets(line,max_line_len,input) == NULL)
+ return NULL;
+
+ while(strrchr(line,'\n') == NULL)
+ {
+ max_line_len *= 2;
+ line = (char *) realloc(line, max_line_len);
+ len = (int) strlen(line);
+ if(fgets(line+len,max_line_len-len,input) == NULL)
+ break;
+ }
+ return line;
+}
+
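+/* min-max scaling: v' = lower + (upper-lower)*(v-min)/(max-min);
+   values equal to min or max are pinned exactly to lower or upper */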
+void output_target(double value)
+{
+ if(y_scaling)
+ {
+ if(value == y_min)
+ value = y_lower;
+ else if(value == y_max)
+ value = y_upper;
+ else value = y_lower + (y_upper-y_lower) *
+ (value - y_min)/(y_max-y_min);
+ }
+ printf("%g ",value);
+}
+
+void output(int index, double value)
+{
+ /* skip single-valued attribute */
+ if(feature_max[index] == feature_min[index])
+ return;
+
+ if(value == feature_min[index])
+ value = lower;
+ else if(value == feature_max[index])
+ value = upper;
+ else
+ value = lower + (upper-lower) *
+ (value-feature_min[index])/
+ (feature_max[index]-feature_min[index]);
+
+ if(value != 0)
+ {
+ printf("%d:%g ",index, value);
+ new_num_nonzeros++;
+ }
+}
378 svm-train.c
@@ -0,0 +1,378 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+#include "svm.h"
+#define Malloc(type,n) (type *)malloc((n)*sizeof(type))
+
+void print_null(const char *s) {}
+
+void exit_with_help()
+{
+ printf(
+ "Usage: svm-train [options] training_set_file [model_file]\n"
+ "options:\n"
+ "-s svm_type : set type of SVM (default 0)\n"
+ "\t%d -- C-SVC (L1SVM)\n"
+ "\t%d -- C-SVC (L2SVM)\n"
+ "\t%d -- nu-SVC\n"
+ "\t%d -- one-class SVM\n"
+ "\t%d -- epsilon-SVR\n"
+ "\t%d -- nu-SVR\n"
+ "\t%d -- SVDD (L1SVM)\n"
+ "\t%d -- SVDD (L2SVM)\n"
+ "-t kernel_type : set type of kernel function (default 2)\n"
+ "\t%d -- linear: u'*v\n"
+ "\t%d -- polynomial: (gamma*u'*v + coef0)^degree\n"
+ "\t%d -- gaussian: exp(-gamma*||u-v||^2)\n"
+ "\t%d -- sigmoid: tanh(gamma*u'*v + coef0)\n"
+ "\t%d -- stump: -|u-v| + coef0\n"
+ "\t%d -- perceptron: -||u-v|| + coef0\n"
+ "\t%d -- laplacian: exp(-gamma*|u-v|)\n"
+ "\t%d -- exponential: exp(-gamma*||u-v||)\n"
+ "\t%d -- precomputed kernel (kernel values in training_set_file)\n"
+ "-d degree: set degree in kernel function (default 3)\n"
+ "-g gamma: set gamma in kernel function (default 1/num_features)\n"
+ "-r coef0: set coef0 in kernel function (default 0)\n"
+ "-c cost: set the parameter C of C-SVC (L1/L2-SVM), SVDD (L1/L2-SVM), epsilon-SVR, and nu-SVR (default 1)\n"
+ "-n nu: set the parameter nu of nu-SVC, one-class SVM, and nu-SVR (default 0.5)\n"
+ "-p epsilon: set the epsilon in loss function of epsilon-SVR (default 0.1)\n"
+ "-m cachesize: set cache memory size in MB (default 100)\n"
+ "-e epsilon: set tolerance of termination criterion (default 0.001)\n"
+ "-h shrinking: whether to use the shrinking heuristics, 0 or 1 (default 1)\n"
+ "-b probability_estimates: whether to train a SVC or SVR model for probability estimates, 0 or 1 (default 0)\n"
+ "-wi weight: set the parameter C of class i to weight*C, for C-SVC (default 1)\n"
+ "-v n: n-fold cross validation mode\n"
+ "-q: quiet mode (no outputs)\n", C_SVC, C_SVC_L2, NU_SVC, ONE_CLASS,
+ EPSILON_SVR, NU_SVR, SVDD, SVDD_L2,
+ LINEAR, POLY, GAUSSIAN, SIGMOID, STUMP,
+ PERC, LAPLACE, EXPO, PRECOMPUTED);
+ exit(1);
+}
+
+void exit_input_error(int line_num)
+{
+ fprintf(stderr,"Wrong input format at line %d\n", line_num);
+ exit(1);
+}
+
+void parse_command_line(int argc, char **argv, char *input_file_name, char *model_file_name);
+void read_problem(const char *filename);
+void do_cross_validation();
+
+struct svm_parameter param; // set by parse_command_line
+struct svm_problem prob; // set by read_problem
+struct svm_model *model;
+struct svm_node *x_space;
+int cross_validation;
+int nr_fold;
+
+static char *line = NULL;
+static int max_line_len;
+
+static char* readline(FILE *input)
+{
+ int len;
+
+ if(fgets(line,max_line_len,input) == NULL)
+ return NULL;
+
+ while(strrchr(line,'\n') == NULL)
+ {
+ max_line_len *= 2;
+ line = (char *) realloc(line,max_line_len);
+ len = (int) strlen(line);
+ if(fgets(line+len,max_line_len-len,input) == NULL)
+ break;
+ }
+ return line;
+}
+
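+// Example invocation (illustrative):
+//   svm-train -s 0 -t 2 -c 1 -g 0.5 training_set_file model_file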
+int main(int argc, char **argv)
+{
+ char input_file_name[1024];
+ char model_file_name[1024];
+ const char *error_msg;
+
+ parse_command_line(argc, argv, input_file_name, model_file_name);
+ read_problem(input_file_name);
+ error_msg = svm_check_parameter(&prob,&param);
+
+ if(error_msg)
+ {
+ fprintf(stderr,"Error: %s\n",error_msg);
+ exit(1);
+ }
+
+ if(cross_validation)
+ {
+ do_cross_validation();
+ }
+ else
+ {
+ model = svm_train(&prob,&param);
+ svm_save_model(model_file_name,model);
+ svm_destroy_model(model);
+ }
+ svm_destroy_param(&param);
+ free(prob.y);
+ free(prob.x);
+ free(x_space);
+ free(line);
+
+ return 0;
+}
+
+void do_cross_validation()
+{
+ int i;
+ int total_correct = 0;
+ double total_error = 0;
+ double sumv = 0, sumy = 0, sumvv = 0, sumyy = 0, sumvy = 0;
+ double *target = Malloc(double,prob.l);
+
+ svm_cross_validation(&prob,&param,nr_fold,target);
+ if(param.svm_type == EPSILON_SVR ||
+ param.svm_type == NU_SVR)
+ {
+ for(i=0;i<prob.l;i++)
+ {
+ double y = prob.y[i];
+ double v = target[i];
+ total_error += (v-y)*(v-y);
+ sumv += v;
+ sumy += y;
+ sumvv += v*v;
+ sumyy += y*y;
+ sumvy += v*y;
+ }
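+ // squared Pearson correlation coefficient between predicted v and true y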
+ printf("Cross Validation Mean squared error = %g\n",total_error/prob.l);
+ printf("Cross Validation Squared correlation coefficient = %g\n",
+ ((prob.l*sumvy-sumv*sumy)*(prob.l*sumvy-sumv*sumy))/
+ ((prob.l*sumvv-sumv*sumv)*(prob.l*sumyy-sumy*sumy))
+ );
+ }
+ else
+ {
+ for(i=0;i<prob.l;i++)
+ if(target[i] == prob.y[i])
+ ++total_correct;
+ printf("Cross Validation Accuracy = %g%%\n",100.0*total_correct/prob.l);
+ }
+ free(target);
+}
+
+void parse_command_line(int argc, char **argv, char *input_file_name, char *model_file_name)
+{
+ int i;
+
+ // default values
+ param.svm_type = C_SVC;
+ param.kernel_type = GAUSSIAN;
+ param.degree = 3;
+ param.gamma = 0; // 1/num_features
+ param.coef0 = 0;
+ param.nu = 0.5;
+ param.cache_size = 100;
+ param.C = 1;
+ param.eps = 1e-3;
+ param.p = 0.1;
+ param.shrinking = 1;
+ param.probability = 0;
+ param.nr_weight = 0;
+ param.weight_label = NULL;
+ param.weight = NULL;
+ cross_validation = 0;
+
+ // parse options
+ for(i=1;i<argc;i++)
+ {
+ if(argv[i][0] != '-') break;
+ if(++i>=argc)
+ exit_with_help();
+ switch(argv[i-1][1])
+ {
+ case 's':
+ param.svm_type = atoi(argv[i]);
+ break;
+ case 't':
+ param.kernel_type = atoi(argv[i]);
+ break;
+ case 'd':
+ param.degree = atoi(argv[i]);
+ break;
+ case 'g':
+ param.gamma = atof(argv[i]);
+ break;
+ case 'r':
+ param.coef0 = atof(argv[i]);
+ break;
+ case 'n':
+ param.nu = atof(argv[i]);
+ break;
+ case 'm':
+ param.cache_size = atof(argv[i]);
+ break;
+ case 'c':
+ param.C = atof(argv[i]);
+ break;
+ case 'e':
+ param.eps = atof(argv[i]);
+ break;
+ case 'p':
+ param.p = atof(argv[i]);
+ break;
+ case 'h':
+ param.shrinking = atoi(argv[i]);
+ break;
+ case 'b':
+ param.probability = atoi(argv[i]);