Our dependencies:

In [None]:
%maven nz.ac.waikato.cms.weka:weka-dev:3.9.4
%maven com.github.fracpete:missing-values-imputation-weka-package:2016.6.12

The following environment variables are used for determining input and output:

* `INPUT` - the input file
* `OUTPUT` - the output file
* `VERBOSE` - whether to output some debugging information (boolean: true|false)

For testing purposes, each variable falls back to a hard-coded default when it is not set in the environment:

In [None]:
// Resolve the configuration from the environment; any variable that is
// not set falls back to a hard-coded value intended for local testing.

// dataset to read
String input_file = (System.getenv("INPUT") == null)
  ? "/home/fracpete/development/datasets/uci/numeric/bolts.arff"
  : System.getenv("INPUT");

// location the cleaned dataset is written to
String output_file = (System.getenv("OUTPUT") == null)
  ? "/home/fracpete/temp/bolts_clean.arff"
  : System.getenv("OUTPUT");

// debugging output is on unless VERBOSE is set and does not parse as "true"
boolean verbose = (System.getenv("VERBOSE") == null)
  || Boolean.parseBoolean(System.getenv("VERBOSE"));

// echo the effective file locations
if (verbose) {
  System.out.println("INPUT=" + input_file);
  System.out.println("OUTPUT=" + output_file);
}

Load data:

In [None]:
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.Instances;

// read the dataset from the configured input location
Instances data = DataSource.read(input_file);

// the last attribute acts as the class attribute
int lastAttribute = data.numAttributes() - 1;
data.setClassIndex(lastAttribute);

// report how many rows were loaded
if (verbose) {
  int numInput = data.numInstances();
  System.out.println("# instances input: " + numInput);
}

Clean the data by removing all rows that were flagged as outliers or extreme values:

In [None]:
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.MultiFilter;
import weka.filters.unsupervised.attribute.InterquartileRange;
import weka.filters.unsupervised.attribute.Remove;
import weka.filters.unsupervised.instance.RemoveWithValues;

// IQR: for flagging outliers/extreme values
// NOTE: "-R 7" bases the detection on attribute 7 only, which is specific to
// the default test dataset; adjust it when running against other inputs.
InterquartileRange iqr = new InterquartileRange();
iqr.setOptions(new String[]{"-R", "7"});   // additional options

// The IQR filter appends its two flag attributes ("Outlier" followed by
// "ExtremeValue") after the existing attributes. Derive their 1-based
// indices from the input instead of hard-coding them, so that the pipeline
// also works for datasets that do not have exactly 8 attributes.
String outlierIndex = Integer.toString(data.numAttributes() + 1);
String extremeIndex = Integer.toString(data.numAttributes() + 2);

// removing rows with outlier=yes ("-L 2" selects the second nominal label)
RemoveWithValues removeOut = new RemoveWithValues();
removeOut.setOptions(new String[]{"-C", outlierIndex, "-L", "2"});

// removing rows with extreme=yes
RemoveWithValues removeExt = new RemoveWithValues();
removeExt.setOptions(new String[]{"-C", extremeIndex, "-L", "2"});

// removing attributes "Outlier" and "ExtremeValue"
Remove removeAtts = new Remove();
removeAtts.setOptions(new String[]{"-R", outlierIndex + "," + extremeIndex});

// combining filters into pipeline (applied in the listed order)
MultiFilter multi = new MultiFilter();
multi.setFilters(new Filter[]{iqr, removeOut, removeExt, removeAtts});
// output commandline
if (verbose) {
  System.out.println("Filter pipeline:\n" + Utils.toCommandLine(multi));
}

// filter data (the input format must be set before applying the filter)
multi.setInputFormat(data);
Instances filtered = Filter.useFilter(data, multi);

// renaming dataset
filtered.setRelationName(data.relationName() + "-clean");

// output remaining number of instances
if (verbose) {
  System.out.println("# instances output: " + filtered.numInstances());
}

Save data:

In [None]:
import weka.core.converters.ConverterUtils.DataSink;

// write the filtered dataset to the configured output file
DataSink.write(output_file, filtered);