In [1]:
!pip install jpype1
import jpype as jp
import jpype.imports

jp.startJVM(classpath = ['weka/*'])  # launch JVM with custom classpath

# import java stuff for tree visualization
from java.io import File
from java.awt import BorderLayout, Color
from java.awt.image import BufferedImage
from javax.swing import JFrame
from javax.imageio import ImageIO

def visualize_J48_tree(j48_model, width=500, height=400, title='J48 tree'):
    """Open a Swing window that displays the tree of a trained model.

    Parameters
    ----------
    j48_model
        Object whose ``graph()`` result is handed to Weka's ``TreeVisualizer``
        (in this notebook, a built ``weka.classifiers.trees.J48`` instance).
    width, height : int
        Size passed to ``JFrame.setSize``.
    title : str
        Window title passed to the ``JFrame`` constructor.

    Returns
    -------
    None
        The function only has the side effect of showing the window.
    """
    # Resolve the Weka GUI classes by their fully qualified names.
    tree_visualizer_cls = jp.JClass('weka.gui.treevisualizer.TreeVisualizer')
    node_placer_cls = jp.JClass('weka.gui.treevisualizer.PlaceNode2')

    jf = JFrame(title)
    jf.setSize(width, height)
    jf.getContentPane().setLayout(BorderLayout())

    # Use the model supplied by the caller; the previous version read the
    # notebook-global `j48` here and silently ignored the `j48_model` argument.
    tv = tree_visualizer_cls(None, j48_model.graph(), node_placer_cls())
    tv.setBackground(Color.white)
    jf.getContentPane().add(tv, BorderLayout.CENTER)

    # The frame is made visible before fitToScreen() is called, as in the original ordering.
    jf.setVisible(True)
    tv.fitToScreen()



## Simplest use case: build J48 tree and visualize

In [2]:
# Data source: the iris CSV file. The ARFF copy can be read instead with:
#   source = jp.JClass('weka.core.converters.ConverterUtils$DataSource')('weka/data/iris.arff')
#   instances = source.getDataSet()
csv_loader = jp.JClass('weka.core.converters.CSVLoader')()
iris_csv = File('weka/data/iris.csv')
csv_loader.setSource(iris_csv)
instances = csv_loader.getDataSet()

# The class attribute is the final attribute of the dataset.
classIndex = instances.numAttributes() - 1
instances.setClassIndex(classIndex)

# Instantiate the J48 learner and hand it its option list.
j48_options = ['-C', '0.25', '-M', '2']
j48 = jp.JClass('weka.classifiers.trees.J48')()
j48.setOptions(j48_options)

# Fit the model, print its textual form, then open the tree window.
j48.buildClassifier(instances)
print(j48.toString())
visualize_J48_tree(j48, width=600, height=500)

J48 pruned tree
------------------

petalwidth <= 0.6: Iris-setosa (50.0)
petalwidth > 0.6
|   petalwidth <= 1.7
|   |   petallength <= 4.9: Iris-versicolor (48.0/1.0)
|   |   petallength > 4.9
|   |   |   petalwidth <= 1.5: Iris-virginica (3.0)
|   |   |   petalwidth > 1.5: Iris-versicolor (3.0/1.0)
|   petalwidth > 1.7: Iris-virginica (46.0/1.0)

Number of Leaves  : 	5

Size of the tree : 	9



## Cross validation

In [3]:
from java.lang import StringBuffer
from java.util import Random

# Prediction output object handed to the evaluation: a PlainText instance
# configured with setSuppressOutput(True) and a StringBuffer attached.
buffer = StringBuffer()
out = jp.JClass('weka.classifiers.evaluation.output.prediction.PlainText')()
out.setSuppressOutput(True)
out.setBuffer(buffer)

# Fixed seed so the fold assignment, and therefore every metric printed below,
# is identical on each Restart & Run All. The generator was previously unseeded,
# so figures saved from an earlier run may differ from a seeded run.
CV_SEED = 1
rnd = Random(CV_SEED)
cv = jp.JClass('weka.classifiers.Evaluation')(instances)

# crossValidateModel takes its trailing arguments as a Java Object[];
# build a one-element array holding the prediction output object.
args = jp.JArray(jp.JClass('java.lang.Object'), 1)(1)
args[0] = out

# Fresh learner with the same options as the model trained above.
learner = jp.JClass('weka.classifiers.trees.J48')()
learner.setOptions(['-C', '0.25', '-M', '2'])
nfolds = 10

cv.crossValidateModel(learner, instances, nfolds, rnd, args)
print(cv.toSummaryString())
print(cv.toMatrixString())
print(cv.toClassDetailsString())


Correctly Classified Instances         143               95.3333 %
Incorrectly Classified Instances         7                4.6667 %
Kappa statistic                          0.93  
Mean absolute error                      0.0392
Root mean squared error                  0.1749
Relative absolute error                  8.8149 %
Root relative squared error             37.1099 %
Total Number of Instances              150     

=== Confusion Matrix ===

  a  b  c   <-- classified as
 49  1  0 |  a = Iris-setosa
  0 47  3 |  b = Iris-versicolor
  0  3 47 |  c = Iris-virginica

=== Detailed Accuracy By Class ===

                 TP Rate  FP Rate  Precision  Recall   F-Measure  MCC      ROC Area  PRC Area  Class
                 0.980    0.000    1.000      0.980    0.990      0.985    0.990     0.987     Iris-setosa
                 0.940    0.040    0.922      0.940    0.931      0.896    0.938     0.858     Iris-versicolor
                 0.940    0.030    0.940      0.940    0.940      

## Weighting instances

In [4]:
# let's increase the weight of all "Iris-setosa" instances to 2.0 instead of default 1.0
weighted_instances = jp.JClass('weka.core.converters.ConverterUtils$DataSource')('weka/data/iris.arff').getDataSet()
# Derive the class index from this dataset itself. It was previously computed
# from `instances`, a different dataset loaded in another cell, which made this
# cell depend on that cell having run and on both files having the same layout.
weighted_instances.setClassIndex(weighted_instances.numAttributes()-1)

# Look up each instance's class label by its numeric class value and
# double the weight of the matching ones.
for x in weighted_instances:
    if x.classAttribute().value(int(x.classValue())) == 'Iris-setosa':
        x.setWeight(2.0)