# Pulling Weka evaluations
#### We run each feature vector on several classifiers
#### For later analysis, we save the accuracy results to Excel files
---
#### For convenience, further Weka analysis options can be found at the bottom of the code (marked as comments)

In [None]:
# Install python-javabridge and python-weka-wrapper3 (the `weka` package
# imported below); --quiet suppresses pip's progress output.
!pip install python-javabridge --quiet
!pip install python-weka-wrapper3 --quiet

In [None]:
from os import path
from weka.core.converters import Loader
from weka.classifiers import Evaluation, Classifier, PredictionOutput
import weka.core.jvm as jvm
import weka.core.converters as conv
from weka.core.classes import Random
import weka.plot.classifiers as plcls  
import weka.plot.experiments as plexp  

import os
import pandas as pd
from tqdm import tqdm

In [None]:
# Root directory that holds the input folders; '' means the current working
# directory. Every later cell builds its paths from `main_dir`.
main_dir = ''

# Older alias for the same location, kept so existing references still work.
data_dir = main_dir

# Location of the 'inputs' folder under the root directory.
inputs_dir = path.join(main_dir, 'inputs')

<div class="alert alert-block alert-info">
<b>Note:</b> The 'inputs' directory, which is produced by 'vectors.ipynb', should be stored in the same directory as this notebook
</div>

In [None]:
# Start the JVM used by the Weka wrapper before any Weka call is made.
# packages=True asks the wrapper to enable Weka package support.
jvm.start(packages=True)

In [None]:
def load_data(dir, name):
  """Load the dataset file ``name`` located in directory ``dir``.

  The file is read with ``conv.load_any_file`` and ``class_is_last()`` is
  called on the result before it is returned, so the last attribute is
  used as the class attribute.
  """
  file_path = path.join(dir, name)
  dataset = conv.load_any_file(file_path)
  dataset.class_is_last()
  return dataset

In [None]:
def configure_classifier(classname, data=None):
  """Create a Weka classifier from its short class name.

  Args:
    classname: class name relative to the ``weka.classifiers.`` package,
      e.g. ``'bayes.NaiveBayes'``.
    data: optional dataset to train the classifier on. It is passed in
      explicitly so the function no longer depends on a module-level
      variable named ``data`` being defined before the call.

  Returns:
    The ``Classifier`` instance. It is trained on ``data`` when ``data`` is
    given and returned untrained otherwise. An untrained classifier is
    sufficient for cross-validation, which trains the model on each fold.
  """
  cls = Classifier(classname="weka.classifiers." + classname)
  if data is not None:
    cls.build_classifier(data)
  return cls

In [None]:
def cross_valid_evaluate(cls, data, folds=10, seed=1):
  """Cross-validate a classifier on a dataset and return the evaluation.

  Args:
    cls: the classifier to evaluate.
    data: the dataset to cross-validate on.
    folds: number of cross-validation folds (default 10).
    seed: seed for the random number generator passed to the
      cross-validation (default 1), so repeated runs use the same seed.

  Returns:
    The ``Evaluation`` object after ``crossvalidate_model`` has run.
  """
  # Plain-text prediction output handed to the cross-validation call.
  pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
  evaluation = Evaluation(data)
  evaluation.crossvalidate_model(cls, data, folds, Random(seed), pout)
  return evaluation

In [None]:
def get_accuaracy(evaluation):
  """Extract the accuracy percentage from an evaluation summary.

  Scans the text returned by ``evaluation.summary()`` for the
  "Correctly Classified Instances" line and returns its percentage field.
  The line is split into whitespace-separated tokens instead of on a fixed
  run of spaces, so the result does not depend on the column padding
  (which differs for values such as 100 or 5.0633).

  Args:
    evaluation: object whose ``summary()`` method returns the summary text.

  Returns:
    The percentage as a string without the ``%`` sign, e.g. ``'32.9114'``.

  Raises:
    ValueError: if the summary has no "Correctly Classified Instances" line.
  """
  marker = 'Correctly Classified Instances'
  for line in evaluation.summary().split('\n'):
    if line.strip().startswith(marker):
      # Fields are: label, count, percentage, '%'. Drop the sign so the
      # percentage is the last token regardless of spacing.
      return line.replace('%', ' ').split()[-1]
  raise ValueError('no "' + marker + '" line found in evaluation summary')

In [None]:
# Sub-directories of the main directory whose files are evaluated.
dir_names = ['inputs', 'inputs_rand', 'inputs_reg']

# Classifiers to run, named relative to the "weka.classifiers." package.
cls_names = [
    'functions.SimpleLogistic',
    'functions.MultilayerPerceptron',
    'bayes.NaiveBayes',
]

# Result store, one empty table per input directory; filled in as
# accuracies_dict[dir_name][file_name][cls_name] = accuracy.
accuracies_dict = {name: {} for name in dir_names}

In [None]:

# Evaluate every file of every input directory with each classifier and
# record the accuracy under accuracies_dict[dir_name][file_name][cls_name].
for dir_name in tqdm(dir_names):
  # Named dir_path (not dir) so the builtin dir() is not shadowed.
  dir_path = path.join(main_dir, dir_name)

  for file in sorted(os.listdir(dir_path)):
    # Skip sub-directories (e.g. notebook checkpoint folders): only regular
    # files can be loaded as datasets.
    if not path.isfile(path.join(dir_path, file)):
      continue

    file_name = file.split('.csv')[0]
    results = {}
    accuracies_dict[dir_name][file_name] = results

    # Load the dataset once per file instead of once per classifier. It stays
    # bound to the module-level name `data`, as in the original cell, so any
    # helper code that reads that name keeps working.
    data = load_data(dir_path, file)

    for cls_name in cls_names:
      cls = configure_classifier(cls_name)
      evaluation = cross_valid_evaluate(cls, data)
      results[cls_name] = get_accuaracy(evaluation)




In [None]:
# Stop the JVM; no Weka calls are made after this point in the notebook.
# NOTE(review): the wrapper reportedly cannot restart the JVM within the same
# process, so re-running earlier cells may need a kernel restart; confirm.
jvm.stop()

In [None]:
# Build one table per input directory: rows are the file names (dict keys),
# columns are the classifier names stored under each file.
df_inputs, df_inputs_rand, df_inputs_reg = (
    pd.DataFrame.from_dict(accuracies_dict[key], orient='index')
    for key in ('inputs', 'inputs_rand', 'inputs_reg')
)

# Last expression of the cell, so the notebook renders this table.
df_inputs

In [None]:
# Render the accuracy table built from the 'inputs_rand' results.
df_inputs_rand

In [None]:
# Render the accuracy table built from the 'inputs_reg' results.
df_inputs_reg

Unnamed: 0,functions.SimpleLogistic,functions.MultilayerPerceptron,bayes.NaiveBayes
morph_lemma2,32.9114,45.5696,37.9747
morph_lemma4,37.9747,39.2405,41.7722
morph_syn2,30.3797,26.5823,25.3165
morph_syn4,27.8481,24.0506,34.1772
morpho2,39.2405,44.3038,44.3038
morpho4,40.5063,35.443,43.038
syn_lemma2,37.9747,40.5063,43.038
syn_lemma4,41.7722,49.3671,43.038
syntax2,32.9114,41.7722,35.443
syntax4,32.9114,35.443,34.1772


# Save to Excel files in dir 'summary'

In [None]:
# Write each accuracy table to its own Excel workbook in <main_dir>/summary.
summary_dir = path.join(main_dir, 'summary')

# exist_ok=True makes this a no-op when the directory already exists and
# avoids the separate check-then-create step.
os.makedirs(summary_dir, exist_ok=True)

df_inputs.to_excel(path.join(summary_dir, 'inputs_sum.xlsx'))
df_inputs_rand.to_excel(path.join(summary_dir, 'inputs_rand_sum.xlsx'))
df_inputs_reg.to_excel(path.join(summary_dir, 'inputs_reg_sum.xlsx'))

In [None]:
# print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
# print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
# print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
# print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
# print("avgCost: " + str(evaluation.avg_cost))
# print("totalCost: " + str(evaluation.total_cost))
# print("confusionMatrix: " + str(evaluation.confusion_matrix))
# print("correct: " + str(evaluation.correct))
# print("pctCorrect: " + str(evaluation.percent_correct))
# print("incorrect: " + str(evaluation.incorrect))
# print("pctIncorrect: " + str(evaluation.percent_incorrect))
# print("unclassified: " + str(evaluation.unclassified))
# print("pctUnclassified: " + str(evaluation.percent_unclassified))
# print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
# print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
# print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
# print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
# print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
# print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
# print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
# print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
# print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
# print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
# print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
# print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
# print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
# print("numTruePositives: " + str(evaluation.num_true_positives(1)))
# print("fMeasure: " + str(evaluation.f_measure(1)))
# print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
# print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
# print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
# print("precision: " + str(evaluation.precision(1)))
# print("weightedPrecision: " + str(evaluation.weighted_precision))
# print("recall: " + str(evaluation.recall(1)))
# print("weightedRecall: " + str(evaluation.weighted_recall))
# print("kappa: " + str(evaluation.kappa))
# print("KBInformation: " + str(evaluation.kb_information))
# print("KBMeanInformation: " + str(evaluation.kb_mean_information))
# print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
# print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
# print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
# print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
# print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
# print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
# print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
# print("class priors: " + str(evaluation.class_priors))
# print("numInstances: " + str(evaluation.num_instances))
# print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
# print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
# print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
# print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
# print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
# print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
# print("prediction output:\n" + str(pout))

# #plot ROC

# plcls.plot_roc(evaluation, class_index=[0, 1], wait=True)
# print()
# # plot errors
# plcls.plot_classifier_errors(evaluation.predictions, absolute=False, wait=True)


