# Naive Bayes Algorithm

A machine learning tutorial from https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/

Reimplemented in Scala

In [2]:
import scala.collection.mutable.ArrayBuffer

def loadCsv(filename: String): List[List[Double]] = {
    val bufferedSource = io.Source.fromFile(filename)
    val rows = ArrayBuffer[List[Double]]()
    for (line <- bufferedSource.getLines) {
        rows += line.split(",").map(_.trim).map(_.toDouble).toList
    }
    bufferedSource.close
    rows.toList
}

[32mimport [39m[36mscala.collection.mutable.ArrayBuffer

[39m
defined [32mfunction[39m [36mloadCsv[39m

In [3]:
import util.Random

def splitDataset(dataset: List[List[Double]], splitRatio: Double): (List[List[Double]], List[List[Double]]) = {
    val shuffled = Random.shuffle(dataset)
    shuffled.splitAt((dataset.length * splitRatio).toInt)
}

[32mimport [39m[36mutil.Random

[39m
defined [32mfunction[39m [36msplitDataset[39m

In [4]:
def separateByClass(dataset: List[List[Double]]): Map[Double, List[List[Double]]] = dataset.groupBy(_.last)

defined [32mfunction[39m [36mseparateByClass[39m

In [5]:
// TODO: validate this implementation, is the python one better?
def calcMeanStd(x: List[Double]): (Double, Double) = {
    @scala.annotation.tailrec
    def meanStd(x: List[Double], mu: Double, Q: Double, count: Int): (Double, Double) = {
        if (count >= x.length) (mu, Math.sqrt(Q / x.length))
        else {
            val newCount = count + 1
            val newMu = x(count) / newCount + mu * (1.0 - 1.0 / newCount)
            val newQ = Q + (x(count) - mu) * (x(count) - newMu)
            meanStd(x, newMu, newQ, newCount)   
        }        
    }

    meanStd(x, 0.0, 0.0, 0)    
}

def mean(numbers: List[Double]): Double = calcMeanStd(numbers)._1
def stdev(numbers: List[Double]): Double = calcMeanStd(numbers)._2

defined [32mfunction[39m [36mcalcMeanStd[39m
defined [32mfunction[39m [36mmean[39m
defined [32mfunction[39m [36mstdev[39m

In [6]:
def summarize(dataset: List[List[Double]]): List[(Double, Double)] = dataset.transpose.dropRight(1).map(calcMeanStd)

defined [32mfunction[39m [36msummarize[39m

In [7]:
def summarizeByClass(dataset: List[List[Double]]): Map[Double, List[(Double, Double)]] = separateByClass(dataset).map {
    case (a, b) => (a, summarize(b))
}

defined [32mfunction[39m [36msummarizeByClass[39m

In [8]:
import scala.math.{exp, pow, sqrt, Pi}
def calculateProbability(x: Double, mean: Double, stdev: Double): Double = {
    val exponent = exp(-(pow(x - mean, 2) / (2 * pow(stdev, 2))))
    (1 / (math.sqrt(2 * Pi) * stdev)) * exponent
}

[32mimport [39m[36mscala.math.{exp, pow, sqrt, Pi}
[39m
defined [32mfunction[39m [36mcalculateProbability[39m

In [9]:
def calculateClassProbabilities(summaries: Map[Double, List[(Double, Double)]], inputVector: List[Double]) = {
    summaries.map{ case (classValue, classSummaries) => (classValue, classSummaries.foldLeft(1.0) {
        (acc, tup) => calculateProbability(inputVector.head, tup._1, tup._2)
    } ) }
}

defined [32mfunction[39m [36mcalculateClassProbabilities[39m

In [10]:
def predict(summaries: Map[Double, List[(Double, Double)]], inputVector: List[Double]) = {
    calculateClassProbabilities(summaries, inputVector).maxBy(_._2)._1
}

defined [32mfunction[39m [36mpredict[39m

In [11]:
def getPredictions(summaries: Map[Double, List[(Double, Double)]], inputVector: List[List[Double]]) = {
    for (iv <- inputVector) yield predict(summaries, iv)
}

defined [32mfunction[39m [36mgetPredictions[39m

In [12]:
def getAccuracy(testSet: List[List[Double]], predictions: List[Double]): Double = {
    val correct = for ( (data: List[Double], prediction: Double) <- testSet zip predictions) yield data.last == prediction
    (correct.filter(_ == true).length / testSet.length.toDouble) * 100
}

defined [32mfunction[39m [36mgetAccuracy[39m

In [13]:
val testSet = List(List(1.0, 1.0, 1.0, 0.0), List(2.0, 2.0, 2.0, 0.0), List(3.0, 3.0, 3.0, 1.0))
val predictions = List(0.0, 0.0, 0.0)
getAccuracy(testSet, predictions)

[36mtestSet[39m: [32mList[39m[[32mList[39m[[32mDouble[39m]] = [33mList[39m(
  [33mList[39m([32m1.0[39m, [32m1.0[39m, [32m1.0[39m, [32m0.0[39m),
  [33mList[39m([32m2.0[39m, [32m2.0[39m, [32m2.0[39m, [32m0.0[39m),
  [33mList[39m([32m3.0[39m, [32m3.0[39m, [32m3.0[39m, [32m1.0[39m)
)
[36mpredictions[39m: [32mList[39m[[32mDouble[39m] = [33mList[39m([32m0.0[39m, [32m0.0[39m, [32m0.0[39m)
[36mres12_2[39m: [32mDouble[39m = [32m66.66666666666666[39m

In [24]:
def main() = {
    val filename = "pima-indians-diabetes.data.csv"
    val splitRatio = 0.67
    val dataset = loadCsv(filename)
    val (trainingSet, testSet) = splitDataset(dataset, splitRatio)
    print(s"Split ${dataset.length} rows into train=${trainingSet.length} and test=${testSet.length} rows\n")
    val summaries = summarizeByClass(trainingSet)
    val predictions = getPredictions(summaries, testSet)
    val accuracy = getAccuracy(testSet, predictions)
    print(s"Accuracy: $accuracy")
}

defined [32mfunction[39m [36mmain[39m

In [25]:
main()

Split 768 rows into train=514 and test=254 rows
Accuracy: 68.11023622047244