# Naive Bayes Algorithm

A machine learning tutorial from https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/

Reimplemented in Scala

In [31]:
/**
 * Reads a comma-separated file of numeric values into a list of rows.
 *
 * Every non-blank line becomes one row; each comma-separated field is trimmed
 * and parsed as a `Double`.
 *
 * @param filename path of the CSV file to read
 * @return the parsed rows, in file order
 * @throws java.io.FileNotFoundException if the file cannot be opened
 * @throws NumberFormatException if a field is not a valid number
 */
def loadCsv(filename: String): List[List[Double]] = {
    val bufferedSource = scala.io.Source.fromFile(filename)
    try {
        bufferedSource.getLines()
            // Blank lines (e.g. a trailing empty line) carry no data; skip them
            // instead of failing on "".toDouble.
            .filter(_.trim.nonEmpty)
            .map(_.split(",").map(_.trim.toDouble).toList)
            // Materialize while the source is still open: getLines() is lazy.
            .toList
    } finally {
        // Release the file handle even when parsing throws.
        bufferedSource.close()
    }
}

defined [32mfunction[39m [36mloadCsv[39m

In [3]:
import util.Random

/**
 * Randomly partitions `dataset` into two parts.
 *
 * The rows are shuffled, then the first `(dataset.length * splitRatio).toInt`
 * rows form the first part and the remaining rows form the second.
 *
 * @param dataset    rows to split
 * @param splitRatio fraction of the rows that goes into the first part
 * @return a pair (first part, second part)
 */
def splitDataset(dataset: List[List[Double]], splitRatio: Double): (List[List[Double]], List[List[Double]]) = {
    val firstPartSize = (dataset.length * splitRatio).toInt
    val (firstPart, secondPart) = Random.shuffle(dataset).splitAt(firstPartSize)
    (firstPart, secondPart)
}

[32mimport [39m[36mutil.Random

[39m
defined [32mfunction[39m [36msplitDataset[39m

In [1]:
/**
 * Groups the rows of `dataset` by the last value of each row.
 *
 * @param dataset rows whose final element is used as the grouping key
 * @return a map from each distinct last value to the rows that end with it
 */
def separateByClass(dataset: List[List[Double]]): Map[Double, List[List[Double]]] = {
    val labelOf: List[Double] => Double = row => row.last
    dataset.groupBy(labelOf)
}

defined [32mfunction[39m [36mseparateByClass[39m

In [2]:
// TODO: validate this implementation, is the python one better?
/**
 * Computes the mean and standard deviation of `x` in a single pass, updating
 * a running mean and a running sum of squared deviations element by element.
 *
 * The standard deviation is the population form: the accumulated sum of
 * squares is divided by the number of elements (not by n - 1).
 *
 * @param x the values to summarize
 * @return a pair (mean, standard deviation); for an empty list this is
 *         (0.0, NaN) because the sum of squares is divided by a count of zero
 */
def calcMeanStd(x: List[Double]): (Double, Double) = {
    // Walk the list by head/tail: indexing a List (x(i)) and calling x.length
    // are both linear, which made the previous per-step lookups quadratic overall.
    @scala.annotation.tailrec
    def meanStd(remaining: List[Double], mu: Double, q: Double, count: Int): (Double, Double) =
        remaining match {
            case Nil =>
                // count now equals the number of elements consumed.
                (mu, Math.sqrt(q / count))
            case value :: tail =>
                val newCount = count + 1
                val newMu = value / newCount + mu * (1.0 - 1.0 / newCount)
                // Uses both the old and the new mean, so no second pass is needed.
                val newQ = q + (value - mu) * (value - newMu)
                meanStd(tail, newMu, newQ, newCount)
        }

    meanStd(x, 0.0, 0.0, 0)
}

/** Returns the mean of `numbers`, i.e. the first component of `calcMeanStd(numbers)`. */
def mean(numbers: List[Double]): Double = {
    val (average, _) = calcMeanStd(numbers)
    average
}
/** Returns the standard deviation of `numbers`, i.e. the second component of `calcMeanStd(numbers)`. */
def stdev(numbers: List[Double]): Double = {
    val (_, deviation) = calcMeanStd(numbers)
    deviation
}

defined [32mfunction[39m [36mcalcMeanStd[39m
defined [32mfunction[39m [36mmean[39m
defined [32mfunction[39m [36mstdev[39m

In [3]:
/**
 * Computes a (mean, standard deviation) pair for every column of `dataset`
 * except the last one.
 *
 * @param dataset rows of equal length
 * @return one `calcMeanStd` result per column, in column order, with the
 *         final column left out
 */
def summarize(dataset: List[List[Double]]): List[(Double, Double)] = {
    val columns = dataset.transpose
    // The last column is dropped before the statistics are computed.
    val featureColumns = columns.dropRight(1)
    featureColumns.map(column => calcMeanStd(column))
}

defined [32mfunction[39m [36msummarize[39m

In [4]:
/**
 * Splits `dataset` with `separateByClass` and summarizes each group with
 * `summarize`.
 *
 * @param dataset rows to group and summarize
 * @return a map from each grouping key to the per-column summaries of its rows
 */
def summarizeByClass(dataset: List[List[Double]]): Map[Double, List[(Double, Double)]] = {
    val rowsByClass = separateByClass(dataset)
    rowsByClass.map { entry =>
        val label = entry._1
        val rows = entry._2
        label -> summarize(rows)
    }
}

defined [32mfunction[39m [36msummarizeByClass[39m

In [5]:
// Small hand-written sample: the last value of each row is the key the rows
// are grouped by; the preceding values are the columns that get summarized.
val dataset = List(
    List(1.0, 20.0, 1.0),
    List(2.0, 21.0, 0.0),
    List(3.0, 22.0, 1.0),
    List(4.0, 22.0, 0.0)
)
summarizeByClass(dataset)

[36mdataset[39m: [32mList[39m[[32mList[39m[[32mDouble[39m]] = [33mList[39m(
  [33mList[39m([32m1.0[39m, [32m20.0[39m, [32m1.0[39m),
  [33mList[39m([32m2.0[39m, [32m21.0[39m, [32m0.0[39m),
  [33mList[39m([32m3.0[39m, [32m22.0[39m, [32m1.0[39m),
  [33mList[39m([32m4.0[39m, [32m22.0[39m, [32m0.0[39m)
)
[36mres4_1[39m: [32mMap[39m[[32mDouble[39m, [32mList[39m[([32mDouble[39m, [32mDouble[39m)]] = [33mMap[39m([32m1.0[39m -> [33mList[39m(([32m2.0[39m, [32m1.0[39m), ([32m21.0[39m, [32m1.0[39m)), [32m0.0[39m -> [33mList[39m(([32m3.0[39m, [32m1.0[39m), ([32m21.5[39m, [32m0.5[39m)))