# Naive Bayes Algorithm

A machine learning tutorial from https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/

Reimplemented in Scala

In [31]:
/**
 * Reads a comma-separated file of numeric values into memory.
 *
 * Every non-blank line becomes one row; each comma-separated field is trimmed
 * and parsed with `toDouble`. Blank lines are skipped.
 *
 * @param filename path of the file to read
 * @return the rows of the file, in file order
 * @throws java.lang.NumberFormatException if a field is not a valid number
 */
def loadCsv(filename: String): List[List[Double]] = {
    val bufferedSource = scala.io.Source.fromFile(filename)
    try {
        // getLines is lazy, so the rows must be materialised before the source is closed.
        bufferedSource.getLines()
            .filter(line => line.trim.nonEmpty)
            .map(line => line.split(",").map(_.trim.toDouble).toList)
            .toList
    } finally {
        // Release the file handle even when parsing a field fails.
        bufferedSource.close()
    }
}

defined function loadCsv

In [3]:
import util.Random

/**
 * Randomly partitions a dataset into two parts.
 *
 * @param dataset    rows to partition
 * @param splitRatio fraction of the rows placed in the first part
 * @return (first part, second part) of a shuffled copy of `dataset`; the first
 *         part holds `(dataset.length * splitRatio).toInt` rows
 */
def splitDataset(dataset: List[List[Double]], splitRatio: Double): (List[List[Double]], List[List[Double]]) = {
    val cutoff = (dataset.length * splitRatio).toInt
    val (firstPart, secondPart) = Random.shuffle(dataset).splitAt(cutoff)
    (firstPart, secondPart)
}

import util.Random

defined function splitDataset

In [1]:
/**
 * Groups the rows of a dataset by their last element, which acts as the class value.
 *
 * @param dataset rows whose last element is the class value
 * @return class value -> rows carrying that class value, in their original order
 */
def separateByClass(dataset: List[List[Double]]): Map[Double, List[List[Double]]] = {
    dataset.groupBy { row => row.last }
}

defined function separateByClass

In [2]:
// TODO: validate this implementation, is the python one better?
// NOTE(review): this divides by n (population standard deviation); confirm
// whether the reference tutorial's variant is the one wanted here.
/**
 * Computes the mean and standard deviation of a list in a single pass,
 * using a running (Welford-style) update of the mean and of the sum of
 * squared deviations.
 *
 * The list is consumed head-first rather than by index, so each element is
 * visited once and the length is computed only once.
 *
 * @param x the values to summarise
 * @return (mean, standard deviation); the deviation is `sqrt(Q / n)` where `Q`
 *         is the accumulated sum of squared deviations. For an empty list the
 *         result is `(0.0, NaN)`.
 */
def calcMeanStd(x: List[Double]): (Double, Double) = {
    val n = x.length

    @scala.annotation.tailrec
    def meanStd(remaining: List[Double], mu: Double, q: Double, count: Int): (Double, Double) =
        remaining match {
            case Nil => (mu, Math.sqrt(q / n))
            case value :: rest =>
                val newCount = count + 1
                // Running mean: weight the new value by 1/k and the old mean by (k - 1)/k.
                val newMu = value / newCount + mu * (1.0 - 1.0 / newCount)
                // The increment uses both the old and the updated mean.
                val newQ = q + (value - mu) * (value - newMu)
                meanStd(rest, newMu, newQ, newCount)
        }

    meanStd(x, 0.0, 0.0, 0)
}

/** Returns the first component (the mean) of `calcMeanStd(numbers)`. */
def mean(numbers: List[Double]): Double = {
    val (average, _) = calcMeanStd(numbers)
    average
}
/** Returns the second component (the standard deviation) of `calcMeanStd(numbers)`. */
def stdev(numbers: List[Double]): Double = {
    val (_, deviation) = calcMeanStd(numbers)
    deviation
}

defined function calcMeanStd
defined function mean
defined function stdev

In [3]:
/**
 * Summarises every column of the dataset except the last one.
 *
 * @param dataset rows of equal length; the last column is dropped before summarising
 * @return one `calcMeanStd` result per remaining column, in column order
 */
def summarize(dataset: List[List[Double]]): List[(Double, Double)] = {
    val columns = dataset.transpose
    val attributeColumns = columns.dropRight(1)
    attributeColumns.map(column => calcMeanStd(column))
}

defined function summarize

In [4]:
/**
 * Builds the per-class column summaries of a dataset.
 *
 * @param dataset rows whose last element is the class value
 * @return class value -> `summarize` result for the rows of that class
 */
def summarizeByClass(dataset: List[List[Double]]): Map[Double, List[(Double, Double)]] = {
    val rowsByClass = separateByClass(dataset)
    rowsByClass.map { entry =>
        val (classValue, rows) = entry
        classValue -> summarize(rows)
    }
}

defined function summarizeByClass

In [1]:
import scala.math.{exp, pow, sqrt, Pi}
/**
 * Evaluates the Gaussian (normal) probability density at `x`.
 *
 * @param x     point at which the density is evaluated
 * @param mean  mean of the distribution
 * @param stdev standard deviation of the distribution
 * @return `exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * Pi) * stdev)`
 */
def calculateProbability(x: Double, mean: Double, stdev: Double): Double = {
    val squaredDeviation = pow(x - mean, 2)
    val twiceVariance = 2 * pow(stdev, 2)
    val normalizer = 1 / (math.sqrt(2 * Pi) * stdev)
    normalizer * exp(-(squaredDeviation / twiceVariance))
}

import scala.math.{exp, pow, sqrt, Pi}

defined function calculateProbability

In [2]:
/**
 * Computes, for every class, the product of the Gaussian densities of the
 * input attributes under that class's per-attribute (mean, stdev) summaries.
 *
 * Attributes are paired with summaries by position. `zip` stops at the shorter
 * list, so trailing input values that have no summary are ignored.
 *
 * @param summaries   class value -> list of (mean, stdev), one entry per attribute
 * @param inputVector attribute values of the row to score
 * @return class value -> product of the per-attribute densities
 *         (1.0 for a class with no summaries)
 */
def calculateClassProbabilities(summaries: Map[Double, List[(Double, Double)]], inputVector: List[Double]): Map[Double, Double] = {
    summaries.map { case (classValue, classSummaries) =>
        // Multiply the running product by the density of each attribute under
        // the summary at the same position.
        val likelihood = classSummaries.zip(inputVector).foldLeft(1.0) {
            case (acc, ((mu, sigma), attribute)) => acc * calculateProbability(attribute, mu, sigma)
        }
        classValue -> likelihood
    }
}

defined function calculateClassProbabilities

In [3]:
/**
 * Picks the class whose score from `calculateClassProbabilities` is largest.
 *
 * @param summaries   class value -> list of (mean, stdev) summaries
 * @param inputVector the row to classify
 * @return the class value with the highest score
 */
def predict(summaries: Map[Double, List[(Double, Double)]], inputVector: List[Double]) = {
    val scores = calculateClassProbabilities(summaries, inputVector)
    val (bestClass, _) = scores.maxBy { case (_, score) => score }
    bestClass
}

defined function predict

In [5]:
/**
 * Classifies every row with `predict`.
 *
 * @param summaries   class value -> list of (mean, stdev) summaries
 * @param inputVector the rows to classify
 * @return the predicted class value for each row, in input order
 */
def getPredictions(summaries: Map[Double, List[(Double, Double)]], inputVector: List[List[Double]]) = {
    inputVector.map(row => predict(summaries, row))
}

defined function getPredictions

In [8]:
// Smoke check of getPredictions with hand-written summaries:
// two classes (1.0 and 2.0), each with a single (mean, stdev) pair.
val summaries = Map(1.0 -> List((1.0, 0.5)), 2.0 -> List((20.0, 50.0)))
// Two rows to classify; each row holds two values while each class has one summary.
val inputVector = List(List(1.1, 2.0), List(19.1, 2.0))
// The recorded result for this cell is List(1.0, 2.0).
getPredictions(summaries, inputVector)

summaries: Map[Double, List[(Double, Double)]] = Map(1.0 -> List((1.0, 0.5)), 2.0 -> List((20.0, 50.0)))
inputVector: List[List[Double]] = List(List(1.1, 2.0), List(19.1, 2.0))
res7_2: List[Double] = List(1.0, 2.0)