In [None]:
# Boilerplate
%matplotlib inline

# Intel DAAL related imports
from daal.data_management import (
    DataSourceIface, FileDataSource, HomogenNumericTable, CSRNumericTable, NumericTable, BlockDescriptor
)

# Helpers for getArrayFromNT and printNT. See utils.py
from utils import *

# Import numpy and matplotlib
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Plotting configurations
%config InlineBackend.figure_format = 'retina'
plt.rcParams["figure.figsize"] = (12, 9)

# Online Multinomial Naive Bayes

### Tutorial brief
This tutorial is an example of using Naive Bayes algorithms from pyDAAL to build predictive models.
We use the well-studied 20 Newsgroups dataset to train a Multinomial Naive Bayes model in online processing mode. We test the accuracy of the model using quality metrics for multi-class classification. The code for Multinomial Naive Bayes model training and prediction is provided partially. You are required to fill in the gaps.

### Learning objectives
* To understand how to process sparse data that does not fit into memory using online computing mode.
* To understand and practice the typical code sequence of using pyDAAL for classification.
* To understand how to measure the quality of the trained model.


### Multinomial Naive Bayes introduction
Supervised learning involves training a model using data that has known responses, and then applying the model to predict responses for unseen data. In the case of the **Multinomial Naive Bayes** classifier, the model is probabilistic.

Let $J$ be the number of classes, indexed $k = 0, 1, \ldots, J-1$. The feature vector $x_i = (x_{i1}, \ldots, x_{ip})$, $i = 1, \ldots, n$, contains scaled frequencies: the value of $x_{ij}$ is the frequency with which the $j$-th feature is observed in the vector $x_i$. In terms of the document classification problem, $x_{ij}$ is the frequency of occurrence of the word indexed $j$ in the document $x_i$.
The response $y_i$ is the index of the class, $y_i \in \{0, 1, \ldots, J-1\}$, corresponding to the document $x_i$.

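On the training stage the estimates of the probability of occurrence of the word $i$ in documents of class $k$ are computed:
$$\log(\theta_{ki}) = \log\bigg( \frac{N_{ki} + \alpha_i}{N_k + \alpha}\bigg)$$
where
$$N_{ki} = \sum \limits_{x: x \in X, y(x) = k} x_i, \quad N_k = \sum \limits_{i = 1}^{p} N_{ki},$$
$x_i$ denotes the $i$-th component of the feature vector $x$, $\alpha_i$ is the smoothing parameter (the imagined number of occurrences) of the word $i$, and $\alpha = \sum \limits_{i = 1}^{p} \alpha_i$.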

On the prediction stage, given a new feature vector $x$, the classifier determines the class the vector belongs to:
$$y(x) = \operatorname{argmax}_{k \in \{0, \ldots, J-1\}} \Big(\log(p(y=k)) + \sum \limits_{i} x_i \log(\theta_{ki})\Big)$$

The details about the algorithm: ["Tackling the Poor Assumptions of Naive Bayes Text Classifiers" by Jason D. M. Rennie et al.](https://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf)

### 20 Newsgroups dataset
The dataset has already been downloaded to the `./mldata` folder. There are 11314 training and 7532 testing samples (documents) and 130107 features (words). Here's the detailed information about this dataset, including descriptions of each class:

> Origin: 

> This dataset was collected in the School of Computer Science, Carnegie Mellon University. 

> Creator: 

> Tom Mitchell

> Data Set Information:

> Concerns 18828 messages from 20 newsgroups.

> Information about classes:

> 1.  alt.atheism
> 2.  comp.graphics
> 3.  comp.os.ms-windows.misc
> 4.  comp.sys.ibm.pc.hardware
> 5.  comp.sys.mac.hardware
> 6.  comp.windows.x
> 7.  misc.forsale
> 8.  rec.autos
> 9.  rec.motorcycles
> 10. rec.sport.baseball
> 11. rec.sport.hockey
> 12. sci.crypt
> 13. sci.electronics
> 14. sci.med
> 15. sci.space
> 16. soc.religion.christian
> 17. talk.politics.guns
> 18. talk.politics.mideast
> 19. talk.politics.misc
> 20. talk.religion.misc

> Words examples: archive, name, atheism, resources, alt, last, modified, december, version, atheist, addresses, of, organizations, usa, freedom

### Load sparse numeric table from file

In [None]:
def createSparseTable(file, nFeatures):
    """Build a DAAL ``CSRNumericTable`` from a CSV file in coordinate format.

    Every non-empty line of the file must hold three comma-separated values:
    ``row index, column index, value``. Row indices are shifted so that the
    smallest row index found in the file becomes row 0; column indices are
    used unchanged.

    Parameters
    ----------
    file : str
        Path of the CSV file to read.
    nFeatures : int
        Number of columns of the resulting table.

    Returns
    -------
    CSRNumericTable
        Table with ``max(row) - min(row) + 1`` rows and ``nFeatures`` columns.
        Rows beyond the largest row index present in the file cannot be
        detected and are therefore not part of the table.

    Raises
    ------
    ValueError
        If the file contains no data records.
    """
    # Imported locally so that the function does not depend on names leaked
    # by `from utils import *` or on imports made in a later notebook cell.
    import csv
    from scipy.sparse import coo_matrix

    rowIdx = []
    colIdx = []
    data = []
    with open(file, 'r') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline)
                continue
            rowIdx.append(int(row[0]))
            colIdx.append(int(row[1]))
            data.append(float(row[2]))

    if not data:
        raise ValueError("No data records found in '{0}'".format(file))

    # Use min/max rather than the first/last record so that the result does
    # not depend on the records being sorted by row index.
    rowIdx = np.asarray(rowIdx)
    rowIdx = rowIdx - rowIdx.min()
    nObservations = int(rowIdx.max()) + 1

    cooData = coo_matrix((data, (rowIdx, colIdx)), shape=(nObservations, nFeatures))
    csrData = cooData.tocsr()

    # Column indices and row offsets are shifted by one before being handed
    # to CSRNumericTable (DAAL CSR tables use one-based indexing).
    table = CSRNumericTable(
        csrData.data.astype(np.float64),
        csrData.indices.astype(np.uint64) + 1,
        csrData.indptr.astype(np.uint64) + 1,
        int(nFeatures), int(nObservations)
    )
    return table

### Multinomial Naive Bayes model training on 20 Newsgroups dataset
The training data is split into 5 blocks, ~2200 samples each. For each block of data the code below does the following:
- Reads the data in coordinate format from the file `./mldata/20newsgroups.coo.<block>.csv` and creates a CSRNumericTable with training data (`xTrain`)
- Reads the ground truth from the file `./mldata/20newsgroups.labels.<block>.csv` into a dense NumericTable (`yTrain`)
- Updates the training result with a new block of data

In [None]:
from scipy.sparse import coo_matrix
from daal.algorithms import classifier
from daal.algorithms.multinomial_naive_bayes import training as nb_training

# Number of blocks of data in the training data set
# (the block files are numbered 1..nBlocks, see the file names built in the loop below)
nBlocks = 5
# Number of classes
nClasses = 20
# Number of words in the documents, i.e. the number of columns of every sparse data table
nFeatures = 130107

# Create an algorithm object to train Multinomial Naive Bayes model in online processing mode.
# The fastCSR method is selected because the training data is passed as CSR tables.
nbTrain = nb_training.Online(nClasses, method=nb_training.fastCSR)

for i in range(nBlocks):
    # Load new block of data from CSV file (coordinate format -> CSRNumericTable)
    xTrain = createSparseTable('./mldata/20newsgroups.coo.' + str(i + 1) + '.csv', nFeatures)
    # Load new block of labels from CSV file; the data source allocates the
    # numeric table itself and derives the data dictionary from the file contents
    labelsDataSource = FileDataSource(
        './mldata/20newsgroups.labels.' + str(i + 1) + '.csv',
        DataSourceIface.doAllocateNumericTable, DataSourceIface.doDictionaryFromContext
    )
    labelsDataSource.loadDataBlock()
    yTrain = labelsDataSource.getNumericTable()

    # Set input
    #
    # YOUR CODE HERE
    #
    # There are two pieces of input to be set: data and labels. You should
    # use the 'input.set' member methods of the nbTrain algorithm object.
    # The input IDs to use are 'classifier.training.data' and 'classifier.training.labels'
    # respectively.

    # Compute
    #
    # YOUR CODE HERE
    #
    # Call the 'compute()' method of your algorithm object to update the partial model.

# Finalize the online computation and extract the trained model from the result.
# NOTE: no input is set and compute() is never called until the two
# 'YOUR CODE HERE' steps above are filled in, so this line presumably only
# succeeds once the exercise is completed.
model = nbTrain.finalizeCompute().get(classifier.training.model)

### Prediction with Multinomial Naive Bayes model

The code below gets the test data from the `sklearn` 20 Newsgroups dataset and creates 2 NumericTables: test data in CSR format (xTest) and test ground truth (yTest). We use the Multinomial Naive Bayes prediction algorithm and the model obtained on the training stage to compute the predictions for new, previously unseen data.

In [None]:
import sklearn.datasets as ds
from daal.algorithms.multinomial_naive_bayes import prediction as nb_prediction

# Fetch the vectorized 'test' subset of the 20 Newsgroups dataset
newsgroups_test = ds.fetch_20newsgroups_vectorized(subset='test')

# Sparse feature matrix of the test documents and their class indices
testData = newsgroups_test.data
yTest = newsgroups_test.target

# Wrap the sparse matrix into a DAAL CSR table. Column indices and row
# offsets are shifted by one (DAAL CSR tables use one-based indexing).
nTestObservations, nTestFeatures = testData.shape
xTest = CSRNumericTable(
    testData.data.astype(np.float64),
    testData.indices.astype(np.uint64) + 1,
    testData.indptr.astype(np.uint64) + 1,
    int(nTestFeatures),
    int(nTestObservations)
)

# Batch-mode prediction algorithm for Multinomial Naive Bayes on CSR data
nbTest = nb_prediction.Batch(nClasses, method=nb_prediction.fastCSR)

# Inputs of the prediction: the test data table and the trained model
nbTest.input.setTable(classifier.prediction.data, xTest)
nbTest.input.setModel(classifier.prediction.model, model)

# Run the prediction and pick the table of predicted labels from the result
predictionResult = nbTest.compute()
predictions = predictionResult.get(classifier.prediction.prediction)

### Printing the confusion matrix
To see if the model has done a good job, we print the confusion matrix.

|                  |         | **Predicted Class** |          |     |          |
| ---------------- | ------- | ------------------- | -------- | --- | -------- |
|                  |         | **1**               | **2**    | ... | **J**    |
|                  | **1**   | $n_{11}$            | $n_{12}$ | ... | $n_{1J}$ |
| **Actual Class** | **2**   | $n_{21}$            | $n_{22}$ | ... | $n_{2J}$ |
|                  | ...     | ...                 | ...      | ... | ...      |
|                  | **J**   | $n_{J1}$            | $n_{J2}$ | ... | $n_{JJ}$ |

Here $n_{ij}$ is the number of samples that belong to the actual class $i$ and are predicted as the class $j$.

If the model does a perfect job then the diagonal elements of the matrix will dominate. As we'll see, it's not quite the case. But still the predictions are close to true values in many cases.

In [None]:
from daal.algorithms.multi_class_classifier import quality_metric_set
from daal.algorithms.classifier.quality_metric import multiclass_confusion_matrix
from daal.data_management import BlockDescriptor, readOnly

# Quality-metric algorithm for a multi-class classifier with nClasses classes
qualityMetricSet = quality_metric_set.Batch(nClasses)
# Deliberately not named 'input': rebinding that name at notebook level would
# shadow Python's built-in input() for every cell executed afterwards.
confusionMatrixInput = qualityMetricSet.getInputDataCollection().getInput(quality_metric_set.confusionMatrix)

# Reshape the ground-truth labels into a single column and wrap them into a float64 table
yTest2d = yTest.reshape(yTest.size, 1)
groundTruth = HomogenNumericTable(yTest2d.astype(np.float64))
confusionMatrixInput.set(multiclass_confusion_matrix.predictedLabels,   predictions)
confusionMatrixInput.set(multiclass_confusion_matrix.groundTruthLabels, groundTruth)

# Compute quality metrics and get the quality metrics
# returns ResultCollection class from daal.algorithms.multi_class_classifier.quality_metric_set
qualityMetricResult = qualityMetricSet.compute().getResult(quality_metric_set.confusionMatrix)
confusionMatrix = qualityMetricResult.get(multiclass_confusion_matrix.confusionMatrix)

# Read the whole confusion matrix. The block is copied and released right away,
# so the table is not kept locked while the (large) matrix is printed.
bd = BlockDescriptor()
nrows = confusionMatrix.getNumberOfRows()
confusionMatrix.getBlockOfRows(0, nrows, readOnly, bd)
npa = np.copy(bd.getArray())
confusionMatrix.releaseBlockOfRows(bd)
print(npa)

# Read the single row of aggregated metrics; flatten() returns a copy, so the
# values stay valid after the block has been released.
qualityMetricsTable = qualityMetricResult.get(multiclass_confusion_matrix.multiClassMetrics)
qualityMetricsTable.getBlockOfRows(0, 1, readOnly, bd)
qualityMetricsData = bd.getArray().flatten()
qualityMetricsTable.releaseBlockOfRows(bd)

# (format string, index of the metric within the metrics row), in print order
metricFormats = (
    ("Average accuracy: {0:.3f}", multiclass_confusion_matrix.averageAccuracy),
    ("Error rate:       {0:.3f}", multiclass_confusion_matrix.errorRate),
    ("Micro precision:  {0:.3f}", multiclass_confusion_matrix.microPrecision),
    ("Micro recall:     {0:.3f}", multiclass_confusion_matrix.microRecall),
    ("Micro F-score:    {0:.3f}", multiclass_confusion_matrix.microFscore),
    ("Macro precision:  {0:.3f}", multiclass_confusion_matrix.macroPrecision),
    ("Macro recall:     {0:.3f}", multiclass_confusion_matrix.macroRecall),
    ("Macro F-score:    {0:.3f}", multiclass_confusion_matrix.macroFscore),
)
for metricFormat, metricId in metricFormats:
    print(metricFormat.format(qualityMetricsData[metricId]))


### Summary
In this lab, we learned a widely used algorithm for document classification: Multinomial Naive Bayes. We saw how to apply it to the 20 Newsgroups dataset. We studied and practiced the pyDAAL API for this algorithm.