"""A module for naive Bayes classifiers"""
import numpy as np


class GaussianNBClassifier:
    def __init__(self, eps=1e-6):
        r"""
        A naive Bayes classifier for real-valued data.

        Notes
        -----
        The naive Bayes model assumes the features of each training example
        :math:`\mathbf{x}` are mutually independent given the example label
        *y*:

        .. math::

            P(\mathbf{x}_i \mid y_i) = \prod_{j=1}^M P(x_{i,j} \mid y_i)
        where :math:`M` is the dimension of the :math:`i^{th}` example
        :math:`\mathbf{x}_i` and :math:`y_i` is the label associated with the
        :math:`i^{th}` example.
        Combining the conditional independence assumption with a simple
        application of Bayes' theorem gives the naive Bayes classification
        rule:

        .. math::

            \hat{y} &= \arg \max_y P(y \mid \mathbf{x}) \\
                    &= \arg \max_y P(y) P(\mathbf{x} \mid y) \\
                    &= \arg \max_y P(y) \prod_{j=1}^M P(x_j \mid y)

        In the final expression, the prior class probability :math:`P(y)` can
        be specified in advance or estimated empirically from the training
        data.

        In the Gaussian version of the naive Bayes model, the feature
        likelihood is assumed to be normally distributed for each class:

        .. math::

            \mathbf{x}_i \mid y_i = c, \theta \sim \mathcal{N}(\mu_c, \Sigma_c)

        where :math:`\theta` is the set of model parameters: :math:`\{\mu_1,
        \Sigma_1, \ldots, \mu_K, \Sigma_K\}`, :math:`K` is the total number of
        unique classes present in the data, and the parameters for the Gaussian
        associated with class :math:`c`, :math:`\mu_c` and :math:`\Sigma_c`
        (where :math:`1 \leq c \leq K`), are estimated via MLE from the set of
        training examples with label :math:`c`.
        Parameters
        ----------
        eps : float
            A value added to the variance to prevent numerical error. Default
            is 1e-6.

        Attributes
        ----------
        parameters : dict
            Dictionary of model parameters: "mean", the `(K, M)` array of
            feature means under each class, "sigma", the `(K, M)` array of
            feature variances under each class, and "prior", the `(K,)` array
            of empirical prior probabilities for each class label.
        hyperparameters : dict
            Dictionary of model hyperparameters
        labels : :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
            An array containing the unique class labels for the training
            examples.
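
        Examples
        --------
        A minimal usage sketch on synthetic data (the two well-separated
        classes below are illustrative, not part of the original module):

        >>> import numpy as np
        >>> np.random.seed(0)
        >>> X = np.vstack([
        ...     np.random.randn(50, 3) - 2.0,  # class 0, centered at -2
        ...     np.random.randn(50, 3) + 2.0,  # class 1, centered at +2
        ... ])
        >>> y = np.array([0] * 50 + [1] * 50)
        >>> model = GaussianNBClassifier().fit(X, y)
        >>> model.parameters["mean"].shape
        (2, 3)
        >>> preds = model.predict(X)
        >>> preds.shape
        (100,)
        >>> bool(np.mean(preds == y) > 0.9)  # near-perfect on separable data
        True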
"""
self.labels = None
self.hyperparameters = {"eps": eps}
self.parameters = {
"mean": None, # shape: (K, M)
"sigma": None, # shape: (K, M)
"prior": None, # shape: (K,)
}

    def fit(self, X, y):
        r"""
        Fit the model parameters via maximum likelihood.

        Notes
        -----
        The model parameters are stored in the :py:attr:`parameters
        <numpy_ml.linear_models.GaussianNBClassifier.parameters>` attribute.
        The following keys are present:

            "mean": :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
                Feature means for each of the `K` label classes
            "sigma": :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
                Feature variances for each of the `K` label classes
            "prior": :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
                Prior probability of each of the `K` label classes, estimated
                empirically from the training data
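
        With :math:`N_c` denoting the number of training examples with label
        :math:`c` (out of `N` total), these are the per-class maximum
        likelihood estimates computed below, with squares taken elementwise
        across features and `eps` added to each variance for numerical
        stability:

        .. math::

            \hat{\mu}_c = \frac{1}{N_c} \sum_{i : y_i = c} \mathbf{x}_i \qquad
            \hat{\sigma}^2_c = \frac{1}{N_c} \sum_{i : y_i = c}
                (\mathbf{x}_i - \hat{\mu}_c)^2 \qquad
            \hat{P}(y = c) = \frac{N_c}{N}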

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset consisting of `N` examples, each of dimension `M`
        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The class label for each of the `N` examples in `X`

        Returns
        -------
        self : :class:`GaussianNBClassifier <numpy_ml.linear_models.GaussianNBClassifier>` instance
        """  # noqa: E501
        P = self.parameters
        H = self.hyperparameters

        self.labels = np.unique(y)

        K = len(self.labels)
        N, M = X.shape

        P["mean"] = np.zeros((K, M))
        P["sigma"] = np.zeros((K, M))
        P["prior"] = np.zeros((K,))

        for i, c in enumerate(self.labels):
            X_c = X[y == c, :]

            P["mean"][i, :] = np.mean(X_c, axis=0)
            P["sigma"][i, :] = np.var(X_c, axis=0) + H["eps"]
            P["prior"][i] = X_c.shape[0] / N
        return self

    def predict(self, X):
        """
        Use the trained classifier to predict the class label for each example
        in **X**.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset of `N` examples, each of dimension `M`

        Returns
        -------
        labels : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The predicted class labels for each example in `X`
        """
        return self.labels[self._log_posterior(X).argmax(axis=1)]

    def _log_posterior(self, X):
        r"""
        Compute the (unnormalized) log posterior for each class.

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset of `N` examples, each of dimension `M`

        Returns
        -------
        log_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
            Unnormalized log posterior probability of each class for each
            example in `X`
        """
        K = len(self.labels)
        log_posterior = np.zeros((X.shape[0], K))
        for i in range(K):
            log_posterior[:, i] = self._log_class_posterior(X, i)
        return log_posterior

    def _log_class_posterior(self, X, class_idx):
        r"""
        Compute the (unnormalized) log posterior for the label at index
        `class_idx` in :py:attr:`labels <numpy_ml.linear_models.GaussianNBClassifier.labels>`.

        Notes
        -----
        The unnormalized log posterior for example :math:`\mathbf{x}_i` and
        class :math:`c` is:

        .. math::

            \log P(y_i = c \mid \mathbf{x}_i, \theta)
                &\propto \log P(y = c \mid \theta) +
                    \log P(\mathbf{x}_i \mid y_i = c, \theta) \\
                &\propto \log P(y = c \mid \theta) +
                    \sum_{j=1}^M \log P(x_{i,j} \mid y_i = c, \theta)

        In the Gaussian naive Bayes model, the feature likelihood for class
        :math:`c`, :math:`P(\mathbf{x}_i \mid y_i = c, \theta)`, is assumed to
        be normally distributed:

        .. math::

            \mathbf{x}_i \mid y_i = c, \theta \sim \mathcal{N}(\mu_c, \Sigma_c)
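
        Since each feature is modeled independently, :math:`\Sigma_c` is
        effectively diagonal with per-feature variances
        :math:`\sigma^2_{c,1}, \ldots, \sigma^2_{c,M}`, and the log likelihood
        computed below reduces to a sum of univariate normal log densities:

        .. math::

            \log P(\mathbf{x}_i \mid y_i = c, \theta) =
                -\frac{1}{2} \sum_{j=1}^M \left[ \log(2 \pi \sigma^2_{c,j}) +
                \frac{(x_{i,j} - \mu_{c,j})^2}{\sigma^2_{c,j}} \right]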

        Parameters
        ----------
        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
            A dataset of `N` examples, each of dimension `M`
        class_idx : int
            The index of the current class in :py:attr:`labels`

        Returns
        -------
        log_class_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            Unnormalized log probability of the label at index `class_idx`
            in :py:attr:`labels <numpy_ml.linear_models.GaussianNBClassifier.labels>`
            for each example in `X`
        """  # noqa: E501
        P = self.parameters
        mu = P["mean"][class_idx]
        prior = P["prior"][class_idx]
        sigsq = P["sigma"][class_idx]

        # log likelihood of X under N(mu, sigsq), summed over the M features
        log_likelihood = -0.5 * np.sum(np.log(2 * np.pi * sigsq))
        log_likelihood -= 0.5 * np.sum(((X - mu) ** 2) / sigsq, axis=1)
        return log_likelihood + np.log(prior)
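

if __name__ == "__main__":
    # Quick self-check sketch (not part of the original numpy-ml module):
    # verify that the hand-rolled per-class log posterior above matches the
    # sum of univariate normal log densities from scipy.stats plus the log
    # prior. Assumes scipy is installed; remove this block if it is not.
    from scipy.stats import norm

    rng = np.random.RandomState(0)
    X = rng.randn(10, 4)
    y = np.array([0] * 5 + [1] * 5)

    model = GaussianNBClassifier().fit(X, y)
    P = model.parameters

    for idx in range(len(model.labels)):
        mu, var, prior = P["mean"][idx], P["sigma"][idx], P["prior"][idx]
        expected = norm.logpdf(X, loc=mu, scale=np.sqrt(var)).sum(axis=1)
        expected += np.log(prior)
        assert np.allclose(model._log_class_posterior(X, idx), expected)

    print("Log class posteriors match scipy.stats.norm: OK")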