## Project 1: Density Estimation and Classification - Code

### Preparation

In [1]:
import numpy
import scipy.io
import math
import geneNewData


myID = "5486"  # your ID here

# Presumably writes the ID-specific training .mat files read below -- confirm in geneNewData.
geneNewData.geneData(myID)

# Training images for digit 0 (file name depends on myID).
Numpyfile0 = scipy.io.loadmat("digit0_stu_train" + myID + ".mat")
train0 = Numpyfile0.get("target_img")

# Training images for digit 1 (file name depends on myID).
Numpyfile1 = scipy.io.loadmat("digit1_stu_train" + myID + ".mat")
train1 = Numpyfile1.get("target_img")

# Test images; these file names do not depend on myID.
Numpyfile2 = scipy.io.loadmat("digit0_testset" + ".mat")
test0 = Numpyfile2.get("target_img")
Numpyfile3 = scipy.io.loadmat("digit1_testset" + ".mat")
test1 = Numpyfile3.get("target_img")

# Report how many images each of the four sets contains.
print([len(images) for images in (train0, train1, test0, test1)])
print("Your trainset and testset are generated successfully!")

[5000, 5000, 980, 1135]
Your trainset and testset are generated successfully!


### Task 1: Feature extraction


Assume the features are independent of each other given the digit, and that each feature follows a normal distribution.

- feature 1: average brightness of the image 
- feature 2: standard deviation of the brightness

$$p(f_1 | \,f_2, \text{digit}) = p(f_1 | \,\text{digit})$$
$$p(f_2 | \,f_1, \text{digit}) = p(f_2 | \,\text{digit})$$



In [2]:
def feature1(img):
    """Return the average brightness of ``img``.

    Args:
        img: Array-like of pixel brightness values.

    Returns:
        The mean of all elements of ``img`` (``numpy.mean`` over the
        flattened input).
    """
    return numpy.mean(img)


def feature2(img):
    """Return the standard deviation of the brightness of ``img``.

    Args:
        img: Array-like of pixel brightness values.

    Returns:
        The standard deviation of all elements of ``img``. ``numpy.std``
        is called with its default ``ddof=0``, i.e. the population
        standard deviation.
    """
    return numpy.std(img)

In [3]:
# One row per training image, one column per feature: [feature1, feature2].
train0_features = numpy.array(
    [[extract(img) for extract in (feature1, feature2)] for img in train0]
)
train1_features = numpy.array(
    [[extract(img) for extract in (feature1, feature2)] for img in train1]
)

# Sanity check: element type and the shape of both feature matrices.
(train0_features.dtype, train0_features.shape, train1_features.shape)

(dtype('float64'), (5000, 2), (5000, 2))

### Task 2: Parameter calculation

$$\begin{array}{ccc}\hat{\mu} = \frac{ \sum_i{x_i}}{n} && \widehat{\sigma^2} = \frac{\sum_i{(x_i - \hat{\mu})^2}}{n} \end{array}$$



In [4]:
# Pull out each feature column once per digit.
digit0_f1, digit0_f2 = train0_features[:, 0], train0_features[:, 1]
digit1_f1, digit1_f2 = train1_features[:, 0], train1_features[:, 1]

# numpy.var uses its default ddof=0 here, i.e. it divides by n.
mean0_f1 = numpy.mean(digit0_f1)
var0_f1 = numpy.var(digit0_f1)
mean0_f2 = numpy.mean(digit0_f2)
var0_f2 = numpy.var(digit0_f2)
mean1_f1 = numpy.mean(digit1_f1)
var1_f1 = numpy.var(digit1_f1)
mean1_f2 = numpy.mean(digit1_f2)
var1_f2 = numpy.var(digit1_f2)

print(f"Mean_of_feature1_for_digit0: {mean0_f1}\nVariance_of_feature1_for_digit0: {var0_f1}\n")
print(f"Mean_of_feature2_for_digit0: {mean0_f2}\nVariance_of_feature2_for_digit0: {var0_f2}\n")
print(f"Mean_of_feature1_for_digit1: {mean1_f1}\nVariance_of_feature1_for_digit1: {var1_f1}\n")
print(f"Mean_of_feature2_for_digit1: {mean1_f2}\nVariance_of_feature2_for_digit1: {var1_f2}\n")


Mean_of_feature1_for_digit0: 44.20389362244898
Variance_of_feature1_for_digit0: 116.7009513315956

Mean_of_feature2_for_digit0: 87.41907699909902
Variance_of_feature2_for_digit0: 102.46490855207279

Mean_of_feature1_for_digit1: 19.43962244897959
Variance_of_feature1_for_digit1: 31.98890083543315

Mean_of_feature2_for_digit1: 61.44770668123185
Variance_of_feature2_for_digit1: 83.54555141482662



### Task 3: Naïve Bayes Classifier implementation & label predictions for test data

In [5]:
# Same two features as for the training images, one row per test image.
test0_features = numpy.array(
    [[extract(img) for extract in (feature1, feature2)] for img in test0]
)
test1_features = numpy.array(
    [[extract(img) for extract in (feature1, feature2)] for img in test1]
)

In [6]:
# Class priors are the share of each digit among all training samples.
train0_len = len(train0_features)
train1_len = len(train1_features)
train_len = train0_len + train1_len

prior0 = train0_len / train_len
prior1 = train1_len / train_len

#### Normal probability density function

$${\displaystyle \mathrm{Norm}(x, \mu, \sigma^2) = \dfrac{1}{\sqrt{2\pi\sigma^2}} \exp\left(- \dfrac{(x-\mu)^2}{2\sigma^2}\right) }$$

In [7]:
def normal_pdf(x, mean, var):
    """Evaluate the normal probability density function at ``x``.

    Args:
        x: Point(s) at which to evaluate the density.
        mean: Mean of the distribution.
        var: Variance (not standard deviation) of the distribution.

    Returns:
        ``1 / sqrt(2*pi*var) * exp(-(x - mean)**2 / (2*var))``.
    """
    normaliser = 1 / numpy.sqrt(2 * numpy.pi * var)
    exponent = -((x - mean) ** 2) / (2 * var)
    return normaliser * numpy.exp(exponent)

#### Bayesian probability

$${\displaystyle p(C_{k}\mid \mathbf {x} )={\frac {p(C_{k})\ p(\mathbf {x} \mid C_{k})}{p(\mathbf {x} )}}\, \Leftrightarrow \, \text{posterior}={\frac {{\text{prior}}\times {\text{likelihood}}}{\text{evidence}}} }$$


$$
\begin{array}{rll}
\text{prior} \times \text{likelihood} \Leftrightarrow p(C_{k},x_{1},\ldots ,x_{n}) &= p(x_{1},\ldots ,x_{n},C_{k}) \\
&= p(x_{1}\mid x_{2},\ldots ,x_{n},C_{k}) \cdot p(x_{2},\ldots ,x_{n},C_{k}) \\
&= p(x_{1}\mid x_{2},\ldots ,x_{n},C_{k}) \cdot p(x_{2}\mid x_{3},\ldots ,x_{n},C_{k}) \cdot p(x_{3},\ldots ,x_{n},C_{k}) \\
&= \cdots \\
&= p(x_{1}\mid x_{2},\ldots ,x_{n},C_{k}) \cdot p(x_{2}\mid x_{3},\ldots ,x_{n},C_{k}) \cdots p(x_{n-1}\mid x_{n},C_{k}) \cdot p(x_{n}\mid C_{k}) \cdot p(C_{k}) 
\end{array}
$$

#### Naive conditional independence

$$
\begin{array}{lll}
p(x_{i}\mid x_{i+1},\ldots ,x_{n},C_{k})=p(x_{i}\mid C_{k}) \Rightarrow & p(C_{k}\mid x_{1},\ldots ,x_{n}) &\propto p(C_{k},x_{1},\ldots ,x_{n}) \\
&&\propto p(C_{k}) \cdot p(x_{1}\mid C_{k}) \cdot p(x_{2}\mid C_{k}) \cdot p(x_{3}\mid C_{k}) \cdots \\
&&\propto p(C_{k}) \prod p(x_{i}\mid C_{k})
\end{array}
$$
$$
\begin{array}{lll}
\text{posterior} \Leftrightarrow p(C_{k}\mid x_{1},\ldots ,x_{n}) = \dfrac{ p(C_{k}) \prod p(x_{i}\mid C_{k})}{p(\mathbf{x})} & \text{where} & p(\mathbf{x}) = \sum p(C_{k}) \cdot p(\mathbf{x} \mid C_{k}) \Leftrightarrow \text{evidence}
\end{array}
$$

#### Classifier



$${\widehat {C_k}}={\underset {k}{\operatorname {argmax} }}\ p(C_{k})\displaystyle \prod_i p(x_{i}\mid C_{k})\, = \, {\underset {k}{\operatorname {argmax} }}\, \{\text{posterior}\} $$



In [8]:
def classifier(test_features, prior0, prior1):
    """Label each sample as digit 0 or digit 1 with Gaussian Naive Bayes.

    The class-conditional parameters are read from the module-level names
    ``mean0_f1``, ``var0_f1``, ``mean0_f2``, ``var0_f2`` (digit 0) and
    ``mean1_f1``, ``var1_f1``, ``mean1_f2``, ``var1_f2`` (digit 1).

    The decision is ``argmax_k [log p(C_k) + sum_i log p(x_i | C_k)]``.
    This is the same argmax as the normalised posterior, because the
    evidence is a positive factor shared by both classes and ``log`` is
    monotonic. Working with log-densities avoids the failure of the direct
    form: for samples far from both class means, both likelihoods underflow
    to 0, the evidence becomes 0, and the posteriors turn into 0/0 = NaN,
    which ``argmax`` silently resolves to class 0.

    Args:
        test_features: Sequence or array of ``(feature1, feature2)`` pairs,
            one per sample.
        prior0: Prior probability of digit 0.
        prior1: Prior probability of digit 1.

    Returns:
        A list with one integer label (0 or 1) per sample; ties go to 0.
    """

    def _log_normal_pdf(x, mean, var):
        # Log of the Gaussian density, computed directly so that the density
        # itself (which may underflow to 0) is never formed.
        return -0.5 * numpy.log(2 * numpy.pi * var) - ((x - mean) ** 2) / (2 * var)

    # Force an (n, 2) layout so that an empty input yields an empty result.
    features = numpy.asarray(test_features, dtype=float).reshape(-1, 2)
    f1, f2 = features[:, 0], features[:, 1]

    # log(0) = -inf is the correct score for a zero prior; only the warning
    # is suppressed.
    with numpy.errstate(divide="ignore"):
        log_prior0, log_prior1 = numpy.log(prior0), numpy.log(prior1)

    score0 = (
        log_prior0
        + _log_normal_pdf(f1, mean0_f1, var0_f1)
        + _log_normal_pdf(f2, mean0_f2, var0_f2)
    )
    score1 = (
        log_prior1
        + _log_normal_pdf(f1, mean1_f1, var1_f1)
        + _log_normal_pdf(f2, mean1_f2, var1_f2)
    )

    # Column k holds the score of class k; argmax returns the first maximum.
    scores = numpy.stack([score0, score1], axis=1)
    return list(numpy.argmax(scores, axis=1))

In [9]:
# Predict a label for every test image, digit-0 set first, then digit-1 set.
test0_labels, test1_labels = (
    classifier(features, prior0, prior1)
    for features in (test0_features, test1_features)
)

# Correct predictions per set: label 0 on the digit-0 set, label 1 on the digit-1 set.
(test0_labels.count(0), test1_labels.count(1))

(899, 1048)

### Task 4: Accuracy calculation

In [10]:
# Accuracy per test set = correctly labelled samples / total samples in that set.
correct0, total0 = test0_labels.count(0), len(test0_labels)
correct1, total1 = test1_labels.count(1), len(test1_labels)

Accuracy_for_digit0testset = correct0 / total0
Accuracy_for_digit1testset = correct1 / total1

print(f"Accuracy_for_digit0testset: {Accuracy_for_digit0testset}")
print(f"Accuracy_for_digit1testset: {Accuracy_for_digit1testset}")

Accuracy_for_digit0testset: 0.9173469387755102
Accuracy_for_digit1testset: 0.9233480176211454
