In [1]:
import numpy as np
import matplotlib.pyplot as plt
import csv
import pandas as pd
from math import pi, exp, sqrt
%matplotlib inline

In [2]:
# Tab-separated input file without a header row, so pandas assigns integer
# column labels (0, 1, 2, ...).
input_path = "Example.tsv"
input_ds = pd.read_csv(
    input_path,
    sep="\t",
    header=None,
)

In [3]:
# Preview the first rows of the frame exactly as it was loaded.
input_ds.head()

Unnamed: 0,0,1,2,3
0,A,-1.525735,1.67408,
1,A,6.069158,5.152899,
2,A,-4.146633,1.081567,
3,A,-4.994697,2.974933,
4,A,-0.837513,-1.655131,


In [4]:
# Drop every column that contains at least one NaN (dropna's default is
# how='any'). In the preview above this removes the empty trailing column 3.
# NOTE(review): a feature column with even a single missing value would be
# dropped as well - consider how='all' if only fully empty columns should go;
# confirm against the complete file first.
input_ds = input_ds.dropna(axis = 'columns')

In [5]:
# Re-check the frame after the dropna step in the previous cell.
input_ds.head()

Unnamed: 0,0,1,2
0,A,-1.525735,1.67408
1,A,6.069158,5.152899
2,A,-4.146633,1.081567
3,A,-4.994697,2.974933
4,A,-0.837513,-1.655131


In [6]:
# Column count of the cleaned frame (label column plus all attribute columns).
n_cols = input_ds.shape[1]
print("The number of columns:", n_cols)

The number of columns: 3


In [7]:
# Column 0 is named 'labels'; every following column becomes x_0, x_1, ...
col_name = [
    'labels' if position == 0 else 'x_{}'.format(position - 1)
    for position in range(n_cols)
]

print(col_name)

['labels', 'x_0', 'x_1']


In [8]:
# Replace the default integer column labels with the generated names.
# This assigns to the existing frame, i.e. input_ds is modified in place.
input_ds.columns = col_name
input_ds.head()

Unnamed: 0,labels,x_0,x_1
0,A,-1.525735,1.67408
1,A,6.069158,5.152899
2,A,-4.146633,1.081567
3,A,-4.994697,2.974933
4,A,-0.837513,-1.655131


In [9]:
# Distinct class labels (in sorted order) and the number of rows per label.
vals, counts = np.unique(input_ds['labels'], return_counts=True)
print(vals, counts, sep="\n")

['A' 'B']
[200 200]


In [10]:
# Class-label column as a NumPy array, one label per row.
c = np.asarray(input_ds['labels'])
print(c[:5])

['A' 'A' 'A' 'A' 'A']


In [11]:
# Attribute matrix: every column after the label column, as a NumPy array.
x = np.asarray(input_ds.iloc[:, 1:n_cols])
print(x[:5, :])
print("Shape of x:", x.shape)

[[-1.525735  1.67408 ]
 [ 6.069158  5.152899]
 [-4.146633  1.081567]
 [-4.994697  2.974933]
 [-0.837513 -1.655131]]
Shape of x: (400, 2)


In [12]:
# Scratch example (not part of the classifier): iterating directly over the
# elements of a list.
j = ['a','b','c']
for i in j:
    print(i)
# Example of a for loop over a list

a
b
c


In [13]:
def prob_class(class_col, counts):
    """Return the prior probability of every class.

    Parameters
    ----------
    class_col : sequence
        One class label per sample; only its length is used.
    counts : iterable of numbers
        Number of samples belonging to each class.

    Returns
    -------
    list
        ``count / len(class_col)`` for each entry of ``counts``, in the same
        order as ``counts``.
    """
    return [n_in_class / len(class_col) for n_in_class in counts]

In [14]:
# Prior probability of each class, in the same order as `counts`.
p_class = prob_class(c,counts)

In [15]:
# Show the class priors computed in the previous cell.
print(p_class)

[0.5, 0.5]


In [16]:
def retrieve_class_index(class_cols, vals):
    """Locate the positions of every class value inside ``class_cols``.

    Parameters
    ----------
    class_cols : numpy.ndarray
        Array of class labels, one per sample.
    vals : iterable
        The class values to look up.

    Returns
    -------
    list
        One entry per value in ``vals`` (same order). Each entry is the raw
        result of ``np.where(class_cols == value)``, i.e. a tuple of index
        arrays.
    """
    return [np.where(class_cols == class_value) for class_value in vals]

def mean(x, index_list, counts):
    """Compute the per-class mean of every attribute.

    Parameters
    ----------
    x : array-like
        Sample matrix of shape (n_samples, n_attributes). A 1-D input is
        treated as a single attribute column.
    index_list : list
        One index per class selecting that class's rows of ``x``; the tuples
        returned by ``np.where`` on a 1-D label array work, as do plain index
        arrays.
    counts : sequence of numbers
        Number of samples in each class, aligned with ``index_list``.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_classes, n_attributes). The result is always 2-D,
        also for a single class or a single attribute, so that
        ``result[cls, att]`` and ``result.shape[1]`` keep working.
    """
    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, np.newaxis]

    class_means = []
    for class_rows, count in zip(index_list, counts):
        # x[(idx,)] is the same as x[idx], so np.where tuples index directly.
        class_means.append(x[class_rows].sum(axis=0) / count)

    # Explicit reshape instead of np.squeeze: squeeze would also drop a
    # length-1 class or attribute axis.
    return np.asarray(class_means).reshape(len(class_means), x.shape[1])

In [24]:
def variance(x, index_list, mean_arr, counts):
    """Compute the per-class sample variance of every attribute.

    Parameters
    ----------
    x : array-like
        Sample matrix of shape (n_samples, n_attributes). A 1-D input is
        treated as a single attribute column.
    index_list : list
        One index per class selecting that class's rows of ``x``; the tuples
        returned by ``np.where`` on a 1-D label array work, as do plain index
        arrays.
    mean_arr : array-like
        Per-class attribute means holding n_classes * n_attributes values.
        Both the 2-D layout and a squeezed 1-D layout (single class or single
        attribute) are accepted.
    counts : sequence of numbers
        Number of samples in each class, aligned with ``index_list``.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_classes, n_attributes) with the unbiased
        (``count - 1`` denominator) variance. Always 2-D.
    """
    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, np.newaxis]

    n_classes = len(index_list)
    n_attributes = x.shape[1]
    # Restore the (class, attribute) layout so that a squeezed 1-D mean array
    # is never indexed as if it held one scalar mean per class.
    means = np.asarray(mean_arr).reshape(n_classes, n_attributes)

    class_variances = []
    for class_rows, class_mean, count in zip(index_list, means, counts):
        squared_dev = (x[class_rows] - class_mean) ** 2
        class_variances.append(squared_dev.sum(axis=0) / (count - 1.0))

    # Explicit reshape instead of np.squeeze: squeeze would also drop a
    # length-1 class or attribute axis.
    return np.asarray(class_variances).reshape(n_classes, n_attributes)

In [28]:
def likelihood(x, mean, variance):
    """Evaluate the univariate Gaussian density at ``x``.

    Uses the scalar ``sqrt`` and ``exp`` from the ``math`` module, so all
    three arguments are expected to be single numbers.

    Parameters
    ----------
    x : number
        Point at which the density is evaluated.
    mean : number
        Mean of the Gaussian.
    variance : number
        Variance of the Gaussian.

    Returns
    -------
    float
        ``exp(-(x - mean)**2 / (2 * variance)) / sqrt(2 * pi * variance)``.
    """
    normaliser = sqrt(2 * pi * variance)
    squared_distance = (x - mean) ** 2
    gaussian_kernel = exp(-(squared_distance / (2 * variance)))
    return gaussian_kernel / normaliser

In [19]:
# Scratch check of 2-D indexing: print each attribute value of the first row
# of x only.
for i in range(len(x)):
    for j in range(x.shape[1]):
        print(x[i,j])
    break  # leave the outer loop after the first row

-1.525735
1.67408


In [25]:
# Estimate the model parameters from the full data set.
# Row positions of each class, in the order of `vals`
index_list = retrieve_class_index(c, vals)
# Class priors
prior_arr = prob_class(c, counts)
# Per-class mean of every attribute
mean_arr = mean(x,index_list,counts)
# Per-class variance of every attribute, centred on mean_arr
variance_arr = variance(x, index_list, mean_arr, counts)

In [23]:
# Class priors, one entry per class.
print(prior_arr)

[0.5, 0.5]


In [21]:
# Attribute means: one row per class, one column per attribute.
print(mean_arr)

[[ 0.95684796  1.18431146]
 [ 2.47305036 -3.65505485]]


In [26]:
# Attribute variances: one row per class, one column per attribute.
print(variance_arr)

[[17.38673134  7.88694462]
 [ 1.32068366  6.85986019]]


In [29]:
# Hand-worked check for the first sample (row 0) against class index 0:
# prior times the Gaussian density of each of its two attributes.
# Note: this is the unnormalised joint score prior * likelihood, not a
# probability that sums to 1 over the classes.
likelihood_01 = likelihood(x[0,0], mean_arr[0,0], variance_arr[0,0])
likelihood_02 = likelihood(x[0,1], mean_arr[0,1], variance_arr[0,1])
Probability_A = prior_arr[0] * likelihood_01 * likelihood_02
print(Probability_A)

0.005605940528130784


In [31]:
# Same hand-worked check for the first sample (row 0), now against class
# index 1. As before, the value is the unnormalised joint score
# prior * likelihood, not a normalised probability.
likelihood_03 = likelihood(x[0,0], mean_arr[1,0], variance_arr[1,0])
likelihood_04 = likelihood(x[0,1], mean_arr[1,1], variance_arr[1,1])
Probability_B = prior_arr[1] * likelihood_03 * likelihood_04
print(Probability_B)

7.836465940061488e-06


In [34]:
# Number of classes (rows) and attributes (columns) in the parameter table.
n_class, n_att = mean_arr.shape[:2]
print(n_class, n_att, sep="\n")

2
2


In [38]:
# Classify every row of x with Gaussian naive Bayes.
#
# The scores are computed in log space: multiplying many per-attribute
# densities can underflow to 0.0 for every class, which would make the argmax
# meaningless. Summing log densities gives the same ranking without that risk.

# Log of the class priors, one entry per class.
log_prior = np.log(np.asarray(prior_arr, dtype=float))

# Broadcast the rows against the classes:
# (n_rows, 1, n_att) - (n_class, n_att) -> (n_rows, n_class, n_att)
squared_dev = (x[:, np.newaxis, :] - mean_arr) ** 2

# Log of the Gaussian density for every (row, class, attribute) combination.
log_density = -0.5 * np.log(2 * pi * variance_arr) - squared_dev / (2 * variance_arr)

# Naive independence assumption: the joint log-likelihood is the sum over the
# attributes. Adding the log prior gives the unnormalised log posterior.
log_posterior = log_prior + log_density.sum(axis=2)

# Index (into vals) of the best-scoring class for each row, kept as a plain
# list of Python ints so later cells can treat it as before.
argmax_list = np.argmax(log_posterior, axis=1).tolist()
print(argmax_list)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 

In [39]:
# Convert the list of predicted class indices into a NumPy array
# (the name argmax_list is rebound from a list to an array here).
argmax_list = np.array(argmax_list)
argmax_list.shape

(400,)

In [40]:
# Translate each predicted class index back into its class label by indexing
# the label array with the whole index array at once.
map_idx2class = vals[np.asarray(argmax_list, dtype=int)]
print(map_idx2class)

['A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'B' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'B' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'B'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'B' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A'
 'A' 'B' 'B' 'B' 'B' 'B' 'A' 'B' 'A' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B'
 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B'
 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B

In [41]:
# One False entry for every row whose predicted label differs from its true
# label; correctly classified rows add nothing.
misclassification_list = [
    False
    for predicted, actual in zip(map_idx2class, c)
    if predicted != actual
]

In [43]:
# Show the misclassification markers (one False per wrongly classified row).
print(misclassification_list)

[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]


In [50]:
# Number of misclassified rows: the list holds one False per mismatch.
n_mis = misclassification_list.count(False)

In [52]:
# Result table with one row per class:
# mean and variance of attribute 0, mean and variance of attribute 1, ...,
# followed by the class prior as the last column.
output_total = np.array([
    [
        stat
        for att in range(n_att)
        for stat in (mean_arr[cls, att], variance_arr[cls, att])
    ] + [prior_arr[cls]]
    for cls in range(n_class)
])
print(output_total)

[[ 0.95684796 17.38673134  1.18431146  7.88694462  0.5       ]
 [ 2.47305036  1.32068366 -3.65505485  6.85986019  0.5       ]]


In [53]:
# Write the per-class parameter rows, followed by a final row holding the
# number of misclassified samples, as a tab-separated file.
output_path = "My_Solution.tsv"
# newline='' is what the csv module asks for: the writer emits its own line
# terminator, and without it Windows would translate that into '\r\r\n'.
# The with-block closes the file, so no explicit close() is needed afterwards.
with open(output_path, 'w', newline='') as write_tsv:
    tsv_writer = csv.writer(write_tsv, delimiter='\t')
    tsv_writer.writerows(output_total)
    tsv_writer.writerow([n_mis])