In [16]:
import numpy as np

In [17]:
def fit(x_train, y_train):
    result={}
    class_values = set(y_train)
    for current_class in class_values:
        result[current_class] = {}
        result["total_data"] = len(y_train)
        current_class_rows = (y_train == current_class)
        x_train_current = x_train[current_class_rows]
        y_train_current = y_train[current_class_rows]
        result[current_class]["total_count"] = len(y_train_current)
        num_features = x_train.shape[1]
        for j in range(1,num_features+1):
            result[current_class][j] = {}
            all_possible_values = set(x_train[:,j-1])
            for current_row in all_possible_values:
                result[current_class][j][current_row] = (x_train_current[:, j-1] == current_row).sum()
    return result

In [18]:
def probability(dictionary, x, current_class):
    output = np.log(dictionary[current_class]["total_count"]) - np.log(dictionary["total_data"])
    num_features = len(dictionary[current_class]) - 1
    for j in range(1, num_features+1):
        xj = x[j-1]
        count_current_class_with_xj = dictionary[current_class][j][xj] + 1
        count_current_class = dictionary[current_class]["total_count"] + len(dictionary[current_class][j].keys())
        current_xj_probability = np.log(count_current_class_with_xj) - np.log(count_current_class)
        output = output + current_xj_probability
    return output

In [19]:
def predictSinglePoint(dictionary, x):
    classes = dictionary.keys()
    best_p = -1000
    best_class = -1
    first_run = True
    for current_class in classes:
        if(current_class == "total_data"):
            continue
        p_current_class = probability(dictionary, x, current_class)
        if(first_run or p_current_class > best_p):
            best_p = p_current_class
            best_class = current_class
        first_run = False
    return best_class

In [20]:
def predict(dictionary, x_test):
    y_pred = []
    for i in x_test:
        result_class = predictSinglePoint(dictionary, i)
        y_pred.append(result_class)
    return y_pred

In [21]:
def makeLabelled(column):
    limit2 = column.mean()
    limit1 = 0.5*limit2
    limit3 = 1.5*limit2
    for i in range(0, len(column)):
        if(column[i]<limit1):
            column[i]=0
        elif(column[i]<limit2):
            column[i]=1
        elif(column[i]<limit3):
            column[i]=2
        else:
            column[i]=3
    return column
            

In [22]:
from sklearn import datasets
iris = datasets.load_iris()
x = iris.data
y = iris.target

In [23]:
for i in range(0, x.shape[-1]):
    x[:,i] = makeLabelled(x[:,1])

In [28]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [29]:
dictionary = fit(x_train, y_train)

In [30]:
y_pred = predict(dictionary, x_test)

In [31]:
from sklearn.metrics import confusion_matrix,classification_report
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.73      0.85      0.79        13
          1       0.61      0.88      0.72        16
          2       0.00      0.00      0.00         9

avg / total       0.51      0.66      0.57        38

[[11  2  0]
 [ 2 14  0]
 [ 2  7  0]]


  'precision', 'predicted', average, warn_for)
