In [1]:
import numpy as np

In [26]:
def fit(X_train, Y_train):
    
    # here result is a dictionary
    # which will store the distint TARGET Values to work upon
    
    result = {}
    class_values = set(Y_train)
    
    # current_values will have all the distinct values in TARGET (Y_train)
    
    for current_class in class_values:
        
        # here each value in result dictionary will store 
        # another dictionary to count the number of rows belonging to the that class
        
        result[current_class] = {}
        
        # we are storing total data points which are in set
        
        result["total_data_points"] = len(Y_train)
        
        # we are counting the number of rows related to current class and storing in form of TRUE and FALSE
        
        current_class_rows = (Y_train == current_class)
        
        # getting all the X train and Y train related to the current class
        
        X_train_current = X_train[current_class_rows]
        Y_train_current = Y_train[current_class_rows]
        
        # counting the number of features in X train
        
        num_features = X_train.shape[1]
        
        # here it will store the total rows belonging to the current class
        
        result[current_class]["total_count"] = len(Y_train_current)
        for j in range(1, num_features+1):
            result[current_class][j] = {}
            
            # here it is finding the all distinct values in feature 'j'
            
            all_posible_values = set(X_train[:, j-1])
            for current_values in all_posible_values:
                
                # we are storing the total count of the every value which is present in the 'Jth' column 
                
                result[current_class][j][current_values] = (X_train_current[:, j-1] == current_values).sum()
    return result
        
        

In [31]:
def probability(dictionary, x, current_class):
    output = np.log(dictionary[current_class]["total_count"]) - np.log(dictionary["total_data_points"])
    num_features = len(dictionary[current_class].keys()) - 1
    for j in range(1, num_features +1):
        xj = x[j-1]
        count_current_class_with_value_xj = dictionary[current_class][j][xj] + 1
        count_current_class = dictionary[current_class]["total_count"] + len(dictionary[current_class][j].keys())
        current_xj_probability = np.log(count_current_class_with_value_xj) - np.log(count_current_class)
        output = output + current_xj_probability
    return output

In [12]:
def predictSinglePoint(dictionary, x):
    classes = dictionary.keys()
    best_p = -1000
    best_class = -1
    first_run = True
    for current_class in classes:
        if(current_class == "total_data_points"):
            continue
        p_current_class = probability(dictionary, x, current_class)
        if(first_run or (p_current_class > best_p)):
            best_p = p_current_class
            best_class = current_class
        first_run = False
    return best_class

In [29]:
def predict(dictionary, X_test):
    y_pred = []
    for x in X_test:
        x_class = predictSinglePoint(dictionary, x)
        y_pred.append(x_class)
    return y_pred

In [16]:
def makeLable(columns): 
    second_limit = columns.mean()
    first_limit = 0.5*second_limit
    third_limit = 1.5*second_limit
    for i in range(0, len(columns)):
        if(columns[i] < first_limit):
            columns[i] = 0
        elif(columns[i] < second_limit):
            columns[i]  = 1
        elif(columns[i] < third_limit):
            columns[i] = 2
        else:
            columns[i] = 3
    return columns

In [20]:
from sklearn import datasets

iris = datasets.load_iris()

In [19]:
X = iris.data
Y = iris.target

In [21]:
for i in range(0, X.shape[-1]):
    X[:, i] = makeLable(X[:, i])

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y)


In [27]:
dictionary = fit(X_train, Y_train)

In [32]:
Y_pred = predict(dictionary, X_test)

In [36]:
from sklearn.metrics import confusion_matrix

confusion_matrix(Y_test, Y_pred)

array([[14,  0,  0],
       [ 0,  9,  0],
       [ 0,  1, 14]])