In [1]:
import numpy as np

In [18]:
def fit(X_train,Y_train):
    """Count feature-value occurrences per class for a categorical Naive Bayes model.

    Parameters
    ----------
    X_train : 2-D array-like
        One row per sample, one column per (categorical) feature.
    Y_train : 1-D array-like
        Class label of each row of ``X_train``.

    Returns
    -------
    dict
        ``result["total_data"]`` is the number of training samples. For every
        class label ``c`` found in ``Y_train``:

        * ``result[c]["total_count"]`` is the number of samples of class ``c``;
        * ``result[c][j][v]`` (``j`` is the 1-based feature index) is the number
          of class-``c`` samples whose feature ``j`` equals ``v``. Every value
          ``v`` that occurs anywhere in column ``j`` gets an entry, so values
          that never occur within class ``c`` are stored with a count of 0.
    """
    # Boolean-mask and column indexing below need ndarrays; asarray lets callers
    # pass plain nested lists too and does not copy when given an ndarray.
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)

    # Everything here is independent of the class, so compute it once up front
    # instead of once per class.
    num_features = X_train.shape[1]
    values_per_feature = [set(X_train[:, col]) for col in range(num_features)]

    result = {"total_data": len(Y_train)}
    for current_class in set(Y_train):
        current_class_rows = (Y_train == current_class)
        X_train_current = X_train[current_class_rows]

        class_stats = {"total_count": len(X_train_current)}
        for j in range(1, num_features + 1):
            feature_column = X_train_current[:, j - 1]
            class_stats[j] = {
                current_value: (feature_column == current_value).sum()
                for current_value in values_per_feature[j - 1]
            }
        result[current_class] = class_stats
    return result

In [21]:
def log_probability(dictionary,x,current_class):
    """Return the log of the unnormalised Naive Bayes score of ``current_class`` for ``x``.

    The score is log P(class) plus, for every feature, the log of the
    Laplace-smoothed conditional probability of the observed value.

    Parameters
    ----------
    dictionary : dict
        Count table laid out as ``dictionary["total_data"]``,
        ``dictionary[cls]["total_count"]`` and ``dictionary[cls][j][value]``
        with 1-based feature index ``j``.
    x : sequence
        Feature values of a single point; ``x[j-1]`` belongs to feature ``j``.
    current_class : hashable
        Class label to score; must be a key of ``dictionary``.

    Returns
    -------
    float
        Log-score; larger means more likely.
    """
    class_stats = dictionary[current_class]
    class_total = class_stats["total_count"]
    output = np.log(class_total) - np.log(dictionary["total_data"])
    # Every key of class_stats except "total_count" is a feature index.
    num_features = len(class_stats) - 1
    for j in range(1, num_features + 1):
        value_counts = class_stats[j]
        # A value that never appeared in training has no entry; treat it as a
        # zero count instead of raising KeyError so smoothing can handle it.
        smoothed_count = value_counts.get(x[j - 1], 0) + 1  # Laplace correction
        smoothed_total = class_total + len(value_counts)
        output = output + (np.log(smoothed_count) - np.log(smoothed_total))
    return output

In [8]:
def probability(dictionary,x,current_class):
    """Return the unnormalised Naive Bayes score of ``current_class`` for ``x``.

    The score is P(class) multiplied, for every feature, by the
    Laplace-smoothed conditional probability of the observed value.

    Parameters
    ----------
    dictionary : dict
        Count table laid out as ``dictionary["total_data"]``,
        ``dictionary[cls]["total_count"]`` and ``dictionary[cls][j][value]``
        with 1-based feature index ``j``.
    x : sequence
        Feature values of a single point; ``x[j-1]`` belongs to feature ``j``.
    current_class : hashable
        Class label to score; must be a key of ``dictionary``.

    Returns
    -------
    float
        Product of the class prior and the smoothed per-feature probabilities.
    """
    class_stats = dictionary[current_class]
    class_total = class_stats["total_count"]
    output = class_total / dictionary["total_data"]
    # Every key of class_stats except "total_count" is a feature index.
    num_features = len(class_stats) - 1
    for j in range(1, num_features + 1):
        value_counts = class_stats[j]
        # A value that never appeared in training has no entry; treat it as a
        # zero count instead of raising KeyError so smoothing can handle it.
        smoothed_count = value_counts.get(x[j - 1], 0) + 1  # Laplace correction
        smoothed_total = class_total + len(value_counts)
        output = output * (smoothed_count / smoothed_total)
    return output

In [22]:
# x is the single point from X_test
def predictSinglePoint(dictionary,x):
    """Return the class label with the highest log-score for the point ``x``.

    Every key of ``dictionary`` other than ``"total_data"`` is treated as a
    class label and scored with ``log_probability``. On ties the class visited
    first wins. If there is no class key at all, ``-1`` is returned.
    """
    best_class = -1
    best_score = None  # None until the first class has been scored
    for label in dictionary:
        if label == "total_data":
            continue
        score = log_probability(dictionary, x, label)
        # The first scored class is always accepted; later ones must be strictly better.
        if best_score is None or score > best_score:
            best_score, best_class = score, label
    return best_class

In [23]:
# X_test holds the input points whose classes we have to predict
def predict(dictionary,X_test):
    """Predict a class for every point in ``X_test``.

    Iterates over ``X_test`` and returns a list holding, in the same order,
    the result of ``predictSinglePoint`` for each item.
    """
    return [predictSinglePoint(dictionary, point) for point in X_test]

# Change continuous data into labeled data

In [11]:
def makeLabeled(column):
    """Replace each value of ``column`` by a bin label 0-3 based on the column mean.

    With ``m = column.mean()`` the labels are:

    * 0 if value < 0.5 * m
    * 1 else if value < m
    * 2 else if value < 1.5 * m
    * 3 otherwise

    ``column`` is overwritten in place and the same object is returned.
    """
    mean_value = column.mean()
    low_cut, mid_cut, high_cut = 0.5 * mean_value, mean_value, 1.5 * mean_value
    # Nested where mirrors the if/elif chain: the first matching threshold wins.
    labels = np.where(
        column < low_cut, 0,
        np.where(column < mid_cut, 1, np.where(column < high_cut, 2, 3)),
    )
    column[:] = labels  # write back into the caller's array
    return column

In [12]:
from sklearn import datasets

# Load the iris dataset and pull out its feature matrix (X) and target labels (Y).
iris = datasets.load_iris()
X, Y = iris.data, iris.target

## Transforming Data from Continuous to Labeled

In [14]:
# makeLabeled overwrites its input in place. Start from a fresh copy of the raw
# measurements so iris.data stays untouched and re-running this cell gives the
# same result instead of re-binning already-binned values.
X = iris.data.copy()
for i in range(X.shape[-1]):
    X[:, i] = makeLabeled(X[:, i])

In [15]:
from sklearn import model_selection

# Hold out 25% of the samples for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.25, random_state=0
)

In [19]:
# Build the per-class count table from the training split.
dictionary = fit(X_train, Y_train)

In [24]:
# Predict a class for every held-out point.
Y_pred = predict(dictionary, X_test)

In [25]:
from sklearn.metrics import classification_report, confusion_matrix

# Compare the predictions with the true test labels.
print(classification_report(Y_test, Y_pred))
print(confusion_matrix(Y_test, Y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       0.94      1.00      0.97        16
          2       1.00      0.89      0.94         9

avg / total       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]


In [27]:
# Top-level keys of the fitted table: the class labels plus the "total_data" entry.
classes = dictionary.keys()
classes

dict_keys([0, 'total_data', 1, 2])