In [63]:
# Naive Bayes Classification Algorithm

# ! Caution: Assumed that the dataset has no missing value.

# ? How the algorithm works ?
# Naive Bayes is a probabilistic machine learning algorithm based on Bayes' Theorem that is widely used for classification tasks.
# P(H\X) = P(X\H) * P(H)/P(X) where,
# P(H\X): Posterior probability of Hypothesis (Class)
# P(X\H): Posterior probability of Input X (Features)
# P(X): Probability of Input X (Class)
# P(H): Probability of Hypothesis (Features)
# 
# Steps of the algorithm:
# 1. Data Preparation
# 2. Calculate Class Probabilities (Prior Probabilities)
#    ├── Determine the total number of training examples
#    ├── Count the number of examples in each class
#    └── Calculate the probability of each class using the formula:
#        P(Class) = (Number of examples in the class) / (Total number of examples)
# 3. Calculate Conditional Probabilities
#    └── For each feature and each class, calculate the probability of the feature given the class
# 4. Prediction for a New Instance
#    └── For the new data point, calculate the probability for each class:
#        Use Bayes' Theorem: P(Class|Features) = (P(Features|Class) * P(Class)) / P(Features)
# 5. Classification Decision
#    ├── Compare the calculated probabilities for each class
#    ├── Select the class with the highest probability
#    └── This becomes the predicted class for the new instance

# The input data
input_data = [1,2,0,'Small']

# Initialize list for the raw dataset
raw_dataset = []

# Initialize list for the labels of dataset
labels = []

# Initialize list for the processed dataset
processed_dataset = []

# Determine the file path of dataset
file_path = '../Custom Datasets/naive-bayes-test.csv'

# Read all lines of the file
with open(file_path, 'r') as file:
  lines = file.readlines()

# Get the line where the labels reside
for line_number in range(0, len(lines)):
    if not lines[line_number].isspace():
        # Store the labels
        labels = lines.pop(line_number).strip().split(',')
        break

print(f"Labels:\n{labels}")
print()

# Store raw line in the dataset
for line in lines:
   if not line.isspace():
      # Store sample data after removing newline characters and splitting
      raw_dataset.append(line.strip().split(","))

print("Raw dataset:")
for sample in raw_dataset:
  print(sample)
print()

# Function for detecting float typed values
def is_float(string) -> bool:
  try:
    # If the value is integer typed, casting won't throw exception
    int(string)
    return False
  except:
    # The value is float typed, casting threw exception
    return True

# Control whether the value is numeric
def is_numeric(value) -> bool:
    try:
        # Try to cast
        float(value)
        # The value is either float or integer
        return True
    except:
        # The values is not numeric
        return False

# Process the raw data by type casting
for row in range(0, len(raw_dataset)):
  instance = []
  for col in range(0, len(labels)):
    value = raw_dataset[row][col]
    if is_numeric(value):
      if is_float(value):
        instance.append(float(value))
      else:
        instance.append(int(value))
    else:
        instance.append(value)
  processed_dataset.append(instance)

print("Processed dataset:")
for sample in processed_dataset:
  print(sample)
print()

# Count the samples of classes
# Key: class label, Value: class count
class_counts = dict()
for sample in processed_dataset:
  cl = sample[-1:].pop()
  if class_counts.get(cl) is not None:
    class_counts[cl] += 1
  else:
    class_counts[cl] = 1

print("Class counts:")
for cl in class_counts.keys():
   print(f"{cl}:\t{class_counts.get(cl)}")
print()

# Calculate the ratio of each class
# Key: Class label, Value: Ratio
class_ratios = dict()
for cl in class_counts.keys():
   if class_ratios.get(cl) is not None:
      class_ratios[cl] = round(class_counts.get(cl)/len(processed_dataset), ndigits=2)
   else:
      class_ratios[cl] = round(class_counts.get(cl)/len(processed_dataset), ndigits=2)

print("Class ratios:")
for cl in class_ratios.keys():
   print(f"{cl}:\t{class_ratios.get(cl)}")
print()

# Count the input features per classes
# Key: feature value, Value: dict of class labels and their inclusion counts
features_counts = dict()
for col in range(0, len(labels)-1): # Go through features except the class
   for row in range(0, len(processed_dataset)):
      feat = processed_dataset[row][col]
      cl = processed_dataset[row][-1]
      
      # If the feature matches with input feature
      if feat == input_data[col]:
        
        # Increment the count
        if features_counts.get(feat) is not None:
          if features_counts[feat].get(cl) is not None:
            features_counts[feat][cl] += 1
          else:
            features_counts[feat][cl] = 1
        else:
          features_counts[feat] = {cl:1}

print("Input feature counts:")
for feat in features_counts.keys():
   print(f"{feat}:\t{features_counts.get(feat)}")
print()

# Calculate the probabilities of the features on each classes
# Key: Feature, Value: dict of class labels and their probabilities
feature_probs = dict()
for feat,feat_class_counts in features_counts.items():
   total_feat_count = sum(feat_class_counts.values())
   for cl,cl_count in feat_class_counts.items():
    if feature_probs.get(feat) is not None:
      feature_probs[feat][cl] = round(cl_count/total_feat_count, ndigits=2)
    else:
      feature_probs[feat] = {cl:round(cl_count/total_feat_count, ndigits=2)}
      
print("Input feature probabilities:")
for feat in feature_probs.keys():
   print(f"{feat}:\t{feature_probs.get(feat)}")
print()

# Calculate class probabilities of each features
# Key: class label, Value: calculated probability
class_probs = dict()
for feat,feat_prob in feature_probs.items():
  for cl,prob in feat_prob.items():
    if class_probs.get(cl) is not None:
        class_probs[cl] = round(class_probs.get(cl)*prob, ndigits=3)
    else:
        class_probs[cl] = prob

print("Posterior probability of Features (P(X\\H)):")
for cl in class_probs.keys():
   print(f"{cl}:\t{class_probs.get(cl)}")
print()

# Calculate the probabilities of each classes
# Key: class label, Value: class posterior probability
post_class_prob = dict()
for cl,prob in class_probs.items():
   if post_class_prob.get(cl) is not None:
       post_class_prob[cl] = round(class_probs.get(cl)*class_ratios.get(cl), ndigits=3)
   else:
       post_class_prob[cl] = round(class_probs.get(cl)*class_ratios.get(cl), ndigits=3)

print("Posterior probability of Classes (P(H\\X)):")
for cl in post_class_prob.keys():
   print(f"{cl}:\t{post_class_prob.get(cl)}")
print()

# Get the key of specified unique value of the dictionary
def get_key_by_value(dict, value):
    for key, val in dict.items():
        if val == value:
            return key
    return None

# Assign the class with highest probability to the input
input_data.append(get_key_by_value(class_probs, max(class_probs.values())))

print(f"Classified input data:\n{input_data}\n")

Labels:
['A1', 'A2', 'A3', 'A4', 'Class']

Raw dataset:
['1', '2', '0', 'Small', '1']
['1', '3', '2', 'Mid', '1']
['1', '2', '2', 'Small', '1']
['0', '3', '4', 'Big', '0']
['0', '2', '3', 'Mid', '0']
['1', '3', '0', 'Small', '0']
['0', '1', '3', 'Big', '0']

Processed dataset:
[1, 2, 0, 'Small', 1]
[1, 3, 2, 'Mid', 1]
[1, 2, 2, 'Small', 1]
[0, 3, 4, 'Big', 0]
[0, 2, 3, 'Mid', 0]
[1, 3, 0, 'Small', 0]
[0, 1, 3, 'Big', 0]

Class counts:
1:	3
0:	4

Class ratios:
1:	0.43
0:	0.57

Input feature counts:
1:	{1: 3, 0: 1}
2:	{1: 2, 0: 1}
0:	{1: 1, 0: 1}
Small:	{1: 2, 0: 1}

Input feature probabilities:
1:	{1: 0.75, 0: 0.25}
2:	{1: 0.67, 0: 0.33}
0:	{1: 0.5, 0: 0.5}
Small:	{1: 0.67, 0: 0.33}

Posterior probability of Features (P(X\H)):
1:	0.169
0:	0.014

Posterior probability of Classes (P(H\X)):
1:	0.073
0:	0.008

Classified input data:
[1, 2, 0, 'Small', 1]

