# Individual Assignment 2 - Task 1

In [1]:
import numpy as np
import pandas as pd

## Load the dataset

In [14]:
# Read the raw Iris CSV from the working directory. header=None tells pandas
# the file has no header row, so the columns get integer labels (0, 1, 2, ...);
# the cells below address the class-label column as 4.
iris_data = pd.read_csv(
    "iris.data",
    header=None,
)

## Preprocessing

In [15]:
# Encode each species name as an integer code (its position in the tuple).
# Series.map leaves NaN for any label that is not a key of the mapping.
class_mapping = {
    species: code
    for code, species in enumerate(
        ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
    )
}
iris_data[4] = iris_data[4].map(class_mapping)

## Stratified sampling

In [16]:
# Stratified sampling for train-test split
# Select ~70% for training and ~30% for test
def stratified_sampling(data, test_ratio=0.3, label_col=4, random_state=None):
    """Split the row labels of ``data`` into train and test sets per class.

    For every distinct value in the label column, ``int(n_cls * test_ratio)``
    rows are drawn without replacement for the test set; all remaining rows
    form the training set, so class proportions are (approximately) kept.

    Args:
        data: pandas DataFrame holding the samples.
        test_ratio: Fraction of each class to put in the test set (0..1).
        label_col: Column label of the class column. Defaults to ``4``.
        random_state: Optional seed for a reproducible split. When ``None``
            the draw uses NumPy's global random state, so an earlier
            ``np.random.seed(...)`` call still controls the result.

    Returns:
        ``(train_indices, test_indices)``: a sorted pandas Index of training
        row labels and a list of test row labels. Both contain index
        *labels* (use ``.loc``), not positions.

    Raises:
        ValueError: If ``test_ratio`` is outside the interval [0, 1].
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be within [0, 1], got {test_ratio!r}")

    # The np.random module and a Generator both expose choice(a, size, replace).
    if random_state is None:
        rng = np.random
    else:
        rng = np.random.default_rng(random_state)

    labels = data[label_col]
    test_indices = []
    for cls in labels.unique():
        cls_indices = labels[labels == cls].index
        # int() truncates, so the test share is rounded down for each class.
        test_size = int(len(cls_indices) * test_ratio)
        test_indices.extend(rng.choice(cls_indices, test_size, replace=False))
    train_indices = data.index.difference(test_indices)
    return train_indices, test_indices

train_indices, test_indices = stratified_sampling(iris_data)

# Split the data into training and testing sets.
# stratified_sampling returns index *labels*, so rows are selected with .loc;
# positional .iloc would pick the wrong rows on any non-default index.
train_data = iris_data.loc[train_indices]
test_data = iris_data.loc[test_indices]

# Separate features (every column but the last) and labels (the last column)
X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]
X_test, y_test = test_data.iloc[:, :-1], test_data.iloc[:, -1]


## Modelling

In [7]:
# Naive Bayes Classifier
class NaiveBayesClassifier:
    """Gaussian Naive Bayes classifier for continuous features.

    Each feature is modelled, per class, as an independent normal
    distribution. Prediction picks the class with the highest log-posterior
    (log prior plus the sum of per-feature log-densities).

    Attributes set by ``fit``:
        classes: Sorted array of the distinct class labels seen in ``y``.
        class_probs: Array of class priors, aligned with ``classes``.
        feature_probs: ``feature_probs[i][j]`` is the ``(mean, std)`` pair of
            feature ``j`` within class ``classes[i]``.
    """

    def __init__(self, var_smoothing=1e-9):
        """Create an unfitted classifier.

        Args:
            var_smoothing: Fraction of the largest feature variance added to
                every per-class variance, so that a feature which is constant
                within a class does not cause a division by zero.
        """
        self.var_smoothing = var_smoothing
        self.classes = None
        self.class_probs = None
        self.feature_probs = None
        self._epsilon = 0.0

    def fit(self, X, y):
        """Estimate class priors and per-class feature means and deviations.

        Args:
            X: 2-D array-like of shape (n_samples, n_features), numeric.
            y: 1-D array-like of n_samples class labels.

        Raises:
            ValueError: If ``X`` is not a non-empty 2-D array or the number
                of labels does not match the number of rows.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2 or X.shape[0] == 0 or X.shape[1] == 0:
            raise ValueError("X must be a non-empty 2-D array")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be 1-D with one label per row of X")
        n_samples = X.shape[0]

        # Calculate class probabilities (priors)
        self.classes, counts = np.unique(y, return_counts=True)
        self.class_probs = counts / n_samples

        # Absolute variance floor; fall back to a unit scale when every
        # feature is constant so the floor never collapses to zero.
        largest_var = X.var(axis=0).max()
        scale = largest_var if largest_var > 0 else 1.0
        self._epsilon = self.var_smoothing * scale

        # Calculate per-class (mean, std) for every feature
        self.feature_probs = []
        for cls in self.classes:
            cls_data = X[y == cls]
            means = cls_data.mean(axis=0)
            stds = cls_data.std(axis=0)
            self.feature_probs.append(list(zip(means, stds)))

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features), numeric.

        Returns:
            A list with one class label (taken from ``classes``) per row.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.classes is None or self.feature_probs is None:
            raise RuntimeError("NaiveBayesClassifier.predict called before fit")

        X = np.asarray(X, dtype=float)
        if X.size == 0:
            return []

        params = np.asarray(self.feature_probs, dtype=float)  # (k, d, 2)
        means = params[:, :, 0]
        variances = params[:, :, 1] ** 2 + self._epsilon

        # Work with the log-density directly: taking log(exp(...)) underflows
        # to -inf for points far from every class mean.
        diff = X[:, np.newaxis, :] - means[np.newaxis, :, :]  # (n, k, d)
        log_density = -0.5 * (np.log(2 * np.pi * variances) + diff ** 2 / variances)
        log_posterior = np.log(self.class_probs) + log_density.sum(axis=2)

        # Map the winning position back to the actual class label.
        return self.classes[log_posterior.argmax(axis=1)].tolist()

    @staticmethod
    def gaussian_prob(x, mean, std):
        """Return the normal probability density of ``x`` for ``mean``/``std``."""
        exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
        return (1 / (np.sqrt(2 * np.pi) * std)) * exponent

## Test the model

In [18]:
# Fit the Gaussian Naive Bayes model on the training split
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X_train.values, y_train.values)

# Accuracy on the samples the model was fitted on:
# the fraction of predictions equal to the true label
train_pred = nb_classifier.predict(X_train.values)
train_accuracy = np.equal(train_pred, y_train.values).mean()
print("Train set Accuracy:", train_accuracy)

# Accuracy on the held-out test split
test_pred = nb_classifier.predict(X_test.values)
accuracy = np.equal(test_pred, y_test.values).mean()
print("Test set Accuracy:", accuracy)

Train set Accuracy: 0.9523809523809523
Test set Accuracy: 0.9777777777777777


### Result:
* The Naive Bayes classifier achieves a training-set accuracy of approximately **95.24%** (100 of 105 samples) and a test-set accuracy of approximately **97.78%** (44 of 45 samples). Because the test accuracy is at least as high as the training accuracy, there is no sign of overfitting, and the classifier appears to generalize well to unseen Iris samples. Note that these figures come from a single random split of a small dataset, so they will vary somewhat from run to run.

### Conclusion:
* In conclusion, this implementation of the Gaussian Naive Bayes classifier demonstrates the effectiveness of the algorithm for classification tasks, particularly on datasets whose continuous features are approximately Gaussian within each class. By applying Bayes' theorem and assuming that the features are conditionally independent given the class, the classifier provides a simple yet robust solution for classification problems.