In [10]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np

# Load the Iris dataset.
# It ships with scikit-learn, so no additional download is needed.
iris = load_iris()
# Feature matrix of the dataset
X = iris.data
# Target labels of the dataset
y = iris.target

# Split the dataset into training and testing sets.
# test_size=0.5 holds out 50% of the samples as the test set.
# random_state=42 ensures reproducibility of the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [11]:
# Evaluate the dataset's DESCR attribute so the notebook shows it as the cell result.
iris.DESCR



In [16]:
import pandas as pd

# Assemble the measurements into a DataFrame, one column per feature.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Append the integer class label and, next to it, the matching class name.
iris_df['target'] = iris.target
target_mapping = dict(enumerate(iris.target_names))
iris_df['target_name'] = iris_df['target'].map(target_mapping)

# Column dtypes and non-null counts.
print("DataFrame 基本信息：")
iris_df.info()
rows, columns = iris_df.shape

# First rows, rendered as tab-separated text.
preview_tsv = iris_df.head().to_csv(sep='\t', na_rep='nan')
print("数据前几行内容信息：")
print(preview_tsv)

# Summary statistics of the numeric columns, rendered the same way.
summary_tsv = iris_df.describe().to_csv(sep='\t', na_rep='nan')
print("数据统计信息：")
print(summary_tsv)

DataFrame 基本信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int32  
 5   target_name        150 non-null    object 
dtypes: float64(4), int32(1), object(1)
memory usage: 6.6+ KB
数据前几行内容信息：
	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	target	target_name
0	5.1	3.5	1.4	0.2	0	setosa
1	4.9	3.0	1.4	0.2	0	setosa
2	4.7	3.2	1.3	0.2	0	setosa
3	4.6	3.1	1.5	0.2	0	setosa
4	5.0	3.6	1.4	0.2	0	setosa

数据统计信息：
	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	target
count	150.0	150.0	150.0	150.0	150.0
mean	5.843333333333334	3.0573333333333337	3.7580000000000005	1.199333333333

In [13]:
# Hand-written Gaussian Naive Bayes classifier
class BayesClassifier:
    """Gaussian Naive Bayes classifier implemented with NumPy.

    Every feature is modelled, independently for each class, as a normal
    distribution whose mean and variance are estimated from the training
    data. A sample is assigned to the class with the largest log-posterior
    (log prior + sum of per-feature log-densities).

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest per-feature variance of the whole training
        set that is added to every per-class variance in ``fit``. This keeps
        the variance strictly positive when a feature is constant within a
        class, which would otherwise cause a division by zero. Pass ``0`` to
        store the raw, unsmoothed variances.

    Attributes
    ----------
    classes : ndarray of shape (n_classes,)
        Sorted unique labels seen in ``fit``.
    mean : ndarray of shape (n_classes, n_features)
        Per-class feature means.
    var : ndarray of shape (n_classes, n_features)
        Per-class population variances (ddof=0) plus the smoothing term.
    priors : ndarray of shape (n_classes,)
        Relative frequency of each class in the training data.
    """

    def __init__(self, var_smoothing=1e-9):
        # Stabilising fraction added to the variances during fit
        self.var_smoothing = var_smoothing
        # Store the unique classes in the dataset
        self.classes = None
        # Store the mean values of each feature for each class
        self.mean = None
        # Store the variance values of each feature for each class
        self.var = None
        # Store the prior probabilities of each class
        self.priors = None

    def fit(self, X, y):
        """Estimate per-class means, variances and priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples; converted to a float ndarray.
        y : array-like of shape (n_samples,)
            Class label of every training sample.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional.
        """
        # Coerce once so lists and DataFrames behave like plain ndarrays
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        self.mean = np.zeros((n_classes, n_features))
        self.var = np.zeros((n_classes, n_features))
        self.priors = np.zeros(n_classes)

        for idx, c in enumerate(self.classes):
            # Samples belonging to the current class
            X_c = X[y == c]
            self.mean[idx, :] = X_c.mean(axis=0)
            self.var[idx, :] = X_c.var(axis=0)
            self.priors[idx] = X_c.shape[0] / float(n_samples)

        # A feature that is constant within a class has zero variance, which
        # makes the Gaussian density undefined. Add a small data-scaled
        # constant to every variance. (If the whole training set is constant
        # the constant is zero as well and the model stays degenerate.)
        if self.var.size:
            self.var += self.var_smoothing * X.var(axis=0).max()

    def predict(self, X):
        """Predict a class label for every sample in ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted labels, taken from ``self.classes``.
        """
        X = np.asarray(X, dtype=float)
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        """Return the most probable class label for a single sample ``x``."""
        # Log-posterior (up to a shared constant) of every class
        posteriors = []

        for idx, c in enumerate(self.classes):
            # Log of the prior probability
            prior = np.log(self.priors[idx])
            # Log of the class-conditional likelihood. It is computed directly
            # in log space: taking log(pdf) underflows to log(0) = -inf for
            # samples far from the class mean.
            class_conditional = np.sum(self._log_pdf(idx, x))
            posteriors.append(prior + class_conditional)

        # Return the class with the maximum posterior probability
        return self.classes[np.argmax(posteriors)]

    def _log_pdf(self, class_idx, x):
        """Per-feature log of the Gaussian density of ``x`` under one class."""
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        return -0.5 * np.log(2 * np.pi * var) - (x - mean) ** 2 / (2 * var)

    def _pdf(self, class_idx, x):
        """Per-feature Gaussian density of ``x`` under one class.

        Kept for inspection and backward compatibility; prediction uses
        ``_log_pdf`` because this value underflows to 0 for distant samples.
        """
        # Mean values of features for the specified class
        mean = self.mean[class_idx]
        # Variance values of features for the specified class
        var = self.var[class_idx]
        # Numerator of the Gaussian probability density function
        numerator = np.exp(- (x - mean) ** 2 / (2 * var))
        # Denominator of the Gaussian probability density function
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

In [14]:
# Fit the hand-written Naive Bayes model on the training split.
model = BayesClassifier()
model.fit(X_train, y_train)

# Predict labels for the held-out samples.
y_pred = model.predict(X_test)

# Accuracy is the fraction of held-out samples whose prediction matches the true label.
accuracy = np.mean(y_pred == y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 98.67%
