In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


# Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target


# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Initialize Gaussian Naive Bayes classifier
classifier = GaussianNB()


# Train the classifier
classifier.fit(X_train, y_train)


# Predictions on the test set
y_pred = classifier.predict(X_test)


# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [2]:
# Apply the necessary preprocessing techniques on IRIS dataset.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (optional)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In addition to Gaussian Naive Bayes, which assumes that features follow a normal (Gaussian) distribution, there are other variants of Naive Bayes classifiers that are commonly used for different types of data distributions. Some of the other variants of Naive Bayes classifiers include:

Multinomial Naive Bayes:

Suitable for text classification tasks where features represent word counts or frequency of occurrence (e.g., bag-of-words model).
Assumes that features are generated from a multinomial distribution.
Often used in natural language processing (NLP) tasks such as document classification and spam filtering.

Bernoulli Naive Bayes:

Similar to Multinomial Naive Bayes but specifically designed for binary feature vectors (i.e., presence or absence of a feature).
Assumes that features are binary (e.g., presence or absence of words in a document).
Also commonly used in text classification tasks, particularly when using binary feature representations.

Complement Naive Bayes:

Variant of Multinomial Naive Bayes that is designed to address class imbalance problems in text classification.
Computes statistics complementary to those of the original class to adjust for imbalanced class distributions.
Particularly effective for text classification tasks with highly imbalanced class distributions.

Categorical Naive Bayes:

Extension of Multinomial Naive Bayes that is suitable for categorical features with a fixed number of categories.
Assumes that features are drawn from a categorical distribution.
Useful for classification tasks involving categorical data, such as customer segmentation or product recommendation.

These variants of Naive Bayes classifiers offer different assumptions about the underlying data distributions and are suitable for different types of data. It's essential to choose the appropriate variant based on the characteristics of the dataset and the nature of the classification problem.

In [3]:
# Identify other variants of Naive Bayes classifier.

from sklearn.naive_bayes import MultinomialNB

# Initialize Multinomial Naive Bayes classifier
classifier = MultinomialNB()
# replace Multinomial with Complement, Bernoulli, Categorical for others

In [4]:
# Compute other metric values using confusion matrix.

from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Assuming y_true and y_pred contain the true and predicted labels, respectively
conf_matrix = confusion_matrix(y_test, y_pred)

# Compute precision, recall, F1-score, and support
precision, recall, f1_score, support = precision_recall_fscore_support(y_test, y_pred)

# Print the metrics
for i in range(len(precision)):
    print(f"Class {i}: Precision={precision[i]}, Recall={recall[i]}, F1-score={f1_score[i]}, Support={support[i]}")


Class 0: Precision=1.0, Recall=1.0, F1-score=1.0, Support=10
Class 1: Precision=1.0, Recall=1.0, F1-score=1.0, Support=9
Class 2: Precision=1.0, Recall=1.0, F1-score=1.0, Support=11


In [5]:
#Apply feature engineering and then apply Naive Bayes classifier and print its performance measures.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform feature engineering (if needed)
# For example, you can apply scaling, normalization, or create new features here

# Initialize Gaussian Naive Bayes classifier
classifier = GaussianNB()

# Train the classifier
classifier.fit(X_train, y_train)

# Predictions on the test set
y_pred = classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 1.0


In [6]:
# Print AUC-ROC curve.

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Assuming y_test and y_pred_proba are defined
# y_test: True labels of the test set
# y_pred_proba: Predicted probabilities for the positive class

# Assuming classifier is your trained classifier or model
# Predict probabilities for the positive class
y_pred_proba = classifier.predict_proba(X_test)[:, 1]

# Compute ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Compute AUC
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


ValueError: multiclass format is not supported

In [None]:
#Compute the covariance matrix of iris dataset and use it in classification.


from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import numpy as np

# Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Compute covariance matrix
cov_matrix = np.cov(X.T)  # Transpose X to compute covariance matrix of features

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Gaussian Naive Bayes classifier
classifier = GaussianNB()

# Train the classifier
classifier.fit(X_train, y_train)

# Predictions on the test set
y_pred = classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy without using covariance matrix:", accuracy)

# Update classifier with covariance matrix
classifier.sigma_ = cov_matrix  # Update covariance matrix in the classifier

# Predictions on the test set with updated classifier
y_pred_cov = classifier.predict(X_test)

# Calculate accuracy with updated classifier
accuracy_cov = accuracy_score(y_test, y_pred_cov)
print("Accuracy with covariance matrix:", accuracy_cov)


In [None]:
#Print the statistics of every feature of iris dataset.

import pandas as pd
from sklearn.datasets import load_iris

# Load Iris dataset
iris = load_iris()
X = iris.data

# Create a pandas DataFrame from the feature matrix
df = pd.DataFrame(X, columns=iris.feature_names)

# Print statistics of every feature
print("Statistics of every feature:")
print(df.describe())


In [7]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing techniques: Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Variant of Naive Bayes classifiers
classifiers = {
    'GaussianNB': GaussianNB(),
#     'MultinomialNB': MultinomialNB(),
#     'ComplementNB': ComplementNB(),
#     'BernoulliNB': BernoulliNB()
}

# Train and evaluate Naive Bayes classifiers
for name, clf in classifiers.items():
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    
    # Compute confusion matrix and other metrics
    cm = confusion_matrix(y_test, y_pred)
    print(f"Classifier: {name}")
    print("Confusion Matrix:")
    print(cm)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    
    # AUC-ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic - ' + name)
    plt.legend(loc="lower right")
    plt.show()

# Feature engineering: Computing covariance matrix
cov_matrix = np.cov(X_train_scaled.T)
print("Covariance Matrix:")
print(cov_matrix)

# Print statistics of every feature
df = pd.DataFrame(X, columns=iris.feature_names)
print("Statistics of every feature:")
print(df.describe())


Classifier: GaussianNB
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



ValueError: multiclass format is not supported

In [8]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Feature engineering - example: adding polynomial features
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Initialize Gaussian Naive Bayes classifier
classifier = GaussianNB()

# Train the classifier
classifier.fit(X_train, y_train)

# Predictions on the test set
y_pred = classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Precision, Recall, F1-score
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Compute covariance matrix
cov_matrix = np.cov(X_train.T)
print("Covariance Matrix:\n", cov_matrix)

# Print statistics of every feature
feature_names = iris.feature_names
for i, feature_name in enumerate(feature_names):
    feature_data = X[:, i]
    print(f"Statistics for {feature_name}:")
    print(f"  - Mean: {np.mean(feature_data)}")
    print(f"  - Standard Deviation: {np.std(feature_data)}")
    print(f"  - Minimum: {np.min(feature_data)}")
    print(f"  - Maximum: {np.max(feature_data)}")

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0
Covariance Matrix:
 [[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  6.78654762e-01 -3.95616246e-02  1.24462745e+00
   4.96708683e-01  8.06219867e+00  1.88069622e+00  9.91041688e+00
   3.72001324e+00 -2.94060644e-01  3.83677444e+00  1.55539426e+00
   9.09170182e+00  3.20814223e+00  1.13214580e+00]
 [ 0.00000000e+00 -3.95616246e-02  2.01711485e-01 -3.40061625e-01
  -1.24845938e-01 -4.26938515e-01  9.98839496e-01 -1.94933459e+00
  -6.90613025e-01  1.26340868e+00 -4.51765686e-01 -2.12586835e-01
  -1.90022213e+00 -5.89899580e-01 -1.76429412e-01]
 [ 0.00000000e+00  1.24462745e+00 -3.40061625e-01  3.07071148e+00
   1.26893557e+00  1.46115451e+01  2.00873277e+00  2.22916650e+01
   8.72853277e+00 -2.24463249e+00  8.76856919e