# Write a program to implement the naïve Bayesian classifier for a sample training data set stored as a .CSV file (the Breast Cancer dataset). Compute the accuracy of the classifier, considering a few test data sets. Calculate the accuracy, precision, and recall for your data set.


# Import Necessary Libraries

In [None]:
import pandas as pd               
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split  
from sklearn.naive_bayes import GaussianNB            
from sklearn.metrics import accuracy_score, precision_score, recall_score  

# Loading Dataset


In [None]:
# Read the breast-cancer CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='/kaggle/input/breast-cancer-analysis/breast-cancer.csv')

# Peek at the leading rows (pandas' default of five) to see the column layout
print(df.head(n=5))

In [None]:
# Preview the last rows of the DataFrame (pandas returns five by default);
# as the final expression of the cell, the result is rendered by the notebook.
df.tail()

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts,
# dtypes and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; rendered by the notebook as the cell's result.
df.describe()

In [None]:
# Element-wise missing-value mask: True wherever a cell is NaN/None
df.isna()

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Class balance of the target column ('diagnosis'): how many rows carry each label
print(
    "\nTarget variable distribution (Diagnosis):",
    df['diagnosis'].value_counts(),
    sep="\n",
)

In [None]:
# Pairwise correlation table of the numeric columns.
# The original cell referenced `df.corr` without calling it, which only shows
# the bound-method repr. The 'diagnosis' column holds string labels, so the
# frame is restricted to numeric dtypes before computing the correlations.
df.select_dtypes(include=[np.number]).corr()

# Correlation between Numerical Features

In [None]:
# Keep just the numeric columns; string-valued ones cannot be correlated
numeric_data = df.select_dtypes(include=[np.number])

# Draw the pairwise correlations as a heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(numeric_data.corr(), annot=False, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Print the column labels so the names used by the plots and the model below
# (e.g. 'diagnosis', 'radius_mean') can be checked against the loaded CSV
print(df.columns)

In [None]:
# Compare the spread of 'radius_mean' between the diagnosis classes
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='diagnosis', y='radius_mean', ax=ax)
ax.set_title('Radius Mean Distribution by Diagnosis')
plt.show()

In [None]:
# Bar chart of how many samples fall into each diagnosis class
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='diagnosis', ax=ax)
ax.set_title('Count of Diagnosis in Dataset')
plt.show()

# Training and Testing of Dataset

In [None]:

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Features: every column except the target. A row-identifier column carries no
# diagnostic signal, so it is excluded as well when present.
# NOTE(review): assumes the identifier column, if any, is named 'id' —
# confirm against the df.columns output above.
X = df.drop(columns=['diagnosis']).drop(columns=['id'], errors='ignore')

# Target: encode the diagnosis labels as integers (malignant 'M' -> 1, benign 'B' -> 0)
y = df['diagnosis'].map({'M': 1, 'B': 0})
if y.isnull().any():
    # .map() turns any label outside the mapping into NaN; fail loudly here
    # instead of with an obscure error inside the classifier.
    raise ValueError("Column 'diagnosis' contains labels other than 'M' and 'B'")

# Split BEFORE scaling so the test rows never influence the scaler statistics
# (fitting the scaler on the full data set leaks test information into training).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features: learn mean/variance on the training rows only,
# then apply the same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix under the train-fitted scaler; kept so that any cell
# still referring to X_scaled continues to work.
X_scaled = scaler.transform(X)

# Display shapes of training and testing sets
print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

In [None]:
# Gaussian Naive Bayes classifier from scikit-learn
from sklearn.naive_bayes import GaussianNB

# Build the classifier and fit it on the training split in one step
# (fit() returns the estimator itself)
model = GaussianNB().fit(X_train, y_train)

# Predicted labels for the held-out test split
y_pred = model.predict(X_test)

# Score the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Report every metric rounded to two decimals
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
):
    print(f"{metric_name}: {metric_value:.2f}")

# Confusion Matrix

In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
)

# Recompute the headline metrics for the test-set predictions
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# One line per metric, two decimals each
print(
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    sep="\n",
)

# Confusion matrix of true vs. predicted labels, drawn with readable class
# names (0 = benign, 1 = malignant, matching the label encoding)
cm = confusion_matrix(y_test, y_pred)
class_names = ['Benign (0)', 'Malignant (1)']
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()
